%% Cell type:markdown id:6c10e07b-1e60-4926-af1d-fa75dc78e5d4 tags:
# CONUS404 Daily Zarr -> Collection Exploratory Workflow
This is a workflow for transforming the CONUS404 daily zarr dataset into a [STAC collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md). We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.
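For reference, the finished collection will include a `cube:dimensions` object along these lines (a hedged sketch with placeholder values; the real entries are derived from the dataset later in this notebook):

``` python
# illustrative sketch only - placeholder extents/steps, not values from this dataset
cube_dimensions_example = {
    "time": {"type": "temporal", "extent": ["1979-10-01T00:00:00Z", "2020-09-30T00:00:00Z"], "step": "P1DT0H0M0S"},
    "x": {"type": "spatial", "axis": "x", "extent": [-2700000.0, 2700000.0], "step": 4000.0},
    "y": {"type": "spatial", "axis": "y", "extent": [-2000000.0, 2000000.0], "step": 4000.0},
}
```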
To keep this workflow simple enough to scale to many datasets, we make a few simplifying assumptions:
1. For USGS data, we can use the CC0-1.0 license. For all other data, we can use the Unlicense. Ref: https://spdx.org/licenses/
2. All coordinates are assumed to be in the WGS84 datum unless otherwise specified.
%% Cell type:code id:201e0945-de55-45ff-b095-c2af009a4e62 tags:
``` python
import json
import os

import cartopy.crs as ccrs
import cf_xarray  # registers the .cf accessor on xarray objects
import cfunits
import fsspec
import hvplot.xarray  # registers the .hvplot accessor
import metpy
import numpy as np
import pandas as pd
import pystac
import xarray as xr
from pystac.extensions.datacube import DatacubeExtension
```
%% Cell type:markdown id:20b00e88-5a13-46b3-9787-d9ac2d4e7bd6 tags:
## Open up NHGF STAC Catalog
%% Cell type:code id:adf6c59d-58cd-48b1-a5fd-3bb205a3ef56 tags:
``` python
# define folder location where your STAC catalog json file is
catalog_path = os.path.join('..', '..', 'catalog')
# open catalog
catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))
```
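%% Cell type:markdown id: tags:
If no `catalog.json` exists at that path yet, a minimal self-contained catalog can be created first. This is a hedged sketch: the id and description are placeholders, not part of the original workflow.
%% Cell type:code id: tags:
``` python
# only needed if the catalog does not exist yet - placeholder id/description
# catalog = pystac.Catalog(id='sample-catalog', description='Sample STAC catalog')
# catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)
```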
%% Cell type:markdown id:996e60ba-13e4-453a-8534-e62ce747f0fa tags:
## Collection Metadata Input
%% Cell type:code id:482d204d-b5b6-40e5-ac42-55b459be1097 tags:
``` python
# name for STAC collection
collection_id = 'conus404-daily'
# description of STAC collection
collection_description = 'CONUS404 40 years of daily values for subset of model output variables derived from hourly values on cloud storage'
# license for dataset
collection_license = 'CC0-1.0'
```
%% Cell type:markdown id:116b5837-8e85-4ae7-964a-803533ded714 tags:
## Asset Metadata Input
%% Cell type:code id:dd6fa323-132a-4794-8c80-576933f547a0 tags:
``` python
# url to zarr store that you want to create a collection for
zarr_url = 's3://hytest/conus404/conus404_daily.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs = {"xarray:open_kwargs": {"chunks": {}, "engine": "zarr", "consolidated": True},
                             "xarray:storage_options": {"anon": True, "client_kwargs": {"endpoint_url": "https://usgs.osn.mghpcc.org/"}}}
# description for zarr url asset attached to collection (zarr_url)
asset_description = "Open Storage Network Pod S3 API access to collection zarr group"
# roles to tag zarr url asset with
asset_roles = ["data","zarr","s3"]
```
%% Cell type:code id:e1441cd4-e94c-4902-af46-8f1af470eb6b tags:
``` python
# url to zarr store that you want to create a collection for
zarr_url2 = 's3://nhgf-development/conus404/conus404_daily_202210.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs2 = {"xarray:open_kwargs": {"chunks": {}, "engine": "zarr", "consolidated": True},
                              "xarray:storage_options": {"requester_pays": True}}
# description for zarr url asset attached to collection (zarr_url)
asset_description2 = "S3 access to collection zarr group"
# roles to tag zarr url asset with
asset_roles2 = ["data","zarr","s3"]
```
%% Cell type:markdown id:b213b74f-ad17-4774-93b6-3b62be616b45 tags:
## Data Exploration
%% Cell type:code id:708f2cf5-79ab-49af-8067-de31d0d13ee6 tags:
``` python
# open and view zarr dataset
fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
backend_kwargs={'consolidated':True}, chunks={})
ds
```
%% Cell type:markdown id:0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc tags:
## Identify x, y, t dimensions of dataset
May require user input if dimensions cannot be auto-detected.
%% Cell type:code id:ab91268f-7200-4cb1-979a-c7d75531d2c0 tags:
``` python
dims_auto_extract = ['X', 'Y', 'T']

def extract_dim(ds, d):
    try:
        dim_list = ds.cf.axes[d]
        assert len(dim_list) == 1, f'There are too many {d} dimensions in this dataset.'
        dim = dim_list[0]
    except KeyError:
        print(f"Could not auto-extract {d} dimension name.")
        print("Look at the xarray output above showing the dataset dimensions.")
        dim = str(input(f"What is the name of the {d} dimension of this dataset?"))
        assert dim in ds.dims, "That is not a valid dimension name for this dataset"
    print(f"name of {d} dimension: {dim}\n")
    return dim

dim_names_dict = {}
for d in dims_auto_extract:
    dim_names_dict[d] = extract_dim(ds, d)
print(f"Dimension dictionary: {dim_names_dict}")
```
%% Cell type:markdown id:810d7480-165d-41c0-bd09-163656a14003 tags:
## Get crs info
%% Cell type:code id:b03d52f3-1367-4255-a561-52ee4fc9e92d tags:
``` python
# parse the CF metadata so metpy can construct the dataset's coordinate reference system
ds = ds.metpy.parse_cf()
# pull the cartopy CRS off the first data variable (parse_cf attaches it to each variable)
crs = ds[list(ds.keys())[0]].metpy.cartopy_crs
```
%% Cell type:markdown id:8fbfecfb-9886-4d06-a34c-6471cb0a6053 tags:
## Plot a map
%% Cell type:code id:4eb4d027-4266-4a0b-8f16-bacfbef06242 tags:
``` python
# plot a map of a single variable
var_to_plot = 'SNOW'
da = ds[var_to_plot].sel(time='2014-03-01 00:00').load()
da.hvplot.quadmesh(x='lon', y='lat', rasterize=True,
geo=True, tiles='OSM', alpha=0.7, cmap='turbo')
```
%% Cell type:markdown id:5e057a6c-06fb-4406-823b-e81c58e520e4 tags:
## Plot a time series at a specific point
This can help you verify a variable's values.
%% Cell type:code id:c7de2681-88c2-4597-857c-8f169c596f8b tags:
``` python
# enter lat, lon of the point you want to plot a time series for
lat, lon = 39.978322, -105.2772194
time_start = '2013-01-01 00:00'
time_end = '2013-12-31 00:00'

# transform the lat/lon point into the dataset's projected coordinates
# (cartopy's PlateCarree is the lon/lat coordinate system)
x, y = crs.transform_point(lon, lat, src_crs=ccrs.PlateCarree())
da = ds[var_to_plot].sel({dim_names_dict['X']: x, dim_names_dict['Y']: y}, method='nearest').sel({dim_names_dict['T']: slice(time_start, time_end)}).load()
da.hvplot(x=dim_names_dict['T'], grid=True)
```
%% Cell type:markdown id:a8c3ed37-8564-400b-a7fb-25bd5e43d21c tags:
## Create Collection Extent
%% Cell type:markdown id:69f0d837-68a5-4fed-9a14-5d75cfbb0da4 tags:
### Spatial Extent
%% Cell type:code id:d46805e0-8e94-4ebe-aa01-d9a2d7051459 tags:
``` python
# pull out lat/lon bbox for data
# coordinates must be in the WGS 84 datum
# bbox order: left, bottom, right, top
# Note: the bbox values must end up as plain python floats rather than numpy floats,
# and which conversion chain works can vary by dataset. If the uncommented line below
# fails for you, switch to one of the commented alternatives.
#coord_bounds = [ds.lon.data.min().compute().astype(float), ds.lat.data.min().compute().astype(float), ds.lon.data.max().compute().astype(float), ds.lat.data.max().compute().astype(float)]
#coord_bounds = [ds.lon.data.min().compute().astype(float).tolist(), ds.lat.data.min().compute().astype(float).tolist(), ds.lon.data.max().compute().astype(float).tolist(), ds.lat.data.max().compute().astype(float).tolist()]
coord_bounds = [ds.lon.data.min().astype(float).item(), ds.lat.data.min().astype(float).item(), ds.lon.data.max().astype(float).item(), ds.lat.data.max().astype(float).item()]
print(coord_bounds)
# create a spatial extent object
spatial_extent = pystac.SpatialExtent(bboxes=[coord_bounds])
```
%% Cell type:markdown id:a04c8fca-1d33-43ac-9e2b-62d7be2887f7 tags:
### Temporal Extent
%% Cell type:code id:41a84995-867c-4152-8c57-85e3758bbb77 tags:
``` python
# pull out first and last timestamps
temporal_extent_lower = pd.Timestamp(ds[dim_names_dict['T']].data.min())
temporal_extent_upper = pd.Timestamp(ds[dim_names_dict['T']].data.max())
print(f'min: {temporal_extent_lower} \nmax: {temporal_extent_upper}')
# create a temporal extent object
temporal_extent = pystac.TemporalExtent(intervals=[[temporal_extent_lower, temporal_extent_upper]])
```
%% Cell type:code id:1b1e37c4-5348-46ad-abc9-e005b5d6c02b tags:
``` python
collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)
```
%% Cell type:markdown id:cfb71202-03df-45b5-ac2f-0dc2ee1ab780 tags:
## Create pystac collection
%% Cell type:code id:7e96811b-95ae-406a-9728-55fc429d4e1f tags:
``` python
if catalog.get_child(collection_id):
    collection = catalog.get_child(collection_id)
    print("existing collection opened")
    collection.extent = collection_extent
    collection.description = collection_description
    collection.license = collection_license
else:
    collection = pystac.Collection(id=collection_id,
                                   description=collection_description,
                                   extent=collection_extent,
                                   license=collection_license)
    print("new collection created")
```
%% Cell type:markdown id:a21c76e8-cd57-4eb5-a33f-7c668a3b3205 tags:
## Add zarr url asset to collection
%% Cell type:code id:094832af-d22b-4359-b0f6-cf687acce5cc tags:
``` python
asset_id = "zarr-s3-osn"
asset = pystac.Asset(href=zarr_url,
                     description=asset_description,
                     media_type="application/vnd+zarr",
                     roles=asset_roles,
                     extra_fields=xarray_opendataset_kwargs)
collection.add_asset(asset_id, asset)
```
%% Cell type:code id:0c298d07-f234-4a08-986d-87f4a39e9ae6 tags:
``` python
asset_id2 = "zarr-s3"
asset2 = pystac.Asset(href=zarr_url2,
                      description=asset_description2,
                      media_type="application/vnd+zarr",
                      roles=asset_roles2,
                      extra_fields=xarray_opendataset_kwargs2)
collection.add_asset(asset_id2, asset2)
```
%% Cell type:markdown id:f67cd5c9-db33-45c2-bc21-480cd67354f4 tags:
## Add datacube extension to collection
%% Cell type:code id:fc00946d-2880-491d-9b3b-3aeeb4414d6c tags:
``` python
# instantiate the extension on the collection
dc = DatacubeExtension.ext(collection, add_if_missing=True)
```
%% Cell type:markdown id:8bdd77a2-7587-485e-afb7-42af3a822241 tags:
### Add cube dimensions (required field for extension)
%% Cell type:code id:120a4914-3302-44a5-a282-0308ac84f040 tags:
``` python
# list out dataset dimensions
# (when xarray writes a dataset to Zarr, it records each variable's dimension names
# in an _ARRAY_DIMENSIONS attribute on the array; that is how they are recovered on read)
dims = list(ds.dims)
print(dims)
```
%% Cell type:code id:00a18a29-fb9a-4b56-8009-493122997b16 tags:
``` python
# get x, y bounds for extent of those dimensions (required)
xy_bounds = [ds[dim_names_dict['X']].data.min().astype(float).item(),
             ds[dim_names_dict['Y']].data.min().astype(float).item(),
             ds[dim_names_dict['X']].data.max().astype(float).item(),
             ds[dim_names_dict['Y']].data.max().astype(float).item()]
print(xy_bounds)
```
%% Cell type:code id:0f49d6d7-9e30-4144-909b-fa1238e6c77a tags:
``` python
def get_step(ds, dim_name):
    # find the spacing between consecutive values along a dimension
    dim_vals = ds[dim_name].values
    diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]
    unique_steps = np.unique(diffs)
    # only set a step if all steps are the same length;
    # the datacube spec specifies to use null for irregularly spaced steps
    if len(unique_steps) == 1:
        step = unique_steps[0]
        if np.issubdtype(unique_steps.dtype, np.timedelta64):
            # time steps are np.timedelta64 - normalize to a float count of nanoseconds
            step = step.astype('timedelta64[ns]').astype(float).item()
        else:
            step = float(step)
    else:
        step = None
    return step
```
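%% Cell type:markdown id: tags:
As a quick sanity check, `get_step` can be run on the dimensions identified earlier. This is a hedged usage sketch; the values in the comment are illustrative, not recorded output.
%% Cell type:code id: tags:
``` python
# a regularly spaced projected dimension yields a plain float (e.g. 4000.0 metres),
# while the time dimension yields a step in nanoseconds (e.g. 8.64e13 for one day)
print(get_step(ds, dim_names_dict['X']))
print(get_step(ds, dim_names_dict['T']))
```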
%% Cell type:code id:a20d12bf-a511-4c5e-84d0-77e2ec551518 tags:
``` python
def get_long_name(ds, v):
    # return the variable's long_name attribute, or None if it is not defined
    return ds[v].attrs.get('long_name')
```
%% Cell type:markdown id:e7dc357c-91ec-49ae-83e5-400f791f9792 tags:
#### User input needed: look at the CRS information and create a matching cartopy CRS object after identifying the projection type
Reference list of cartopy projections: https://scitools.org.uk/cartopy/docs/latest/reference/projections.html
%% Cell type:code id:ea452f62-5644-49b6-8a4e-7dc4f649fd1a tags:
``` python
# print out the crs information stored in the dataset
crs_info = ds.crs
print(crs_info)
```
%% Cell type:code id:1b1d05ff-8e43-44a7-8343-178b112c4ad6 tags:
``` python
# create the appropriate cartopy projection
lcc = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian,
                            central_latitude=crs_info.latitude_of_projection_origin,
                            standard_parallels=crs_info.standard_parallel)
# the datacube extension can accept reference_system information as a numerical EPSG code,
# WKT2 (ISO 19162) string or PROJJSON object.
# we will use PROJJSON, as was done by Microsoft Planetary Computer here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
projjson = json.loads(lcc.to_json())
# alternatively, I think we could do this:
#projjson = json.loads(crs.to_json())
```
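%% Cell type:markdown id: tags:
If the dataset used a different grid mapping, the same pattern applies: pick the matching class from the cartopy reference list and pass in the corresponding CF attributes. A hedged sketch for polar stereographic (the attribute names follow CF conventions and are not fields of this dataset):
%% Cell type:code id: tags:
``` python
# illustrative only - a polar stereographic grid mapping would translate like this:
# stereo = ccrs.Stereographic(central_latitude=crs_info.latitude_of_projection_origin,
#                             central_longitude=crs_info.straight_vertical_longitude_from_pole,
#                             true_scale_latitude=crs_info.standard_parallel)
# projjson = json.loads(stereo.to_json())
```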
%% Cell type:markdown id:00a5e041-081d-428d-ac2e-75d16de205e6 tags:
#### User input needed: copy all of the dimensions from above into the dict and fill in the appropriate attributes (type, axis, extent):
%% Cell type:code id:5a443497-67a9-4dce-a8e9-b08d31a88223 tags:
``` python
# create a dictionary of datacube dimensions you would like to assign to this dataset
# dimension name should come from the coordinates printed above
# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)
# note that the extent of each dimension should be pulled from the dataset
dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%H:%M:%SZ'), temporal_extent_upper.strftime('%Y-%m-%dT%H:%M:%SZ')], 'step': pd.Timedelta(get_step(ds, 'time')).isoformat()}),
             'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': get_step(ds, 'x'), 'reference_system': projjson}),
             'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': get_step(ds, 'y'), 'reference_system': projjson}),
             'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top_stag')}),
             'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top')}),
             'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'soil_layers_stag')}),
             'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, 'x_stag')}),
             'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y_stag')}),
             'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snow_layers_stag')}),
             'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snso_layers_stag')}),
             }
```
%% Cell type:markdown id:0f277883-a3fd-425f-966a-ca2140d0ef2f tags:
### Add cube variables (optional field for extension)
%% Cell type:code id:92510876-7853-4d24-8563-c69f9012aeb6 tags:
``` python
# define functions to pull out datacube attributes and validate format
def get_unit(ds, v):
    # get the units attribute if it is defined for this variable
    unit = ds[v].attrs.get('units')
    # check that the unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database
    # the datacube extension specifies: "the unit of measurement for the data,
    # preferably compliant to UDUNITS-2 units (singular)"
    # gdptools expects this format as well
    if unit is not None and not cfunits.Units(unit).isvalid:
        print(f"Unit {unit} is not valid as a UD unit.")
        unit = str(input(f"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database"))
        assert cfunits.Units(unit).isvalid
    return unit

def get_var_type(ds, v):
    if v in ds.coords:
        # type = auxiliary for a variable that contains coordinate data but isn't a dimension in cube:dimensions.
        # For example, the values of the datacube might be provided in the projected coordinate reference system,
        # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.
        var_type = 'auxiliary'
    else:
        # type = data for a variable indicating some measured value, for example "precipitation", "temperature", etc.
        var_type = 'data'
    return var_type
```
%% Cell type:code id:e9272931-fc0b-4f2a-9546-283033e9cde8 tags:
``` python
# drop the metpy_crs coordinate we added earlier (before listing variables,
# so it does not end up in the variable dictionary)
if 'metpy_crs' in ds.coords:
    ds = ds.drop_vars('metpy_crs')

# pull list of vars from dataset
vars = list(ds.variables)

# spec says that the keys of cube:dimensions and cube:variables should be unique together;
# a key like lat should not be both a dimension and a variable.
# we will drop all values in dims from vars
vars = [v for v in vars if v not in dims]

# Microsoft Planetary Computer includes coordinates and crs as variables here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
# we will keep those in the var list

# create dictionary of dataset variables and associated dimensions
vars_dict = {}
for v in vars:
    unit = get_unit(ds, v)
    var_type = get_var_type(ds, v)
    long_name = get_long_name(ds, v)
    vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions': list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})
```
%% Cell type:markdown id:11ad5352-884c-4472-8864-4570a96f66e5 tags:
### Finalize extension
%% Cell type:code id:10141fd4-91d6-491d-878b-02653720891d tags:
``` python
# add dimensions and variables to the collection extension
dc.apply(dimensions=dims_dict, variables=vars_dict)
```
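%% Cell type:markdown id: tags:
Optionally, the collection can be validated against the STAC and extension schemas before saving. This assumes pystac's validation extra is installed (`pip install pystac[validation]`).
%% Cell type:code id: tags:
``` python
# optional: raises if the collection does not conform to the STAC spec
# or to the schemas of any extensions it implements
collection.validate()
```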
%% Cell type:markdown id:615ca168-75fb-4135-9941-0ef5fe4fd1cb tags:
## Add STAC Collection to Catalog and Save
%% Cell type:code id:e2120a55-3d04-4122-a93f-29afcdb8cb1b tags:
``` python
# # helper to find items of wrong type
# d = collection.to_dict()
# def find_paths(nested_dict, prepath=()):
#     for k, v in nested_dict.items():
#         try:
#             path = prepath + (k,)
#             if type(v) is np.float64:  # found value
#                 yield path
#             elif hasattr(v, 'items'):  # v is a dict
#                 yield from find_paths(v, path)
#         except:
#             print(prepath)
# print(*find_paths(d))
```
%% Cell type:code id:4b75791b-6b2d-40be-b7c6-330a60888fb5 tags:
``` python
if catalog.get_child(collection_id):
    collection.normalize_and_save(root_href=os.path.join(catalog_path, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)
else:
    catalog.add_child(collection)
    catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)
```
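%% Cell type:markdown id: tags:
As an optional check, the saved catalog can be re-opened to confirm the collection was attached.
%% Cell type:code id: tags:
``` python
# optional: re-open the saved catalog and confirm the collection is now a child
catalog_check = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))
print(catalog_check.get_child(collection_id))
```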