Commit 5679b20f authored by Snyder, Amelia Marie

Merge branch 'alaska_et_2020_ccsm4_historical_simulation' into 'main'

update file paths for alaska_et_2020_ccsm4_historical_simulation

See merge request !19
%% Cell type:markdown id:6c10e07b-1e60-4926-af1d-fa75dc78e5d4 tags:
# CONUS404 Daily Zarr -> Collection Exploratory Workflow
This is a workflow for transforming the CONUS404 daily zarr dataset into a [STAC collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md). We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.
So that this workflow can scale to many datasets, a few simplifying assumptions are made:
1. For USGS data, we can use the CC0-1.0 license. For all other data we can use Unlicense. Ref: https://spdx.org/licenses/
2. All coordinates are assumed to be in the WGS84 datum if not otherwise specified.
%% Cell type:code id:201e0945-de55-45ff-b095-c2af009a4e62 tags:
``` python
import os
import json

import fsspec
import xarray as xr
import cf_xarray  # registers the .cf accessor used for axis detection below
import hvplot.xarray  # registers the .hvplot accessor used for plotting
import pandas as pd
import numpy as np
import metpy
import cartopy.crs as ccrs
import cfunits

import pystac
from pystac.extensions.datacube import DatacubeExtension
```
%% Cell type:markdown id:20b00e88-5a13-46b3-9787-d9ac2d4e7bd6 tags:
## Open up NHGF STAC Catalog
%% Cell type:code id:adf6c59d-58cd-48b1-a5fd-3bb205a3ef56 tags:
``` python
# define folder location where your STAC catalog json file is
catalog_path = os.path.join('..', '..', 'catalog')
# open catalog
catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))
```
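%% Cell type:markdown tags:
As an optional sanity check (an added aside, not part of the original workflow), you can confirm the catalog opened correctly by listing the collections it already contains with standard pystac calls:

``` python
# list the ids of collections already registered in this catalog
print([child.id for child in catalog.get_children()])
```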
%% Cell type:markdown id:996e60ba-13e4-453a-8534-e62ce747f0fa tags:
## Collection Metadata Input
%% Cell type:code id:482d204d-b5b6-40e5-ac42-55b459be1097 tags:
``` python
# name for STAC collection
collection_id = 'alaska_et_2020_ccsm4_historical_simulation'
# description of STAC collection
collection_description = 'alaska_et_2020_ccsm4_historical_simulation'
# license for dataset
collection_license = 'CC0-1.0'
```
%% Cell type:markdown id:116b5837-8e85-4ae7-964a-803533ded714 tags:
## Asset Metadata Input
%% Cell type:code id:dd6fa323-132a-4794-8c80-576933f547a0 tags:
``` python
# url to zarr store that you want to create a collection for
zarr_url = 's3://mdmf/gdp/alaska_et_2020_ccsm4_historical_simulation.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs = {"xarray:open_kwargs":{"chunks":{},"engine":"zarr","consolidated":True},
"xarray:storage_options": {"anon": True, "client_kwargs": {"endpoint_url":"https://usgs.osn.mghpcc.org/"}}}
# description for zarr url asset attached to collection (zarr_url)
asset_description = "Open Storage Network Pod S3 API access to collection zarr group"
# roles to tag zarr url asset with
asset_roles = ["data","zarr","s3"]
```
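%% Cell type:markdown tags:
These two dictionaries follow the [xarray-assets](https://github.com/stac-extensions/xarray-assets) convention, so a downstream client can open the asset with exactly the keyword arguments recorded on it. A minimal sketch (an illustrative addition, using the variables defined above):

``` python
# open the OSN asset the way a client consuming the xarray-assets fields would:
# pass xarray:storage_options to fsspec and xarray:open_kwargs to xarray
mapper = fsspec.get_mapper(zarr_url, **xarray_opendataset_kwargs["xarray:storage_options"])
ds_check = xr.open_dataset(mapper, **xarray_opendataset_kwargs["xarray:open_kwargs"])
```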
%% Cell type:code id:e1441cd4-e94c-4902-af46-8f1af470eb6b tags:
``` python
# url to zarr store that you want to create a collection for
zarr_url2 = 's3://nhgf-development/workspace/DataConversion/alaska_et_2020_ccsm4_historical_simulation.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs2 = {"xarray:open_kwargs":{"chunks":{},"engine":"zarr","consolidated":True},
"xarray:storage_options":{"requester_pays":True}}
# description for zarr url asset attached to collection (zarr_url)
asset_description2 = "S3 access to collection zarr group"
# roles to tag zarr url asset with
asset_roles2 = ["data","zarr","s3"]
```
%% Cell type:markdown id:b213b74f-ad17-4774-93b6-3b62be616b45 tags:
## Data Exploration
%% Cell type:code id:708f2cf5-79ab-49af-8067-de31d0d13ee6 tags:
``` python
# open and view zarr dataset
fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
backend_kwargs={'consolidated':True}, chunks={})
ds
```
%% Output
(abridged traceback) The saved output for this cell is an error from an earlier run that opened `fs2.get_mapper(zarr_url2)` with the anonymous OSN filesystem: s3fs raised `NoSuchBucket`, fsspec surfaced it as `FileNotFoundError`, and zarr ultimately failed with `KeyError: '.zmetadata'` while looking for the consolidated metadata.
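%% Cell type:markdown tags:
The error above came from pointing the anonymous OSN filesystem at `zarr_url2`, which (per its asset metadata) lives in a requester-pays bucket on AWS S3 rather than on the OSN pod. A hedged sketch of how that second store could be opened instead (assumes AWS credentials that permit requester-pays access):

``` python
# hypothetical: open the requester-pays copy of the store
# (requires AWS credentials allowed to pay request costs)
fs_rp = fsspec.filesystem('s3', requester_pays=True)
ds2 = xr.open_dataset(fs_rp.get_mapper(zarr_url2), engine='zarr',
                      backend_kwargs={'consolidated': True}, chunks={})
```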
%% Cell type:markdown id:0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc tags:
## Identify x, y, t dimensions of dataset
May require user input if dimensions cannot be auto-detected.
%% Cell type:code id:ab91268f-7200-4cb1-979a-c7d75531d2c0 tags:
``` python
dims_auto_extract = ['X', 'Y', 'T']

def extract_dim(ds, d):
    try:
        dim_list = ds.cf.axes[d]
        assert len(dim_list) == 1, f'There are too many {d} dimensions in this dataset.'
        dim = dim_list[0]
    except KeyError:
        print(f"Could not auto-extract {d} dimension name.")
        print("Look at the xarray output above showing the dataset dimensions.")
        dim = str(input(f"What is the name of the {d} dimension of this dataset?"))
        assert dim in ds.dims, "That is not a valid dimension name for this dataset"
    print(f"name of {d} dimension: {dim}\n")
    return dim

dim_names_dict = {}
for d in dims_auto_extract:
    dim_names_dict[d] = extract_dim(ds, d)
print(f"Dimension dictionary: {dim_names_dict}")
```
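%% Cell type:markdown tags:
For context (an illustrative aside): `cf_xarray` infers axes from CF metadata such as the `axis` and `standard_name` attributes, so the helper above depends on those attributes being present in the dataset. You can inspect what was detected directly:

``` python
# show the axis names cf_xarray detected from the CF metadata,
# e.g. something along the lines of {'X': ['x'], 'Y': ['y'], 'T': ['time']}
print(ds.cf.axes)
```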
%% Cell type:markdown id:8fbfecfb-9886-4d06-a34c-6471cb0a6053 tags:
## Plot a map
%% Cell type:code id:4eb4d027-4266-4a0b-8f16-bacfbef06242 tags:
``` python
# plot a map of a single variable
var_to_plot = 'SNOW'
da = ds[var_to_plot].sel(time='2014-03-01 00:00').load()
da.hvplot.quadmesh(x='lon', y='lat', rasterize=True,
geo=True, tiles='OSM', alpha=0.7, cmap='turbo')
```
%% Cell type:markdown id:5e057a6c-06fb-4406-823b-e81c58e520e4 tags:
## Plot a time series at a specific point
This can help you verify that a variable's values look reasonable.
%% Cell type:code id:c7de2681-88c2-4597-857c-8f169c596f8b tags:
``` python
# enter lat, lon of the point you want to plot a time series for
lat, lon = 39.978322, -105.2772194
time_start = '2013-01-01 00:00'
time_end = '2013-12-31 00:00'

ds = ds.metpy.parse_cf()
crs = ds[var_to_plot].metpy.cartopy_crs
x, y = crs.transform_point(lon, lat, src_crs=ccrs.PlateCarree())  # PlateCarree = lat/lon
da = ds[var_to_plot].sel(x=x, y=y, method='nearest').sel(time=slice(time_start, time_end)).load()
da.hvplot(x=dim_names_dict['T'], grid=True)
```
%% Cell type:markdown id:a8c3ed37-8564-400b-a7fb-25bd5e43d21c tags:
## Create Collection Extent
%% Cell type:markdown id:69f0d837-68a5-4fed-9a14-5d75cfbb0da4 tags:
### Spatial Extent
%% Cell type:code id:d46805e0-8e94-4ebe-aa01-d9a2d7051459 tags:
``` python
# pull out lat/lon bbox for data
# coordinates must be in the WGS84 datum
# order: left, bottom, right, top
# Note: depending on how the coordinates are loaded, you may need .compute() to realize the values;
# switch to the commented line below if the uncommented one fails
#coord_bounds = [ds.lon.data.min().compute().astype(float), ds.lat.data.min().compute().astype(float), ds.lon.data.max().compute().astype(float), ds.lat.data.max().compute().astype(float)]
coord_bounds = [ds.lon.data.min().astype(float).item(), ds.lat.data.min().astype(float).item(), ds.lon.data.max().astype(float).item(), ds.lat.data.max().astype(float).item()]
print(coord_bounds)

# create a spatial extent object
spatial_extent = pystac.SpatialExtent(bboxes=[coord_bounds])
```
%% Cell type:markdown id:a04c8fca-1d33-43ac-9e2b-62d7be2887f7 tags:
### Temporal Extent
%% Cell type:code id:41a84995-867c-4152-8c57-85e3758bbb77 tags:
``` python
# pull out first and last timestamps
temporal_extent_lower = pd.Timestamp(ds[dim_names_dict['T']].data.min())
temporal_extent_upper = pd.Timestamp(ds[dim_names_dict['T']].data.max())
print(f'min: {temporal_extent_lower} \nmax: {temporal_extent_upper}')
# create a temporal extent object
temporal_extent = pystac.TemporalExtent(intervals=[[temporal_extent_lower, temporal_extent_upper]])
```
%% Cell type:code id:1b1e37c4-5348-46ad-abc9-e005b5d6c02b tags:
``` python
collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)
```
%% Cell type:markdown id:cfb71202-03df-45b5-ac2f-0dc2ee1ab780 tags:
## Create pystac collection
%% Cell type:code id:7e96811b-95ae-406a-9728-55fc429d4e1f tags:
``` python
if catalog.get_child(collection_id):
    collection = catalog.get_child(collection_id)
    print("existing collection opened")
    collection.extent = collection_extent
    collection.description = collection_description
    collection.license = collection_license
else:
    collection = pystac.Collection(id=collection_id,
                                   description=collection_description,
                                   extent=collection_extent,
                                   license=collection_license)
    print("new collection created")
```
%% Cell type:markdown id:a21c76e8-cd57-4eb5-a33f-7c668a3b3205 tags:
## Add zarr url asset to collection
%% Cell type:code id:094832af-d22b-4359-b0f6-cf687acce5cc tags:
``` python
asset_id = "zarr-s3-osn"
asset = pystac.Asset(href=zarr_url,
description=asset_description,
media_type="application/vnd+zarr",
roles=asset_roles,
extra_fields = xarray_opendataset_kwargs)
collection.add_asset(asset_id, asset)
```
%% Cell type:code id:0c298d07-f234-4a08-986d-87f4a39e9ae6 tags:
``` python
asset_id2 = "zarr-s3"
asset2 = pystac.Asset(href=zarr_url2,
description=asset_description2,
media_type="application/vnd+zarr",
roles=asset_roles2,
extra_fields = xarray_opendataset_kwargs2)
collection.add_asset(asset_id2, asset2)
```
%% Cell type:markdown id:f67cd5c9-db33-45c2-bc21-480cd67354f4 tags:
## Add datacube extension to collection
%% Cell type:code id:fc00946d-2880-491d-9b3b-3aeeb4414d6c tags:
``` python
# instantiate extension on collection
dc = DatacubeExtension.ext(collection, add_if_missing=True)
```
%% Cell type:markdown id:8bdd77a2-7587-485e-afb7-42af3a822241 tags:
### Add cube dimensions (required field for extension)
%% Cell type:code id:120a4914-3302-44a5-a282-0308ac84f040 tags:
``` python
# list out dataset dimensions
# (when xarray writes a Zarr store it records each variable's dimension names in the hidden
# _ARRAY_DIMENSIONS attribute and reads that attribute back when opening the store,
# so ds.dims reflects the dimensions stored in the Zarr group)
dims = list(ds.dims)
print(dims)
```
%% Cell type:code id:00a18a29-fb9a-4b56-8009-493122997b16 tags:
``` python
# get x, y bounds for extent of those dimensions (required)
xy_bounds = [ds[dim_names_dict['X']].data.min().astype(float).item(), ds[dim_names_dict['Y']].data.min().astype(float).item(), ds[dim_names_dict['X']].data.max().astype(float).item(), ds[dim_names_dict['Y']].data.max().astype(float).item()]
print(xy_bounds)
```
%% Cell type:code id:0f49d6d7-9e30-4144-909b-fa1238e6c77a tags:
``` python
def get_step(dim_name):
    dim_vals = ds[dim_name].values
    diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]
    unique_steps = np.unique(diffs)
    # set step only if all steps are the same length;
    # the datacube spec specifies using null for irregularly spaced steps
    if len(unique_steps) == 1:
        step = unique_steps[0].astype(float).item()
    else:
        step = None
    return step
```
%% Cell type:code id:a20d12bf-a511-4c5e-84d0-77e2ec551518 tags:
``` python
def get_long_name(ds, v):
    # try to get the long_name attribute from the variable
    try:
        long_name = ds[v].attrs['long_name']
    # otherwise, leave empty
    except KeyError:
        long_name = None
    return long_name
```
%% Cell type:markdown id:e7dc357c-91ec-49ae-83e5-400f791f9792 tags:
#### user input needed - you will need to look at the crs information and create a cartopy crs object after identifying the projection type:
reference list of cartopy projections: https://scitools.org.uk/cartopy/docs/latest/reference/projections.html
%% Cell type:code id:ea452f62-5644-49b6-8a4e-7dc4f649fd1a tags:
``` python
# print out the crs information in the dataset
crs_info = ds.crs
print(crs_info)
```
%% Cell type:code id:1b1d05ff-8e43-44a7-8343-178b112c4ad6 tags:
``` python
# create the appropriate cartopy projection
lcc = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian,
                            central_latitude=crs_info.latitude_of_projection_origin,
                            standard_parallels=crs_info.standard_parallel)
# the datacube extension can accept reference_system information as a numerical EPSG code,
# WKT2 (ISO 19162) string, or PROJJSON object.
# we will use PROJJSON, as was done by Microsoft Planetary Computer here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
projjson = json.loads(lcc.to_json())
```
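%% Cell type:markdown tags:
An alternative sketch (an added suggestion, not part of the original workflow; assumes `pyproj` is installed): the CF grid-mapping attributes on `crs` can be converted to PROJJSON directly, which avoids hand-picking a cartopy projection class:

``` python
# build a CRS from the CF grid-mapping attributes and export it as PROJJSON
from pyproj import CRS
projjson_alt = json.loads(CRS.from_cf(crs_info.attrs).to_json())
```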
%% Cell type:markdown id:00a5e041-081d-428d-ac2e-75d16de205e6 tags:
#### user input needed - you will need to copy all of the dimensions from above into the dict and fill in the appropriate attributes (type, axis, extent):
%% Cell type:code id:5a443497-67a9-4dce-a8e9-b08d31a88223 tags:
``` python
# create a dictionary of datacube dimensions you would like to assign to this dataset
# dimension name should come from the coordinates printed above
# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)
# note that the extent of each dimension should be pulled from the dataset
dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': pd.Timedelta(get_step('time')).isoformat()}),
'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': get_step('x'), 'reference_system': projjson}),
'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': get_step('y'), 'reference_system': projjson}),
'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top_stag')}),
'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top')}),
'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'soil_layers_stag')}),
'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, 'x_stag')}),
'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y_stag')}),
'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snow_layers_stag')}),
'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snso_layers_stag')}),
}
```
%% Cell type:markdown id:0f277883-a3fd-425f-966a-ca2140d0ef2f tags:
### Add cube variables (optional field for extension)
%% Cell type:code id:92510876-7853-4d24-8563-c69f9012aeb6 tags:
``` python
# define functions to pull out datacube attributes and validate their format
def get_unit(ds, v):
    # check if a unit is defined for the variable
    try:
        unit = ds[v].attrs['units']
    except KeyError:
        unit = None
    # check that the unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database
    # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).
    # gdptools expects this format as well
    if unit is not None and not cfunits.Units(unit).isvalid:
        print(f"Unit '{unit}' is not valid as a UD unit.")
        unit = str(input(f"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database"))
        assert cfunits.Units(unit).isvalid
    return unit

def get_var_type(ds, v):
    if v in ds.coords:
        # type = auxiliary for a variable that contains coordinate data but isn't a dimension in cube:dimensions.
        # For example, the values of the datacube might be provided in the projected coordinate reference system,
        # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.
        var_type = 'auxiliary'
    # type = data for a variable indicating some measured value, for example "precipitation", "temperature", etc.
    else:
        var_type = 'data'
    return var_type
```
%% Cell type:code id:e9272931-fc0b-4f2a-9546-283033e9cde8 tags:
``` python
# drop the metpy_crs coordinate we added with parse_cf so it is not cataloged as a variable
if 'metpy_crs' in ds.coords:
    ds = ds.drop_vars('metpy_crs')

# pull list of vars from dataset
vars = list(ds.variables)

# spec says that the keys of cube:dimensions and cube:variables should be unique together; a key like lat should not be both a dimension and a variable.
# we will drop all values in dims from vars
vars = [v for v in vars if v not in dims]

# Microsoft Planetary Computer includes coordinates and crs as variables here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
# we will keep those in the var list

# create dictionary of dataset variables and associated dimensions
vars_dict = {}
for v in vars:
    unit = get_unit(ds, v)
    var_type = get_var_type(ds, v)
    long_name = get_long_name(ds, v)
    vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions': list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})
```
%% Cell type:markdown id:11ad5352-884c-4472-8864-4570a96f66e5 tags:
### Finalize extension
%% Cell type:code id:10141fd4-91d6-491d-878b-02653720891d tags:
``` python
# add dimensions and variables to collection extension
dc.apply(dimensions=dims_dict, variables=vars_dict)
```
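%% Cell type:markdown tags:
Optionally (an added suggestion, not part of the original workflow; requires pystac's validation extras, i.e. `jsonschema` installed), the collection can be checked against the STAC JSON schemas before it is saved:

``` python
# validate the collection against the STAC schemas; raises an exception if anything is malformed
collection.validate()
```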
%% Cell type:markdown id:615ca168-75fb-4135-9941-0ef5fe4fd1cb tags:
## Add STAC Collection to Catalog and Save
%% Cell type:code id:e2120a55-3d04-4122-a93f-29afcdb8cb1b tags:
``` python
# # helper to find items of wrong type
# d = collection.to_dict()
# def find_paths(nested_dict, prepath=()):
# for k, v in nested_dict.items():
# try:
# path = prepath + (k,)
# if type(v) is np.float64: # found value
# yield path
# elif hasattr(v, 'items'): # v is a dict
# yield from find_paths(v, path)
# except:
# print(prepath)
# print(*find_paths(d))
```
%% Cell type:code id:4b75791b-6b2d-40be-b7c6-330a60888fb5 tags:
``` python
if catalog.get_child(collection_id):
    collection.normalize_and_save(root_href=os.path.join(catalog_path, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)
else:
    catalog.add_child(collection)
    catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)
```
%% Cell type:code id:d6f676b5-e892-4bfb-8d73-2828addd838c tags:
``` python
```