%% Cell type:markdown id:6c10e07b-1e60-4926-af1d-fa75dc78e5d4 tags:
# CONUS404 Daily Zarr -> Collection Exploratory Workflow
This is a workflow for transforming the CONUS404 daily zarr dataset into a [STAC collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md). We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.
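For reference, the finished collection will include a `cube:dimensions` object along these lines (a hedged sketch with placeholder values; the real entries are derived from the dataset later in this notebook):

``` python
# illustrative sketch only - placeholder extents/steps, not values from this dataset
cube_dimensions_example = {
    "time": {"type": "temporal", "extent": ["1979-10-01T00:00:00Z", "2020-09-30T00:00:00Z"], "step": "P1DT0H0M0S"},
    "x": {"type": "spatial", "axis": "x", "extent": [-2700000.0, 2700000.0], "step": 4000.0},
    "y": {"type": "spatial", "axis": "y", "extent": [-2000000.0, 2000000.0], "step": 4000.0},
}
```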
To keep this workflow simple enough to scale to many datasets, we make a few simplifying assumptions:
1. For USGS data, we can use the CC0-1.0 license. For all other data, we can use the Unlicense. Ref: https://spdx.org/licenses/
2. All coordinates are assumed to be in the WGS84 datum unless otherwise specified.
%% Cell type:code id:201e0945-de55-45ff-b095-c2af009a4e62 tags:
``` python
import json
import os

import cartopy.crs as ccrs
import cf_xarray  # registers the .cf accessor on xarray objects
import cfunits
import fsspec
import hvplot.xarray  # registers the .hvplot accessor
import metpy
import numpy as np
import pandas as pd
import pystac
import xarray as xr
from pystac.extensions.datacube import DatacubeExtension
```
%% Cell type:markdown id:20b00e88-5a13-46b3-9787-d9ac2d4e7bd6 tags:
## Open up NHGF STAC Catalog
%% Cell type:code id:adf6c59d-58cd-48b1-a5fd-3bb205a3ef56 tags:
``` python
# define folder location where your STAC catalog json file is
catalog_path = os.path.join('..', '..', 'catalog')
# open catalog
catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))
```
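%% Cell type:markdown id: tags:
If no `catalog.json` exists at that path yet, a minimal self-contained catalog can be created first. This is a hedged sketch: the id and description are placeholders, not part of the original workflow.
%% Cell type:code id: tags:
``` python
# only needed if the catalog does not exist yet - placeholder id/description
# catalog = pystac.Catalog(id='sample-catalog', description='Sample STAC catalog')
# catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)
```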
%% Cell type:markdown id:996e60ba-13e4-453a-8534-e62ce747f0fa tags:
## Collection Metadata Input
%% Cell type:code id:482d204d-b5b6-40e5-ac42-55b459be1097 tags:
``` python
# name for STAC collection
collection_id = 'conus404-daily'
# description of STAC collection
collection_description = 'CONUS404 40 years of daily values for subset of model output variables derived from hourly values on cloud storage'
# license for dataset
collection_license = 'CC0-1.0'
```
%% Cell type:markdown id:116b5837-8e85-4ae7-964a-803533ded714 tags:
## Asset Metadata Input
%% Cell type:code id:dd6fa323-132a-4794-8c80-576933f547a0 tags:
``` python
# url to zarr store that you want to create a collection for
zarr_url = 's3://hytest/conus404/conus404_daily.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs = {"xarray:open_kwargs": {"chunks": {}, "engine": "zarr", "consolidated": True},
                             "xarray:storage_options": {"anon": True, "client_kwargs": {"endpoint_url": "https://usgs.osn.mghpcc.org/"}}}
# description for zarr url asset attached to collection (zarr_url)
asset_description = "Open Storage Network Pod S3 API access to collection zarr group"
# roles to tag zarr url asset with
asset_roles = ["data","zarr","s3"]
```
%% Cell type:code id:e1441cd4-e94c-4902-af46-8f1af470eb6b tags:
``` python
# url to zarr store that you want to create a collection for
zarr_url2 = 's3://nhgf-development/conus404/conus404_daily_202210.zarr/'
# define keyword arguments needed for opening the dataset with xarray
# ref: https://github.com/stac-extensions/xarray-assets
xarray_opendataset_kwargs2 = {"xarray:open_kwargs": {"chunks": {}, "engine": "zarr", "consolidated": True},
                              "xarray:storage_options": {"requester_pays": True}}
# description for zarr url asset attached to collection (zarr_url)
asset_description2 = "S3 access to collection zarr group"
# roles to tag zarr url asset with
asset_roles2 = ["data","zarr","s3"]
```
%% Cell type:markdown id:b213b74f-ad17-4774-93b6-3b62be616b45 tags:
## Data Exploration
%% Cell type:code id:708f2cf5-79ab-49af-8067-de31d0d13ee6 tags:
``` python
# open and view zarr dataset
fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')
ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
backend_kwargs={'consolidated':True}, chunks={})
ds
```
%% Cell type:markdown id:0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc tags:
## Identify x, y, t dimensions of dataset
May require user input if dimensions cannot be auto-detected.
%% Cell type:code id:ab91268f-7200-4cb1-979a-c7d75531d2c0 tags:
``` python
dims_auto_extract = ['X', 'Y', 'T']

def extract_dim(ds, d):
    try:
        dim_list = ds.cf.axes[d]
        assert len(dim_list) == 1, f'There are too many {d} dimensions in this dataset.'
        dim = dim_list[0]
    except KeyError:
        print(f"Could not auto-extract {d} dimension name.")
        print("Look at the xarray output above showing the dataset dimensions.")
        dim = str(input(f"What is the name of the {d} dimension of this dataset?"))
        assert dim in ds.dims, "That is not a valid dimension name for this dataset"
    print(f"name of {d} dimension: {dim}\n")
    return dim

dim_names_dict = {}
for d in dims_auto_extract:
    dim_names_dict[d] = extract_dim(ds, d)
print(f"Dimension dictionary: {dim_names_dict}")
```
%% Cell type:markdown id:810d7480-165d-41c0-bd09-163656a14003 tags:
## Get crs info
%% Cell type:code id:b03d52f3-1367-4255-a561-52ee4fc9e92d tags:
``` python
# parse the CF metadata so metpy can construct the dataset's coordinate reference system
ds = ds.metpy.parse_cf()
# pull the cartopy CRS off the first data variable (parse_cf attaches it to each variable)
crs = ds[list(ds.keys())[0]].metpy.cartopy_crs
```
%% Cell type:markdown id:8fbfecfb-9886-4d06-a34c-6471cb0a6053 tags:
## Plot a map
%% Cell type:code id:4eb4d027-4266-4a0b-8f16-bacfbef06242 tags:
``` python
# plot a map of a single variable
var_to_plot = 'SNOW'
da = ds[var_to_plot].sel(time='2014-03-01 00:00').load()
da.hvplot.quadmesh(x='lon', y='lat', rasterize=True,
geo=True, tiles='OSM', alpha=0.7, cmap='turbo')
```
%% Cell type:markdown id:5e057a6c-06fb-4406-823b-e81c58e520e4 tags:
## Plot a time series at a specific point
This can help you verify a variable's values.
%% Cell type:code id:c7de2681-88c2-4597-857c-8f169c596f8b tags:
``` python
# enter lat, lon of the point you want to plot a time series for
lat, lon = 39.978322, -105.2772194
time_start = '2013-01-01 00:00'
time_end = '2013-12-31 00:00'

# transform the lat/lon point into the dataset's projected coordinates
# (cartopy's PlateCarree is the lon/lat coordinate system)
x, y = crs.transform_point(lon, lat, src_crs=ccrs.PlateCarree())
da = ds[var_to_plot].sel({dim_names_dict['X']: x, dim_names_dict['Y']: y}, method='nearest').sel({dim_names_dict['T']: slice(time_start, time_end)}).load()
da.hvplot(x=dim_names_dict['T'], grid=True)
```
%% Cell type:markdown id:a8c3ed37-8564-400b-a7fb-25bd5e43d21c tags:
## Create Collection Extent
%% Cell type:markdown id:69f0d837-68a5-4fed-9a14-5d75cfbb0da4 tags:
### Spatial Extent
%% Cell type:code id:d46805e0-8e94-4ebe-aa01-d9a2d7051459 tags:
``` python
# pull out lat/lon bbox for data
# coordinates must be in the WGS 84 datum
# bbox order: left, bottom, right, top
# Note: the bbox values must end up as plain python floats rather than numpy floats,
# and which conversion chain works can vary by dataset. If the uncommented line below
# fails for you, switch to one of the commented alternatives.
#coord_bounds = [ds.lon.data.min().compute().astype(float), ds.lat.data.min().compute().astype(float), ds.lon.data.max().compute().astype(float), ds.lat.data.max().compute().astype(float)]
#coord_bounds = [ds.lon.data.min().compute().astype(float).tolist(), ds.lat.data.min().compute().astype(float).tolist(), ds.lon.data.max().compute().astype(float).tolist(), ds.lat.data.max().compute().astype(float).tolist()]
coord_bounds = [ds.lon.data.min().astype(float).item(), ds.lat.data.min().astype(float).item(), ds.lon.data.max().astype(float).item(), ds.lat.data.max().astype(float).item()]
print(coord_bounds)
# create a spatial extent object
spatial_extent = pystac.SpatialExtent(bboxes=[coord_bounds])
```
%% Cell type:markdown id:a04c8fca-1d33-43ac-9e2b-62d7be2887f7 tags:
### Temporal Extent
%% Cell type:code id:41a84995-867c-4152-8c57-85e3758bbb77 tags:
``` python
# pull out first and last timestamps
temporal_extent_lower = pd.Timestamp(ds[dim_names_dict['T']].data.min())
temporal_extent_upper = pd.Timestamp(ds[dim_names_dict['T']].data.max())
print(f'min: {temporal_extent_lower} \nmax: {temporal_extent_upper}')
# create a temporal extent object
temporal_extent = pystac.TemporalExtent(intervals=[[temporal_extent_lower, temporal_extent_upper]])
```
%% Cell type:code id:1b1e37c4-5348-46ad-abc9-e005b5d6c02b tags:
``` python
collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)
```
%% Cell type:markdown id:cfb71202-03df-45b5-ac2f-0dc2ee1ab780 tags:
## Create pystac collection
%% Cell type:code id:7e96811b-95ae-406a-9728-55fc429d4e1f tags:
``` python
if catalog.get_child(collection_id):
    collection = catalog.get_child(collection_id)
    print("existing collection opened")
    collection.extent = collection_extent
    collection.description = collection_description
    collection.license = collection_license
else:
    collection = pystac.Collection(id=collection_id,
                                   description=collection_description,
                                   extent=collection_extent,
                                   license=collection_license)
    print("new collection created")
```
%% Cell type:markdown id:a21c76e8-cd57-4eb5-a33f-7c668a3b3205 tags:
## Add zarr url asset to collection
%% Cell type:code id:094832af-d22b-4359-b0f6-cf687acce5cc tags:
``` python
asset_id = "zarr-s3-osn"
asset = pystac.Asset(href=zarr_url,
                     description=asset_description,
                     media_type="application/vnd+zarr",
                     roles=asset_roles,
                     extra_fields=xarray_opendataset_kwargs)
collection.add_asset(asset_id, asset)
```
%% Cell type:code id:0c298d07-f234-4a08-986d-87f4a39e9ae6 tags:
``` python
asset_id2 = "zarr-s3"
asset2 = pystac.Asset(href=zarr_url2,
                      description=asset_description2,
                      media_type="application/vnd+zarr",
                      roles=asset_roles2,
                      extra_fields=xarray_opendataset_kwargs2)
collection.add_asset(asset_id2, asset2)
```
%% Cell type:markdown id:f67cd5c9-db33-45c2-bc21-480cd67354f4 tags:
## Add datacube extension to collection
%% Cell type:code id:fc00946d-2880-491d-9b3b-3aeeb4414d6c tags:
``` python
# instantiate the extension on the collection
dc = DatacubeExtension.ext(collection, add_if_missing=True)
```
%% Cell type:markdown id:8bdd77a2-7587-485e-afb7-42af3a822241 tags:
### Add cube dimensions (required field for extension)
%% Cell type:code id:120a4914-3302-44a5-a282-0308ac84f040 tags:
``` python
# list out dataset dimensions
# (when xarray writes a dataset to Zarr, it records each variable's dimension names
# in an _ARRAY_DIMENSIONS attribute on the array; that is how they are recovered on read)
dims = list(ds.dims)
print(dims)
```
%% Cell type:code id:00a18a29-fb9a-4b56-8009-493122997b16 tags:
``` python
# get x, y bounds for extent of those dimensions (required)
xy_bounds = [ds[dim_names_dict['X']].data.min().astype(float).item(),
             ds[dim_names_dict['Y']].data.min().astype(float).item(),
             ds[dim_names_dict['X']].data.max().astype(float).item(),
             ds[dim_names_dict['Y']].data.max().astype(float).item()]
print(xy_bounds)
```
%% Cell type:code id:0f49d6d7-9e30-4144-909b-fa1238e6c77a tags:
``` python
def get_step(ds, dim_name):
    # find the spacing between consecutive values along a dimension
    dim_vals = ds[dim_name].values
    diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]
    unique_steps = np.unique(diffs)
    # only set a step if all steps are the same length;
    # the datacube spec specifies to use null for irregularly spaced steps
    if len(unique_steps) == 1:
        step = unique_steps[0]
        if np.issubdtype(unique_steps.dtype, np.timedelta64):
            # time steps are np.timedelta64 - normalize to a float count of nanoseconds
            step = step.astype('timedelta64[ns]').astype(float).item()
        else:
            step = float(step)
    else:
        step = None
    return step
```
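%% Cell type:markdown id: tags:
As a quick sanity check, `get_step` can be run on the dimensions identified earlier. This is a hedged usage sketch; the values in the comment are illustrative, not recorded output.
%% Cell type:code id: tags:
``` python
# a regularly spaced projected dimension yields a plain float (e.g. 4000.0 metres),
# while the time dimension yields a step in nanoseconds (e.g. 8.64e13 for one day)
print(get_step(ds, dim_names_dict['X']))
print(get_step(ds, dim_names_dict['T']))
```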
%% Cell type:code id:a20d12bf-a511-4c5e-84d0-77e2ec551518 tags:
``` python
def get_long_name(ds, v):
    # return the variable's long_name attribute, or None if it is not defined
    return ds[v].attrs.get('long_name')
```
%% Cell type:markdown id:e7dc357c-91ec-49ae-83e5-400f791f9792 tags:
#### User input needed: look at the CRS information and create a matching cartopy CRS object after identifying the projection type
Reference list of cartopy projections: https://scitools.org.uk/cartopy/docs/latest/reference/projections.html
%% Cell type:code id:ea452f62-5644-49b6-8a4e-7dc4f649fd1a tags:
``` python
# print out the crs information stored in the dataset
crs_info = ds.crs
print(crs_info)
```
%% Cell type:code id:1b1d05ff-8e43-44a7-8343-178b112c4ad6 tags:
``` python
# create the appropriate cartopy projection
lcc = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian,
                            central_latitude=crs_info.latitude_of_projection_origin,
                            standard_parallels=crs_info.standard_parallel)
# the datacube extension can accept reference_system information as a numerical EPSG code,
# WKT2 (ISO 19162) string or PROJJSON object.
# we will use PROJJSON, as was done by Microsoft Planetary Computer here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
projjson = json.loads(lcc.to_json())
# alternatively, I think we could do this:
#projjson = json.loads(crs.to_json())
```
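%% Cell type:markdown id: tags:
If the dataset used a different grid mapping, the same pattern applies: pick the matching class from the cartopy reference list and pass in the corresponding CF attributes. A hedged sketch for polar stereographic (the attribute names follow CF conventions and are not fields of this dataset):
%% Cell type:code id: tags:
``` python
# illustrative only - a polar stereographic grid mapping would translate like this:
# stereo = ccrs.Stereographic(central_latitude=crs_info.latitude_of_projection_origin,
#                             central_longitude=crs_info.straight_vertical_longitude_from_pole,
#                             true_scale_latitude=crs_info.standard_parallel)
# projjson = json.loads(stereo.to_json())
```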
%% Cell type:markdown id:00a5e041-081d-428d-ac2e-75d16de205e6 tags:
#### User input needed: copy all of the dimensions from above into the dict and fill in the appropriate attributes (type, axis, extent):
%% Cell type:code id:5a443497-67a9-4dce-a8e9-b08d31a88223 tags:
``` python
# create a dictionary of datacube dimensions you would like to assign to this dataset
# dimension name should come from the coordinates printed above
# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)
# note that the extent of each dimension should be pulled from the dataset
dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%H:%M:%SZ'), temporal_extent_upper.strftime('%Y-%m-%dT%H:%M:%SZ')], 'step': pd.Timedelta(get_step(ds, 'time')).isoformat()}),
             'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': get_step(ds, 'x'), 'reference_system': projjson}),
             'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': get_step(ds, 'y'), 'reference_system': projjson}),
             'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top_stag')}),
             'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top')}),
             'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'soil_layers_stag')}),
             'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, 'x_stag')}),
             'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y_stag')}),
             'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snow_layers_stag')}),
             'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snso_layers_stag')}),
             }
```
%% Cell type:markdown id:0f277883-a3fd-425f-966a-ca2140d0ef2f tags:
### Add cube variables (optional field for extension)
%% Cell type:code id:92510876-7853-4d24-8563-c69f9012aeb6 tags:
``` python
# define functions to pull out datacube attributes and validate format
def get_unit(ds, v):
    # get the units attribute if it is defined for this variable
    unit = ds[v].attrs.get('units')
    # check that the unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database
    # the datacube extension specifies: "the unit of measurement for the data,
    # preferably compliant to UDUNITS-2 units (singular)"
    # gdptools expects this format as well
    if unit is not None and not cfunits.Units(unit).isvalid:
        print(f"Unit {unit} is not valid as a UD unit.")
        unit = str(input(f"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database"))
        assert cfunits.Units(unit).isvalid
    return unit

def get_var_type(ds, v):
    if v in ds.coords:
        # type = auxiliary for a variable that contains coordinate data but isn't a dimension in cube:dimensions.
        # For example, the values of the datacube might be provided in the projected coordinate reference system,
        # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.
        var_type = 'auxiliary'
    else:
        # type = data for a variable indicating some measured value, for example "precipitation", "temperature", etc.
        var_type = 'data'
    return var_type
```
%% Cell type:code id:e9272931-fc0b-4f2a-9546-283033e9cde8 tags:
``` python
# drop the metpy_crs coordinate we added earlier (before listing variables,
# so it does not end up in the variable dictionary)
if 'metpy_crs' in ds.coords:
    ds = ds.drop_vars('metpy_crs')

# pull list of vars from dataset
vars = list(ds.variables)

# spec says that the keys of cube:dimensions and cube:variables should be unique together;
# a key like lat should not be both a dimension and a variable.
# we will drop all values in dims from vars
vars = [v for v in vars if v not in dims]

# Microsoft Planetary Computer includes coordinates and crs as variables here:
# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na
# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na
# we will keep those in the var list

# create dictionary of dataset variables and associated dimensions
vars_dict = {}
for v in vars:
    unit = get_unit(ds, v)
    var_type = get_var_type(ds, v)
    long_name = get_long_name(ds, v)
    vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions': list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})
```
%% Cell type:markdown id:11ad5352-884c-4472-8864-4570a96f66e5 tags:
### Finalize extension
%% Cell type:code id:10141fd4-91d6-491d-878b-02653720891d tags:
``` python
# add dimensions and variables to the collection extension
dc.apply(dimensions=dims_dict, variables=vars_dict)
```
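%% Cell type:markdown id: tags:
Optionally, the collection can be validated against the STAC and extension schemas before saving. This assumes pystac's validation extra is installed (`pip install pystac[validation]`).
%% Cell type:code id: tags:
``` python
# optional: raises if the collection does not conform to the STAC spec
# or to the schemas of any extensions it implements
collection.validate()
```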
%% Cell type:markdown id:615ca168-75fb-4135-9941-0ef5fe4fd1cb tags:
## Add STAC Collection to Catalog and Save
%% Cell type:code id:e2120a55-3d04-4122-a93f-29afcdb8cb1b tags:
``` python
# # helper to find items of wrong type
# d = collection.to_dict()
# def find_paths(nested_dict, prepath=()):
#     for k, v in nested_dict.items():
#         try:
#             path = prepath + (k,)
#             if type(v) is np.float64:  # found value
#                 yield path
#             elif hasattr(v, 'items'):  # v is a dict
#                 yield from find_paths(v, path)
#         except:
#             print(prepath)
# print(*find_paths(d))
```
%% Cell type:code id:4b75791b-6b2d-40be-b7c6-330a60888fb5 tags:
``` python
if catalog.get_child(collection_id):
    collection.normalize_and_save(root_href=os.path.join(catalog_path, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)
else:
    catalog.add_child(collection)
    catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)
```
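%% Cell type:markdown id: tags:
As an optional check, the saved catalog can be re-opened to confirm the collection was attached.
%% Cell type:code id: tags:
``` python
# optional: re-open the saved catalog and confirm the collection is now a child
catalog_check = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))
print(catalog_check.get_child(collection_id))
```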