From 0e70e1380a1d5528b326bb9b785caf1f3c86e71f Mon Sep 17 00:00:00 2001
From: amsnyder <asnyder@usgs.gov>
Date: Thu, 2 Feb 2023 21:43:53 -0600
Subject: [PATCH] finalize questions in c404 workflows

---
 catalog/conus404-daily/collection.json        | 51 +++--------
 ..._exploratory_workflow_conus404-daily.ipynb | 90 ++++++++-----------
 2 files changed, 51 insertions(+), 90 deletions(-)

diff --git a/catalog/conus404-daily/collection.json b/catalog/conus404-daily/collection.json
index a681315a..ec33ffcd 100644
--- a/catalog/conus404-daily/collection.json
+++ b/catalog/conus404-daily/collection.json
@@ -26,56 +26,49 @@
         "2021-09-25T00:00:00Z"
       ]
     },
-    "lon": {
+    "x": {
       "type": "spatial",
       "axis": "x",
       "extent": [
-        -138.73135375976562,
-        -57.068634033203125
+        -2732097.901153542,
+        2731902.098846458
       ]
     },
-    "lat": {
+    "y": {
       "type": "spatial",
       "axis": "y",
       "extent": [
-        17.647308349609375,
-        57.34341812133789
+        -2027960.8996368449,
+        2028039.1003631551
       ]
     },
     "bottom_top_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "z"
     },
     "bottom_top": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "z"
     },
     "soil_layers_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "z"
     },
     "x_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "x"
     },
     "y_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "y"
     },
     "snow_layers_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "z"
     },
     "snso_layers_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "z"
     }
   },
   "cube:variables": {
@@ -1243,24 +1236,6 @@
         "x"
       ],
       "type": "data"
-    },
-    "time": {
-      "dimensions": [
-        "time"
-      ],
-      "type": "data"
-    },
-    "x": {
-      "dimensions": [
-        "x"
-      ],
-      "type": "data"
-    },
-    "y": {
-      "dimensions": [
-        "y"
-      ],
-      "type": "data"
-    }
     }
   },
   "assets": {
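The "cube:dimensions" block above now describes the dataset on its native projected x/y grid instead of derived lon/lat values, so the x/y extents are in the dataset's projected coordinate units rather than degrees. A quick local sanity check of the revised metadata could look like the following Python sketch (stdlib only; it assumes the file sits at the path shown in the diff):

    import json

    # load the collection metadata as committed
    with open("catalog/conus404-daily/collection.json") as f:
        collection = json.load(f)

    # print each dimension's axis and extent; x/y extents should now be
    # large projected-coordinate values, not small degree values
    for name, dim in collection["cube:dimensions"].items():
        print(name, dim.get("axis"), dim.get("extent"))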
diff --git a/workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb b/workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb
index 0d5a275b..3d3b2fb6 100644
--- a/workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb
+++ b/workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb
@@ -12,9 +12,7 @@
     "\n",
     "To simplify this workflow so that it can scale to many datasets, a few simplifying suggestions and assumptions are made:\n",
     "1. For USGS data, we can use the CC0-1.0 license. For all other data we can use Unlicense. Ref: https://spdx.org/licenses/\n",
-    "2. I am assuming all coordinates are from the WGS84 datum if not specified.\n",
-    "\n",
-    "Note that some work needs to be done for this particular dataset's dimension metadata. I need to collect more information the z dataset dimensions. Also, I currently use lon/lat as dimensions, but x/y are also available (and are actually dataset dimensions). I am still deciding if I will use x/y or lon/lat. I will update this workflow when those details are finalized."
+    "2. I am assuming all coordinates are from the WGS84 datum if not specified."
    ]
   },
   {
@@ -115,20 +113,6 @@
     "ds"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f49f6d7e-af4f-471f-9a32-5db7d3b2674b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# plot a map of a single variable\n",
-    "var_to_plot = 'SNOW'\n",
-    "da = ds[var_to_plot].sel(time='2014-03-01 00:00').load()\n",
-    "da.hvplot.quadmesh(x='lon', y='lat', rasterize=True, \n",
-    "                   geo=True, tiles='OSM', alpha=0.7, cmap='turbo')"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "a8c3ed37-8564-400b-a7fb-25bd5e43d21c",
@@ -155,7 +139,7 @@
     "# pull out lat/lon bbox for data\n",
     "# coordinates must be from WGS 84 datum\n",
     "# left, bottom, right, top\n",
-    "coord_bounds = [ds.lon.data.min().astype(float), ds.lat.data.min().astype(float), ds.lon.data.max().astype(float), ds.lat.data.max().astype(float)]\n",
+    "coord_bounds = [ds.lon.data.min().compute().astype(float), ds.lat.data.min().compute().astype(float), ds.lon.data.max().compute().astype(float), ds.lat.data.max().compute().astype(float)]\n",
     "print(coord_bounds)\n",
     "# create a spatial extent object \n",
     "spatial_extent = pystac.SpatialExtent(bboxes=[coord_bounds])"
    ]
   },
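The .compute() calls added to coord_bounds matter whenever the Zarr store is opened with Dask-backed coordinates: reductions such as ds.lon.data.min() then return lazy Dask arrays, and .astype(float) keeps them lazy, so the bbox would hold Dask arrays instead of plain floats and would not serialize into the collection JSON. A minimal sketch of the difference (assuming numpy and dask are installed; the values are made-up stand-ins for ds.lon.data):

    import dask.array as da
    import numpy as np

    # stand-in for a dask-backed coordinate such as ds.lon.data
    lon = da.from_array(np.linspace(-138.7, -57.1, 8), chunks=4)

    lazy_min = lon.min().astype(float)             # still a lazy dask array
    eager_min = lon.min().compute().astype(float)  # a concrete numpy scalar

    print(type(lazy_min))    # <class 'dask.array.core.Array'>
    print(float(eager_min))  # -138.7

The x/y bounds computed in the next hunk can skip .compute() because xarray loads dimension (index) coordinates eagerly into memory.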
@@ -281,17 +265,31 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# list out dataset coordinates\n",
+    "# list out dataset dimensions\n",
+    "# When writing data to Zarr, Xarray sets the _ARRAY_DIMENSIONS attribute on all variables based on the variable dimensions. When reading a Zarr group, Xarray looks for this attribute on all arrays,\n",
+    "# raising an error if it can’t be found.\n",
     "dims = list(ds.dims)\n",
     "print(dims)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "00a18a29-fb9a-4b56-8009-493122997b16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get x, y bounds for extent of those dimensions (required)\n",
+    "xy_bounds = [ds.x.data.min().astype(float), ds.y.data.min().astype(float), ds.x.data.max().astype(float), ds.y.data.max().astype(float)]\n",
+    "print(xy_bounds)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "00a5e041-081d-428d-ac2e-75d16de205e6",
    "metadata": {},
    "source": [
-    "user input needed:"
+    "#### user input needed - you will need to copy all of the dimensions from above into the dict and fill in the appropriate attributes (type, axis, extent):"
    ]
   },
   {
@@ -306,27 +304,16 @@
     "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n",
     "# note that the extent of each dimension should be pulled from the dataset\n",
     "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')]}),\n",
-    "             'lon': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'extent': [coord_bounds[0], coord_bounds[2]]}),\n",
-    "             'lat': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'extent': [coord_bounds[1], coord_bounds[3]]}),\n",
-    "             'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             }\n",
-    "# unresolved questions:\n",
-    "# do we want other fields? https://github.com/stac-extensions/datacube\n",
-    "# are these all z spatial dimension?\n",
-    "# can i have multiple z dimensions on one collection?\n",
-    "# could i also have multiple x,y dimensions\n",
-    "# add descriptions\n",
-    "\n",
-    "# var dims x, y - I used lat, lon here because I thought it would be more intuitive \n",
-    "# but not really dims\n",
-    "# if we use x, y need to use reference_system - how to harvest from crs?\n",
-    "# options: numerical EPSG code, WKT2 (ISO 19162) string or PROJJSON object"
+    "             'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]]}),\n",
+    "             'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'extent': [xy_bounds[1], xy_bounds[3]]}),\n",
+    "             'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),\n",
+    "             'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),\n",
+    "             'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),\n",
+    "             'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x'}),\n",
+    "             'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y'}),\n",
+    "             'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),\n",
+    "             'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),\n",
+    "             }"
    ]
   },
   {
@@ -346,10 +333,17 @@
    "source": [
     "# pull list of vars from dataset\n",
     "vars = list(ds.variables)\n",
+    "# spec says that the keys of cube:dimensions and cube:variables should be unique together; a key like lat should not be both a dimension and a variable.\n",
+    "# we will drop all values in dims from vars\n",
+    "vars = [v for v in vars if v not in dims]\n",
+    "\n",
+    "# Microsoft Planetary Computer includes coordinates and crs as variables here: https://planetarycomputer.microsoft.com/dataset/daymet-annual-na\n",
+    "# we will keep those in the var list\n",
+    "\n",
     "# create dictionary of dataset variables and associated dimensions\n",
     "vars_dict={}\n",
-    "for var in vars:\n",
-    "    vars_dict[var] = pystac.extensions.datacube.Variable({'dimensions':list(ds[var].dims), 'type': 'data'})"
+    "for v in vars:\n",
+    "    vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': 'data'})"
    ]
   },
   {
@@ -387,19 +381,11 @@
    "outputs": [],
    "source": [
     "if catalog.get_child(collection_id):\n",
-    "    collection.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)\n",
+    "    collection.normalize_and_save(root_href=os.path.join(catalog_path, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)\n",
     "else:\n",
     "    catalog.add_child(collection)\n",
     "    catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5264f432-d4ce-4160-a6c9-ed0c45fa6f4b",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
--
GitLab
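With the collection keyed on x/y and the overlapping time/x/y entries dropped from "cube:variables", a short post-merge check that the saved collection still validates, and that the dimension and variable keys are now unique together, could look like this sketch (it assumes pystac is installed with its validation extras and that the catalog has been saved under catalog/):

    import pystac

    collection = pystac.Collection.from_file("catalog/conus404-daily/collection.json")
    collection.validate()  # raises pystac.errors.STACValidationError on schema failures

    # confirm cube:dimensions and cube:variables no longer share keys
    dims = set(collection.extra_fields["cube:dimensions"])
    variables = set(collection.extra_fields["cube:variables"])
    assert not dims & variables, "cube:dimensions and cube:variables keys should be unique together"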