From 0e70e1380a1d5528b326bb9b785caf1f3c86e71f Mon Sep 17 00:00:00 2001
From: amsnyder <asnyder@usgs.gov>
Date: Thu, 2 Feb 2023 21:43:53 -0600
Subject: [PATCH] finalize questions in c404 workflows

---
 catalog/conus404-daily/collection.json        | 51 +++--------
 ..._exploratory_workflow_conus404-daily.ipynb | 90 ++++++++-----------
 2 files changed, 51 insertions(+), 90 deletions(-)

diff --git a/catalog/conus404-daily/collection.json b/catalog/conus404-daily/collection.json
index a681315a..ec33ffcd 100644
--- a/catalog/conus404-daily/collection.json
+++ b/catalog/conus404-daily/collection.json
@@ -26,56 +26,49 @@
         "2021-09-25T00:00:00Z"
       ]
     },
-    "lon": {
+    "x": {
       "type": "spatial",
       "axis": "x",
       "extent": [
-        -138.73135375976562,
-        -57.068634033203125
+        -2732097.901153542,
+        2731902.098846458
       ]
     },
-    "lat": {
+    "y": {
       "type": "spatial",
       "axis": "y",
       "extent": [
-        17.647308349609375,
-        57.34341812133789
+        -2027960.8996368449,
+        2028039.1003631551
       ]
     },
     "bottom_top_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "z"
     },
     "bottom_top": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "z"
     },
     "soil_layers_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "z"
     },
     "x_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "x"
     },
     "y_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "y"
     },
     "snow_layers_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "z"
     },
     "snso_layers_stag": {
       "type": "spatial",
-      "axis": "z",
-      "description": ""
+      "axis": "z"
     }
   },
   "cube:variables": {
@@ -1243,24 +1236,6 @@
         "x"
       ],
       "type": "data"
-    },
-    "time": {
-      "dimensions": [
-        "time"
-      ],
-      "type": "data"
-    },
-    "x": {
-      "dimensions": [
-        "x"
-      ],
-      "type": "data"
-    },
-    "y": {
-      "dimensions": [
-        "y"
-      ],
-      "type": "data"
     }
   },
   "assets": {
diff --git a/workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb b/workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb
index 0d5a275b..3d3b2fb6 100644
--- a/workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb
+++ b/workflows/create_collection_zarr_exploratory_workflow_conus404-daily.ipynb
@@ -12,9 +12,7 @@
     "\n",
     "To simplify this workflow so that it can scale to many datasets, a few simplifying suggestions and assumptions are made:\n",
     "1. For USGS data, we can use the CC0-1.0 license. For all other data we can use Unlicense. Ref: https://spdx.org/licenses/\n",
-    "2. I am assuming all coordinates are from the WGS84 datum if not specified.\n",
-    "\n",
-    "Note that some work needs to be done for this particular dataset's dimension metadata. I need to collect more information the z dataset dimensions. Also, I currently use lon/lat as dimensions, but x/y are also available (and are actually dataset dimensions). I am still deciding if I will use x/y or lon/lat. I will update this workflow when those details are finalized."
+    "2. I am assuming all coordinates are from the WGS84 datum if not specified."
    ]
   },
   {
@@ -115,20 +113,6 @@
     "ds"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f49f6d7e-af4f-471f-9a32-5db7d3b2674b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# plot a map of a single variable\n",
-    "var_to_plot = 'SNOW'\n",
-    "da = ds[var_to_plot].sel(time='2014-03-01 00:00').load()\n",
-    "da.hvplot.quadmesh(x='lon', y='lat', rasterize=True, \n",
-    "                             geo=True, tiles='OSM', alpha=0.7, cmap='turbo')"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "a8c3ed37-8564-400b-a7fb-25bd5e43d21c",
@@ -155,7 +139,7 @@
     "# pull out lat/lon bbox for data\n",
     "# coordiantes must be from WGS 84 datum\n",
     "# left, bottom, right, top\n",
-    "coord_bounds = [ds.lon.data.min().astype(float), ds.lat.data.min().astype(float), ds.lon.data.max().astype(float), ds.lat.data.max().astype(float)]\n",
+    "coord_bounds = [ds.lon.data.min().compute().astype(float), ds.lat.data.min().compute().astype(float), ds.lon.data.max().compute().astype(float), ds.lat.data.max().compute().astype(float)]\n",
     "print(coord_bounds)\n",
     "# create a spatial extent object \n",
     "spatial_extent = pystac.SpatialExtent(bboxes=[coord_bounds])"
@@ -281,17 +265,31 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# list out dataset coordinates\n",
+    "# list out dataset dimensions\n",
+    "# When writing data to Zarr, Xarray sets the _ARRAY_DIMENSIONS attribute on all variables based on the variable dimensions. When reading a Zarr group, Xarray looks for this attribute on all arrays,\n",
+    "# raising an error if it can’t be found.\n",
     "dims = list(ds.dims)\n",
     "print(dims)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "00a18a29-fb9a-4b56-8009-493122997b16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# get x, y bounds for extent of those dimensions (required)\n",
+    "xy_bounds = [ds.x.data.min().astype(float), ds.y.data.min().astype(float), ds.x.data.max().astype(float), ds.y.data.max().astype(float)]\n",
+    "print(xy_bounds)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "00a5e041-081d-428d-ac2e-75d16de205e6",
    "metadata": {},
    "source": [
-    "user input needed:"
+    "#### user input needed - you will need to copy all of the dimensions from above into the dict and fill in the appropriate attributes (type, axis, extent):"
    ]
   },
   {
@@ -306,27 +304,16 @@
     "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n",
     "# note that the extent of each dimension should be pulled from the dataset\n",
     "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')]}),\n",
-    "             'lon': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'extent': [coord_bounds[0], coord_bounds[2]]}),\n",
-    "             'lat': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'extent': [coord_bounds[1], coord_bounds[3]]}),\n",
-    "             'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "             'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': ''}),\n",
-    "            }\n",
-    "# unresolved questions:\n",
-    "# do we want other fields? https://github.com/stac-extensions/datacube\n",
-    "# are these all z spatial dimension?\n",
-    "# can i have multiple z dimensions on one collection?\n",
-    "# could i also have multiple x,y dimensions\n",
-    "# add descriptions\n",
-    "\n",
-    "# var dims x, y - I used lat, lon here because I thought it would be more intuitive \n",
-    "# but not really dims\n",
-    "# if we use x, y need to use reference_system - how to harvest from crs?\n",
-    "# options: numerical EPSG code, WKT2 (ISO 19162) string or PROJJSON object"
+    "             'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]]}),\n",
+    "             'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'extent': [xy_bounds[1], xy_bounds[3]]}),\n",
+    "             'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),\n",
+    "             'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),\n",
+    "             'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),\n",
+    "             'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x'}),\n",
+    "             'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y'}),\n",
+    "             'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),\n",
+    "             'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z'}),\n",
+    "            }"
    ]
   },
   {
@@ -346,10 +333,17 @@
    "source": [
     "# pull list of vars from dataset\n",
     "vars = list(ds.variables)\n",
+    "# spec says that the keys of cube:dimensions and cube:variables should be unique together; a key like lat should not be both a dimension and a variable.\n",
+    "# we will drop all values in dims from vars\n",
+    "vars = [v for v in vars if v not in dims]\n",
+    "\n",
+    "# Microsoft Planetary Computer includes coordinates and crs as variables here: https://planetarycomputer.microsoft.com/dataset/daymet-annual-na\n",
+    "# we will keep those in the var list\n",
+    "\n",
     "# create dictionary of dataset variables and associated dimensions\n",
     "vars_dict={}\n",
-    "for var in vars:\n",
-    "    vars_dict[var] = pystac.extensions.datacube.Variable({'dimensions':list(ds[var].dims), 'type': 'data'})"
+    "for v in vars:\n",
+    "    vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': 'data'})"
    ]
   },
   {
@@ -387,19 +381,11 @@
    "outputs": [],
    "source": [
     "if catalog.get_child(collection_id):\n",
-    "    collection.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)\n",
+    "    collection.normalize_and_save(root_href=os.path.join(catalog_path, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)\n",
     "else:\n",
     "    catalog.add_child(collection)\n",
     "    catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5264f432-d4ce-4160-a6c9-ed0c45fa6f4b",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
-- 
GitLab