diff --git a/workflows/archive/alaska_et_2020_ccsm4_historical_simulation_create_collection_from_zarr.ipynb b/workflows/archive/alaska_et_2020_ccsm4_historical_simulation_create_collection_from_zarr.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..f47db1c427216dcb55483e93a5ce79ddfa8ee32c --- /dev/null +++ b/workflows/archive/alaska_et_2020_ccsm4_historical_simulation_create_collection_from_zarr.ipynb @@ -0,0 +1,760 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6c10e07b-1e60-4926-af1d-fa75dc78e5d4", + "metadata": { + "tags": [] + }, + "source": [ + "# `alaska_et_2020_ccsm4_historical_simulation` Zarr -> Collection Exploratory Workflow\n", + "This is a workflow for transforming the `alaska_et_2020_ccsm4_historical_simulation` zarr dataset into a [STAC collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md). We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.\n", + "\n", + "To simplify this workflow so that it can scale to many datasets, a few simplifying suggestions and assumptions are made:\n", + "1. For USGS data, we can use the CC0-1.0 license. For all other data we can use Unlicense. Ref: https://spdx.org/licenses/\n", + "2. I am assuming all coordinates are from the WGS84 datum if not specified." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "201e0945-de55-45ff-b095-c2af009a4e62", + "metadata": {}, + "outputs": [], + "source": [ + "import pystac\n", + "from pystac.extensions.datacube import CollectionDatacubeExtension, AssetDatacubeExtension, AdditionalDimension, DatacubeExtension\n", + "import xarray as xr\n", + "import cf_xarray\n", + "import os\n", + "import fsspec\n", + "import cf_xarray\n", + "import hvplot.xarray\n", + "import pandas as pd\n", + "import json\n", + "import numpy as np\n", + "import metpy\n", + "import cartopy.crs as ccrs\n", + "import cfunits\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "id": "20b00e88-5a13-46b3-9787-d9ac2d4e7bd6", + "metadata": {}, + "source": [ + "## Open up NHGF STAC Catalog" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "adf6c59d-58cd-48b1-a5fd-3bb205a3ef56", + "metadata": {}, + "outputs": [], + "source": [ + "# define folder location where your STAC catalog json file is\n", + "catalog_path = os.path.join('..', '..', 'catalog')\n", + "# open catalog\n", + "catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))" + ] + }, + { + "cell_type": "markdown", + "id": "996e60ba-13e4-453a-8534-e62ce747f0fa", + "metadata": {}, + "source": [ + "## Collection Metadata Input" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "482d204d-b5b6-40e5-ac42-55b459be1097", + "metadata": {}, + "outputs": [], + "source": [ + "# name for STAC collection\n", + "collection_id = 'alaska_et_2020_ccsm4_historical_simulation'\n", + "# description of STAC collection\n", + "collection_description = 'alaska_et_2020_ccsm4_historical_simulation'\n", + "# license for dataset\n", + "collection_license = 'CC0-1.0'" + ] + }, + { + "cell_type": "markdown", + "id": "116b5837-8e85-4ae7-964a-803533ded714", + "metadata": {}, + "source": [ + "## Asset Metadata Input" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": 
"dd6fa323-132a-4794-8c80-576933f547a0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# url to zarr store that you want to create a collection for\n", + "zarr_url = 's3://mdmf/gdp/alaska_et_2020_ccsm4_historical_simulation.zarr/'\n", + "\n", + "# define keyword arguments needed for opening the dataset with xarray\n", + "# ref: https://github.com/stac-extensions/xarray-assets\n", + "xarray_opendataset_kwargs = {\"xarray:open_kwargs\":{\"chunks\":{},\"engine\":\"zarr\",\"consolidated\":True},\n", + " \"xarray:storage_options\": {\"anon\": True, \"client_kwargs\": {\"endpoint_url\":\"https://usgs.osn.mghpcc.org/\"}}}\n", + "# description for zarr url asset attached to collection (zarr_url)\n", + "asset_description = \"Open Storage Network Pod S3 API access to collection zarr group\"\n", + "# roles to tag zarr url asset with\n", + "asset_roles = [\"data\",\"zarr\",\"s3\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e1441cd4-e94c-4902-af46-8f1af470eb6b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# url to zarr store that you want to create a collection for\n", + "zarr_url2 = 's3://nhgf-development/workspace/DataConversion/alaska_et_2020_ccsm4_historical_simulation.zarr/'\n", + "\n", + "# define keyword arguments needed for opening the dataset with xarray\n", + "# ref: https://github.com/stac-extensions/xarray-assets\n", + "xarray_opendataset_kwargs2 = {\"xarray:open_kwargs\":{\"chunks\":{},\"engine\":\"zarr\",\"consolidated\":True},\n", + " \"xarray:storage_options\":{\"requester_pays\":True}}\n", + "# description for zarr url asset attached to collection (zarr_url)\n", + "asset_description2 = \"S3 access to collection zarr group\"\n", + "# roles to tag zarr url asset with\n", + "asset_roles2 = [\"data\",\"zarr\",\"s3\"]" + ] + }, + { + "cell_type": "markdown", + "id": "b213b74f-ad17-4774-93b6-3b62be616b45", + "metadata": { + "tags": [] + }, + "source": [ + "## Data Exploration" + ] + }, + { + 
"cell_type": "code", + "execution_count": 5, + "id": "708f2cf5-79ab-49af-8067-de31d0d13ee6", + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "'.zmetadata'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNoSuchBucket\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/s3fs/core.py:113\u001b[0m, in \u001b[0;36m_error_wrapper\u001b[0;34m(func, args, kwargs, retries)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 113\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m S3_RETRYABLE_ERRORS \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/aiobotocore/client.py:371\u001b[0m, in \u001b[0;36mAioBaseClient._make_api_call\u001b[0;34m(self, operation_name, api_params)\u001b[0m\n\u001b[1;32m 370\u001b[0m error_class \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mfrom_code(error_code)\n\u001b[0;32m--> 371\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m error_class(parsed_response, operation_name)\n\u001b[1;32m 372\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mNoSuchBucket\u001b[0m: An error occurred (NoSuchBucket) when calling the GetObject operation: Unknown", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "File 
\u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/fsspec/mapping.py:143\u001b[0m, in \u001b[0;36mFSMap.__getitem__\u001b[0;34m(self, key, default)\u001b[0m\n\u001b[1;32m 142\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 143\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcat\u001b[49m\u001b[43m(\u001b[49m\u001b[43mk\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mmissing_exceptions:\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/fsspec/asyn.py:121\u001b[0m, in \u001b[0;36msync_wrapper.<locals>.wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;28mself\u001b[39m \u001b[38;5;241m=\u001b[39m obj \u001b[38;5;129;01mor\u001b[39;00m args[\u001b[38;5;241m0\u001b[39m]\n\u001b[0;32m--> 121\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/fsspec/asyn.py:106\u001b[0m, in \u001b[0;36msync\u001b[0;34m(loop, func, timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m 105\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(return_result, \u001b[38;5;167;01mBaseException\u001b[39;00m):\n\u001b[0;32m--> 
106\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m return_result\n\u001b[1;32m 107\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/fsspec/asyn.py:61\u001b[0m, in \u001b[0;36m_runner\u001b[0;34m(event, coro, result, timeout)\u001b[0m\n\u001b[1;32m 60\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m---> 61\u001b[0m result[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m coro\n\u001b[1;32m 62\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m ex:\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/fsspec/asyn.py:433\u001b[0m, in \u001b[0;36mAsyncFileSystem._cat\u001b[0;34m(self, path, recursive, on_error, batch_size, **kwargs)\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ex:\n\u001b[0;32m--> 433\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m ex\n\u001b[1;32m 434\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 435\u001b[0m \u001b[38;5;28mlen\u001b[39m(paths) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m1\u001b[39m\n\u001b[1;32m 436\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(path, \u001b[38;5;28mlist\u001b[39m)\n\u001b[1;32m 437\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m paths[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m!=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_strip_protocol(path)\n\u001b[1;32m 438\u001b[0m ):\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/asyncio/tasks.py:408\u001b[0m, in \u001b[0;36mwait_for\u001b[0;34m(fut, timeout)\u001b[0m\n\u001b[1;32m 407\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 408\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m fut\n\u001b[1;32m 410\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/s3fs/core.py:1071\u001b[0m, in \u001b[0;36mS3FileSystem._cat_file\u001b[0;34m(self, path, version_id, start, end)\u001b[0m\n\u001b[1;32m 1069\u001b[0m resp[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBody\u001b[39m\u001b[38;5;124m\"\u001b[39m]\u001b[38;5;241m.\u001b[39mclose()\n\u001b[0;32m-> 1071\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m _error_wrapper(_call_and_read, retries\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mretries)\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/s3fs/core.py:140\u001b[0m, in \u001b[0;36m_error_wrapper\u001b[0;34m(func, args, kwargs, retries)\u001b[0m\n\u001b[1;32m 139\u001b[0m err \u001b[38;5;241m=\u001b[39m translate_boto_error(err)\n\u001b[0;32m--> 140\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/s3fs/core.py:113\u001b[0m, in \u001b[0;36m_error_wrapper\u001b[0;34m(func, args, kwargs, retries)\u001b[0m\n\u001b[1;32m 112\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 113\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 114\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m S3_RETRYABLE_ERRORS \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/s3fs/core.py:1058\u001b[0m, in 
\u001b[0;36mS3FileSystem._cat_file.<locals>._call_and_read\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1057\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_call_and_read\u001b[39m():\n\u001b[0;32m-> 1058\u001b[0m resp \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call_s3(\n\u001b[1;32m 1059\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mget_object\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 1060\u001b[0m Bucket\u001b[38;5;241m=\u001b[39mbucket,\n\u001b[1;32m 1061\u001b[0m Key\u001b[38;5;241m=\u001b[39mkey,\n\u001b[1;32m 1062\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mversion_id_kw(version_id \u001b[38;5;129;01mor\u001b[39;00m vers),\n\u001b[1;32m 1063\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mhead,\n\u001b[1;32m 1064\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mreq_kw,\n\u001b[1;32m 1065\u001b[0m )\n\u001b[1;32m 1066\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/s3fs/core.py:348\u001b[0m, in \u001b[0;36mS3FileSystem._call_s3\u001b[0;34m(self, method, *akwarglist, **kwargs)\u001b[0m\n\u001b[1;32m 347\u001b[0m additional_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_s3_method_kwargs(method, \u001b[38;5;241m*\u001b[39makwarglist, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m--> 348\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m _error_wrapper(\n\u001b[1;32m 349\u001b[0m method, kwargs\u001b[38;5;241m=\u001b[39madditional_kwargs, retries\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mretries\n\u001b[1;32m 350\u001b[0m )\n", + "File 
\u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/s3fs/core.py:140\u001b[0m, in \u001b[0;36m_error_wrapper\u001b[0;34m(func, args, kwargs, retries)\u001b[0m\n\u001b[1;32m 139\u001b[0m err \u001b[38;5;241m=\u001b[39m translate_boto_error(err)\n\u001b[0;32m--> 140\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m err\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: An error occurred (NoSuchBucket) when calling the GetObject operation: Unknown", + "\nDuring handling of the above exception, another exception occurred:\n", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[5], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# open and view zarr dataset\u001b[39;00m\n\u001b[1;32m 2\u001b[0m fs2 \u001b[38;5;241m=\u001b[39m fsspec\u001b[38;5;241m.\u001b[39mfilesystem(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124ms3\u001b[39m\u001b[38;5;124m'\u001b[39m, anon\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, endpoint_url\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhttps://usgs.osn.mghpcc.org/\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[0;32m----> 3\u001b[0m ds \u001b[38;5;241m=\u001b[39m \u001b[43mxr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfs2\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_mapper\u001b[49m\u001b[43m(\u001b[49m\u001b[43mzarr_url2\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mengine\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mzarr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mbackend_kwargs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mconsolidated\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m:\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m}\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mchunks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 5\u001b[0m ds\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/xarray/backends/api.py:539\u001b[0m, in \u001b[0;36mopen_dataset\u001b[0;34m(filename_or_obj, engine, chunks, cache, decode_cf, mask_and_scale, decode_times, decode_timedelta, use_cftime, concat_characters, decode_coords, drop_variables, inline_array, backend_kwargs, **kwargs)\u001b[0m\n\u001b[1;32m 527\u001b[0m decoders \u001b[38;5;241m=\u001b[39m _resolve_decoders_kwargs(\n\u001b[1;32m 528\u001b[0m decode_cf,\n\u001b[1;32m 529\u001b[0m open_backend_dataset_parameters\u001b[38;5;241m=\u001b[39mbackend\u001b[38;5;241m.\u001b[39mopen_dataset_parameters,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 535\u001b[0m decode_coords\u001b[38;5;241m=\u001b[39mdecode_coords,\n\u001b[1;32m 536\u001b[0m )\n\u001b[1;32m 538\u001b[0m overwrite_encoded_chunks \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124moverwrite_encoded_chunks\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m--> 539\u001b[0m backend_ds \u001b[38;5;241m=\u001b[39m \u001b[43mbackend\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen_dataset\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 540\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename_or_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 541\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mdrop_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdrop_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 542\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mdecoders\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 543\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 544\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 545\u001b[0m ds \u001b[38;5;241m=\u001b[39m _dataset_from_backend_dataset(\n\u001b[1;32m 546\u001b[0m backend_ds,\n\u001b[1;32m 547\u001b[0m filename_or_obj,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 555\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 556\u001b[0m )\n\u001b[1;32m 557\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m ds\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/xarray/backends/zarr.py:840\u001b[0m, in \u001b[0;36mZarrBackendEntrypoint.open_dataset\u001b[0;34m(self, filename_or_obj, mask_and_scale, decode_times, concat_characters, decode_coords, drop_variables, use_cftime, decode_timedelta, group, mode, synchronizer, consolidated, chunk_store, storage_options, stacklevel)\u001b[0m\n\u001b[1;32m 820\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mopen_dataset\u001b[39m(\n\u001b[1;32m 821\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 822\u001b[0m filename_or_obj,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 836\u001b[0m stacklevel\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m3\u001b[39m,\n\u001b[1;32m 837\u001b[0m ):\n\u001b[1;32m 839\u001b[0m filename_or_obj \u001b[38;5;241m=\u001b[39m _normalize_path(filename_or_obj)\n\u001b[0;32m--> 840\u001b[0m store \u001b[38;5;241m=\u001b[39m 
\u001b[43mZarrStore\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen_group\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 841\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilename_or_obj\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 842\u001b[0m \u001b[43m \u001b[49m\u001b[43mgroup\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 843\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 844\u001b[0m \u001b[43m \u001b[49m\u001b[43msynchronizer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msynchronizer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 845\u001b[0m \u001b[43m \u001b[49m\u001b[43mconsolidated\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mconsolidated\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 846\u001b[0m \u001b[43m \u001b[49m\u001b[43mconsolidate_on_close\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 847\u001b[0m \u001b[43m \u001b[49m\u001b[43mchunk_store\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mchunk_store\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 848\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 849\u001b[0m \u001b[43m \u001b[49m\u001b[43mstacklevel\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstacklevel\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 850\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 852\u001b[0m store_entrypoint \u001b[38;5;241m=\u001b[39m StoreBackendEntrypoint()\n\u001b[1;32m 853\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m close_on_error(store):\n", + "File 
\u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/xarray/backends/zarr.py:407\u001b[0m, in \u001b[0;36mZarrStore.open_group\u001b[0;34m(cls, store, mode, synchronizer, group, consolidated, consolidate_on_close, chunk_store, storage_options, append_dim, write_region, safe_chunks, stacklevel)\u001b[0m\n\u001b[1;32m 404\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNo such file or directory: \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstore\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 405\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m consolidated:\n\u001b[1;32m 406\u001b[0m \u001b[38;5;66;03m# TODO: an option to pass the metadata_key keyword\u001b[39;00m\n\u001b[0;32m--> 407\u001b[0m zarr_group \u001b[38;5;241m=\u001b[39m \u001b[43mzarr\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen_consolidated\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstore\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mopen_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 408\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 409\u001b[0m zarr_group \u001b[38;5;241m=\u001b[39m zarr\u001b[38;5;241m.\u001b[39mopen_group(store, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mopen_kwargs)\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/zarr/convenience.py:1300\u001b[0m, in \u001b[0;36mopen_consolidated\u001b[0;34m(store, metadata_key, mode, **kwargs)\u001b[0m\n\u001b[1;32m 1297\u001b[0m metadata_key \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mmeta/root/consolidated/\u001b[39m\u001b[38;5;124m'\u001b[39m \u001b[38;5;241m+\u001b[39m metadata_key\n\u001b[1;32m 
1299\u001b[0m \u001b[38;5;66;03m# setup metadata store\u001b[39;00m\n\u001b[0;32m-> 1300\u001b[0m meta_store \u001b[38;5;241m=\u001b[39m \u001b[43mConsolidatedStoreClass\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstore\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata_key\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1302\u001b[0m \u001b[38;5;66;03m# pass through\u001b[39;00m\n\u001b[1;32m 1303\u001b[0m chunk_store \u001b[38;5;241m=\u001b[39m kwargs\u001b[38;5;241m.\u001b[39mpop(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mchunk_store\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;129;01mor\u001b[39;00m store\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/zarr/storage.py:2861\u001b[0m, in \u001b[0;36mConsolidatedMetadataStore.__init__\u001b[0;34m(self, store, metadata_key)\u001b[0m\n\u001b[1;32m 2858\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstore \u001b[38;5;241m=\u001b[39m Store\u001b[38;5;241m.\u001b[39m_ensure_store(store)\n\u001b[1;32m 2860\u001b[0m \u001b[38;5;66;03m# retrieve consolidated metadata\u001b[39;00m\n\u001b[0;32m-> 2861\u001b[0m meta \u001b[38;5;241m=\u001b[39m json_loads(\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstore\u001b[49m\u001b[43m[\u001b[49m\u001b[43mmetadata_key\u001b[49m\u001b[43m]\u001b[49m)\n\u001b[1;32m 2863\u001b[0m \u001b[38;5;66;03m# check format of consolidated metadata\u001b[39;00m\n\u001b[1;32m 2864\u001b[0m consolidated_format \u001b[38;5;241m=\u001b[39m meta\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mzarr_consolidated_format\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m)\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/zarr/storage.py:724\u001b[0m, in 
\u001b[0;36mKVStore.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 723\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__getitem__\u001b[39m(\u001b[38;5;28mself\u001b[39m, key):\n\u001b[0;32m--> 724\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_mutable_mapping\u001b[49m\u001b[43m[\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m]\u001b[49m\n", + "File \u001b[0;32m/caldera/projects/usgs/water/impd/hytest/conda/envs/hytest/lib/python3.10/site-packages/fsspec/mapping.py:147\u001b[0m, in \u001b[0;36mFSMap.__getitem__\u001b[0;34m(self, key, default)\u001b[0m\n\u001b[1;32m 145\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m default \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 146\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m default\n\u001b[0;32m--> 147\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(key)\n\u001b[1;32m 148\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "\u001b[0;31mKeyError\u001b[0m: '.zmetadata'" + ] + } + ], + "source": [ + "# open and view zarr dataset\n", + "fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')\n", + "ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr', \n", + " backend_kwargs={'consolidated':True}, chunks={})\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "id": "0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc", + "metadata": {}, + "source": [ + "## Identify x, y, t dimensions of dataset\n", + "May require user input if dimensions cannot be auto-detected." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab91268f-7200-4cb1-979a-c7d75531d2c0", + "metadata": {}, + "outputs": [], + "source": [ + "dims_auto_extract = ['X', 'Y', 'T']\n", + "def extract_dim(ds, d):\n", + " try:\n", + " dim_list = ds.cf.axes[d]\n", + " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", + " dim = dim_list[0]\n", + " except KeyError:\n", + " print(f\"Could not auto-extract {d} dimension name.\")\n", + " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", + " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", + " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", + " print(f\"name of {d} dimension: {dim}\\n\")\n", + " return dim\n", + "\n", + "dim_names_dict = {}\n", + "for d in dims_auto_extract:\n", + " dim_names_dict[d] = extract_dim(ds, d)\n", + "print(f\"Dimension dictionary: {dim_names_dict}\")" + ] + }, + { + "cell_type": "markdown", + "id": "8fbfecfb-9886-4d06-a34c-6471cb0a6053", + "metadata": {}, + "source": [ + "## Plot a map" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4eb4d027-4266-4a0b-8f16-bacfbef06242", + "metadata": {}, + "outputs": [], + "source": [ + "# plot a map of a single variable\n", + "var_to_plot = 'SNOW'\n", + "da = ds[var_to_plot].sel(time='2014-03-01 00:00').load()\n", + "da.hvplot.quadmesh(x='lon', y='lat', rasterize=True,\n", + " geo=True, tiles='OSM', alpha=0.7, cmap='turbo')" + ] + }, + { + "cell_type": "markdown", + "id": "5e057a6c-06fb-4406-823b-e81c58e520e4", + "metadata": {}, + "source": [ + "## Plot a time series at a specific point\n", + "This can help you verify a variable's values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7de2681-88c2-4597-857c-8f169c596f8b", + "metadata": {}, + "outputs": [], + "source": [ + "# enter lat, lon of point you want to plot time series for\n", + "lat,lon = 
39.978322,-105.2772194\n", + "time_start = '2013-01-01 00:00'\n", + "time_end = '2013-12-31 00:00'\n", + "ds = ds.metpy.parse_cf()\n", + "crs = ds[var_to_plot].metpy.cartopy_crs\n", + "x, y = crs.transform_point(lon, lat, src_crs=ccrs.PlateCarree()) # PlateCarree = Lat,Lon\n", + "da = ds[var_to_plot].sel(x=x, y=y, method='nearest').sel(time=slice(time_start,time_end)).load()\n", + "da.hvplot(x=dim_names_dict['T'], grid=True)" + ] + }, + { + "cell_type": "markdown", + "id": "a8c3ed37-8564-400b-a7fb-25bd5e43d21c", + "metadata": {}, + "source": [ + "## Create Collection Extent" + ] + }, + { + "cell_type": "markdown", + "id": "69f0d837-68a5-4fed-9a14-5d75cfbb0da4", + "metadata": {}, + "source": [ + "### Spatial Extent" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d46805e0-8e94-4ebe-aa01-d9a2d7051459", + "metadata": {}, + "outputs": [], + "source": [ + "# pull out lat/lon bbox for data\n", + "# coordinates must be from WGS 84 datum\n", + "# left, bottom, right, top\n", + "# Note: I'm not sure why but half the time you need to run .compute() to get the value and the other half\n", + "# you shouldn't - I've included both options below - switch the commented line if you have this issue\n", + "#coord_bounds = [ds.lon.data.min().compute().astype(float), ds.lat.data.min().compute().astype(float), ds.lon.data.max().compute().astype(float), ds.lat.data.max().compute().astype(float)]\n", + "coord_bounds = [ds.lon.data.min().astype(float).item(), ds.lat.data.min().astype(float).item(), ds.lon.data.max().astype(float).item(), ds.lat.data.max().astype(float).item()]\n", + "print(coord_bounds)\n", + "# create a spatial extent object \n", + "spatial_extent = pystac.SpatialExtent(bboxes=[coord_bounds])" + ] + }, + { + "cell_type": "markdown", + "id": "a04c8fca-1d33-43ac-9e2b-62d7be2887f7", + "metadata": {}, + "source": [ + "### Temporal Extent" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41a84995-867c-4152-8c57-85e3758bbb77", + 
"metadata": {}, + "outputs": [], + "source": [ + "# pull out first and last timestamps\n", + "temporal_extent_lower = pd.Timestamp(ds[dim_names_dict['T']].data.min())\n", + "temporal_extent_upper = pd.Timestamp(ds[dim_names_dict['T']].data.max())\n", + "print(f'min: {temporal_extent_lower} \\nmax: {temporal_extent_upper}')\n", + "# create a temporal extent object\n", + "temporal_extent = pystac.TemporalExtent(intervals=[[temporal_extent_lower, temporal_extent_upper]])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b1e37c4-5348-46ad-abc9-e005b5d6c02b", + "metadata": {}, + "outputs": [], + "source": [ + "collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)" + ] + }, + { + "cell_type": "markdown", + "id": "cfb71202-03df-45b5-ac2f-0dc2ee1ab780", + "metadata": {}, + "source": [ + "## Create pystac collection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7e96811b-95ae-406a-9728-55fc429d4e1f", + "metadata": {}, + "outputs": [], + "source": [ + "if catalog.get_child(collection_id):\n", + " collection = catalog.get_child(collection_id)\n", + " print(\"existing collection opened\")\n", + " collection.extent=collection_extent\n", + " collection.description=collection_description\n", + " collection.license=collection_license\n", + "else:\n", + " collection = pystac.Collection(id=collection_id,\n", + " description=collection_description,\n", + " extent=collection_extent,\n", + " license=collection_license)\n", + " print(\"new collection created\")" + ] + }, + { + "cell_type": "markdown", + "id": "a21c76e8-cd57-4eb5-a33f-7c668a3b3205", + "metadata": {}, + "source": [ + "## Add zarr url asset to collection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "094832af-d22b-4359-b0f6-cf687acce5cc", + "metadata": {}, + "outputs": [], + "source": [ + "asset_id = \"zarr-s3-osn\"\n", + "asset = pystac.Asset(href=zarr_url,\n", + " description=asset_description,\n", + " 
media_type=\"application/vnd+zarr\",\n", + " roles=asset_roles,\n", + " extra_fields = xarray_opendataset_kwargs)\n", + "collection.add_asset(asset_id, asset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c298d07-f234-4a08-986d-87f4a39e9ae6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "asset_id2 = \"zarr-s3\"\n", + "asset2 = pystac.Asset(href=zarr_url2,\n", + " description=asset_description2,\n", + " media_type=\"application/vnd+zarr\",\n", + " roles=asset_roles2,\n", + " extra_fields = xarray_opendataset_kwargs2)\n", + "collection.add_asset(asset_id2, asset2)" + ] + }, + { + "cell_type": "markdown", + "id": "f67cd5c9-db33-45c2-bc21-480cd67354f4", + "metadata": {}, + "source": [ + "## Add datacube extension to collection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fc00946d-2880-491d-9b3b-3aeeb4414d6c", + "metadata": {}, + "outputs": [], + "source": [ + "# instantiate extension on collection\n", + "dc = DatacubeExtension.ext(collection, add_if_missing=True)" + ] + }, + { + "cell_type": "markdown", + "id": "8bdd77a2-7587-485e-afb7-42af3a822241", + "metadata": {}, + "source": [ + "### Add cube dimensions (required field for extension)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "120a4914-3302-44a5-a282-0308ac84f040", + "metadata": {}, + "outputs": [], + "source": [ + "# list out dataset dimensions\n", + "# When writing data to Zarr, Xarray sets the _ARRAY_DIMENSIONS attribute on all variables based on the variable dimensions. 
When reading a Zarr group, Xarray looks for this attribute on all arrays,\n", + "# raising an error if it can’t be found.\n", + "dims = list(ds.dims)\n", + "print(dims)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "00a18a29-fb9a-4b56-8009-493122997b16", + "metadata": {}, + "outputs": [], + "source": [ + "# get x, y bounds for extent of those dimensions (required)\n", + "xy_bounds = [ds[dim_names_dict['X']].data.min().astype(float).item(), ds[dim_names_dict['Y']].data.min().astype(float).item(), ds[dim_names_dict['X']].data.max().astype(float).item(), ds[dim_names_dict['Y']].data.max().astype(float).item()]\n", + "print(xy_bounds)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", + "metadata": {}, + "outputs": [], + "source": [ + "def get_step(dim_name):\n", + "    \"\"\"Return the spacing between consecutive values of ds[dim_name] as a float, or None if the spacing is not constant.\"\"\"\n", + "    dim_vals = ds[dim_name].values\n", + "    # successive differences, computed without a Python-level loop\n", + "    unique_steps = np.unique(np.diff(dim_vals))\n", + "    # datacube spec specifies to use null for irregularly spaced steps\n", + "    if len(unique_steps) == 1:\n", + "        return unique_steps[0].astype(float).item()\n", + "    return None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", + "metadata": {}, + "outputs": [], + "source": [ + "def get_long_name(ds, v):\n", + "    \"\"\"Return the long_name attribute of variable v in ds, or None when it cannot be looked up.\"\"\"\n", + "    try:\n", + "        return ds[v].attrs['long_name']\n", + "    except KeyError:\n", + "        # v is not in ds, or it carries no long_name attribute - leave empty\n", + "        return None" + ] + }, + { + "cell_type": "markdown", + "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", + "metadata": {}, + "source": [ + "#### user input needed - you will need to look at the crs information and create a cartopy crs object after identifying the projection type:\n", + "reference list of cartopy projections: 
https://scitools.org.uk/cartopy/docs/latest/reference/projections.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea452f62-5644-49b6-8a4e-7dc4f649fd1a", + "metadata": {}, + "outputs": [], + "source": [ + "# print out crs information in dataset\n", + "crs_info = ds.crs\n", + "print(crs_info)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1b1d05ff-8e43-44a7-8343-178b112c4ad6", + "metadata": {}, + "outputs": [], + "source": [ + "# create the appropriate cartopy projection\n", + "lcc = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian, \n", + " central_latitude=crs_info.latitude_of_projection_origin,\n", + " standard_parallels=crs_info.standard_parallel)\n", + "# the datacube extension can accept reference_system information as a numerical EPSG code, \n", + "# WKT2 (ISO 19162) string or PROJJSON object.\n", + "# we will use a projjson, as was done by Microsoft Planetary Computer here:\n", + "# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na\n", + "# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na\n", + "projjson = json.loads(lcc.to_json())" + ] + }, + { + "cell_type": "markdown", + "id": "00a5e041-081d-428d-ac2e-75d16de205e6", + "metadata": {}, + "source": [ + "#### user input needed - you will need to copy all of the dimensions from above into the dict and fill in the appropriate attributes (type, axis, extent):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5a443497-67a9-4dce-a8e9-b08d31a88223", + "metadata": {}, + "outputs": [], + "source": [ + "# create a dictionary of datacube dimensions you would like to assign to this dataset\n", + "# dimension name should come from the coordinates printed above\n", + "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", + "# note that the extent of each dimension should be pulled from the dataset\n", + "# timestamps are formatted with explicit %H:%M:%S because %X is locale-dependent\n", + "time_fmt = '%Y-%m-%dT%H:%M:%SZ'\n", + "# get_step returns None for irregular spacing; keep None instead of passing it to pd.Timedelta\n", + "time_step = get_step('time')\n", + "time_step_iso = pd.Timedelta(time_step).isoformat() if time_step is not None else None\n", + "dims_dict = {'time': 
pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime(time_fmt), temporal_extent_upper.strftime(time_fmt)], 'step': time_step_iso}),\n", + " 'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': get_step('x'), 'reference_system': projjson}),\n", + " 'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': get_step('y'), 'reference_system': projjson}),\n", + " 'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top_stag')}),\n", + " 'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top')}),\n", + " 'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'soil_layers_stag')}),\n", + " 'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, 'x_stag')}),\n", + " 'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y_stag')}),\n", + " 'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snow_layers_stag')}),\n", + " 'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snso_layers_stag')}),\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "0f277883-a3fd-425f-966a-ca2140d0ef2f", + "metadata": {}, + "source": [ + "### Add cube variables (optional field for extension)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"92510876-7853-4d24-8563-c69f9012aeb6", + "metadata": {}, + "outputs": [], + "source": [ + "# define functions to pull out datacube attributes and validate format\n", + "def get_unit(ds, v):\n", + "    \"\"\"Return the units attribute of variable v in ds (None if it cannot be looked up), prompting for a replacement when building cfunits.Units from it fails.\"\"\"\n", + "    # check if unit is defined for variable\n", + "    try:\n", + "        unit = ds[v].attrs['units']\n", + "    except KeyError:\n", + "        # v is not in ds, or it has no units attribute\n", + "        unit = None\n", + "    # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", + "    # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", + "    # gdptools expects this format as well\n", + "    # NOTE(review): the value of .isvalid is not inspected here, so the prompt only runs when\n", + "    # building the Units object raises - confirm whether a False .isvalid should also prompt\n", + "    try:\n", + "        cfunits.Units(unit).isvalid\n", + "    except Exception:\n", + "        print(\"Unit is not valid as a UD unit.\")\n", + "        unit = str(input(f\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", + "        assert cfunits.Units(unit).isvalid\n", + "    return unit\n", + "\n", + "def get_var_type(ds, v):\n", + "    \"\"\"Return 'auxiliary' when v is one of ds.coords, otherwise 'data'.\"\"\"\n", + "    if v in ds.coords:\n", + "        # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", + "        # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", + "        # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", + "        var_type = 'auxiliary'\n", + "    # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", + "    else:\n", + "        var_type = 'data'\n", + "    return var_type" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9272931-fc0b-4f2a-9546-283033e9cde8", + "metadata": {}, + "outputs": [], + "source": [ + "# drop metpy_crs coordinate we have added\n", + "# (do this before listing the variables, otherwise 'metpy_crs' stays in the list and ds[v] fails in the loop below)\n", + "if 'metpy_crs' in ds.coords:\n", + "    ds = ds.drop_vars('metpy_crs')\n", + "\n", + "# pull list of vars from dataset\n", + "vars = list(ds.variables)\n", + "\n", + "# spec says that the keys of cube:dimensions and cube:variables should be unique together; a 
key like lat should not be both a dimension and a variable.\n", + "# we will drop all values in dims from vars\n", + "vars = [v for v in vars if v not in dims]\n", + "\n", + "# Microsoft Planetary Computer includes coordinates and crs as variables here:\n", + "# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na\n", + "# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na\n", + "# we will keep those in the var list\n", + "\n", + "# create dictionary of dataset variables and associated dimensions\n", + "vars_dict={}\n", + "for v in vars:\n", + " unit = get_unit(ds, v)\n", + " var_type = get_var_type(ds, v)\n", + " long_name = get_long_name(ds, v)\n", + " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" + ] + }, + { + "cell_type": "markdown", + "id": "11ad5352-884c-4472-8864-4570a96f66e5", + "metadata": {}, + "source": [ + "### Finalize extension" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10141fd4-91d6-491d-878b-02653720891d", + "metadata": {}, + "outputs": [], + "source": [ + "# add dimensions and variables to collection extension\n", + "dc.apply(dimensions=dims_dict, variables=vars_dict)" + ] + }, + { + "cell_type": "markdown", + "id": "615ca168-75fb-4135-9941-0ef5fe4fd1cb", + "metadata": {}, + "source": [ + "## Add STAC Collection to Catalog and Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2120a55-3d04-4122-a93f-29afcdb8cb1b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# # helper to find items of wrong type\n", + "# d = collection.to_dict()\n", + "# def find_paths(nested_dict, prepath=()):\n", + "# for k, v in nested_dict.items():\n", + "# try:\n", + "# path = prepath + (k,)\n", + "# if type(v) is np.float64: # found value\n", + "# yield path\n", + "# elif hasattr(v, 'items'): # v is a dict\n", + "# yield from find_paths(v, path) \n", + "# 
except:\n", + "# print(prepath)\n", + "\n", + "# print(*find_paths(d))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b75791b-6b2d-40be-b7c6-330a60888fb5", + "metadata": {}, + "outputs": [], + "source": [ + "if catalog.get_child(collection_id):\n", + " collection.normalize_and_save(root_href=os.path.join(catalog_path, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)\n", + "else:\n", + " catalog.add_child(collection)\n", + " catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6f676b5-e892-4bfb-8d73-2828addd838c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "conda:hytest", + "language": "python", + "name": "conda-hytest" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/workflows/archive/create_collection_from_zarr_conus404-daily.ipynb b/workflows/archive/conus404-daily_create_collection_from_zarr.ipynb similarity index 100% rename from workflows/archive/create_collection_from_zarr_conus404-daily.ipynb rename to workflows/archive/conus404-daily_create_collection_from_zarr.ipynb