diff --git a/catalog/conus404-daily/collection.json b/catalog/conus404-daily/collection.json index c1243093bf70840c78875f849fb734c875b670c7..504dd7a16c9768c5729f649762b9d86f2ad23674 100644 --- a/catalog/conus404-daily/collection.json +++ b/catalog/conus404-daily/collection.json @@ -1843,4 +1843,4 @@ ] } } -} \ No newline at end of file +} diff --git a/workflows/archive/CooperMcKenzie_create_collection_from_zarr.ipynb b/workflows/archive/CooperMcKenzie_create_collection_from_zarr.ipynb index ce1bb6cce17d1cf5f14acf14aa9075567534a25c..e2de17a71fbba156904c02bb529f9a8a2b0bbeb1 100644 --- a/workflows/archive/CooperMcKenzie_create_collection_from_zarr.ipynb +++ b/workflows/archive/CooperMcKenzie_create_collection_from_zarr.ipynb @@ -192,22 +192,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -451,59 +438,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -570,7 +504,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -582,7 +516,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -614,7 +548,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -627,7 +561,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -660,7 +594,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -673,7 +607,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -724,16 +658,9 @@ "# dimension name should come from the coordinates printed above\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': time_step}),\n", - " 'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " 'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top_stag')}),\n", - " 'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top')}),\n", - " 'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'soil_layers_stag')}),\n", - " 'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, 'x_stag')}),\n", - " 'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y_stag')}),\n", - " 'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snow_layers_stag')}),\n", - " 'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snso_layers_stag')}),\n", + "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': time_step}),\n", + " 'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': stac_helpers.get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " 'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", " }" ] }, @@ -745,43 +672,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -808,9 +698,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -852,18 +742,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/LOCA_future_create_collection_from_zarr.ipynb b/workflows/archive/LOCA_future_create_collection_from_zarr.ipynb index 5aab9551ae951aee3808e11a59107397dad04ddb..3fe3fee9adcde53fc2d97f293c5809e5c15841ab 100644 --- a/workflows/archive/LOCA_future_create_collection_from_zarr.ipynb +++ b/workflows/archive/LOCA_future_create_collection_from_zarr.ipynb @@ -192,22 +192,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -440,59 +427,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -559,7 +493,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -571,7 +505,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -603,7 +537,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -616,7 +550,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -649,7 +583,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -662,7 +596,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -717,10 +651,10 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", " }" ] }, @@ -732,43 +666,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -795,9 +692,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -839,18 +736,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/LOCA_historical_create_collection_from_zarr.ipynb b/workflows/archive/LOCA_historical_create_collection_from_zarr.ipynb index b0959c20d34c54abff3cf390b3789810f95ef202..b9c9e601b82cf97d6d66af0721d7ec66cfcafa38 100644 --- a/workflows/archive/LOCA_historical_create_collection_from_zarr.ipynb +++ b/workflows/archive/LOCA_historical_create_collection_from_zarr.ipynb @@ -192,22 +192,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -440,59 +427,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -559,7 +493,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -571,7 +505,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -603,7 +537,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -616,7 +550,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -649,7 +583,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -662,7 +596,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -717,10 +651,10 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", " }" ] }, @@ -732,43 +666,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -795,9 +692,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -839,18 +736,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/Red_River_MPI-ESM-LR_future_create_collection_from_zarr.ipynb b/workflows/archive/Red_River_MPI-ESM-LR_future_create_collection_from_zarr.ipynb index 0353d4a35d0a6c40629d0473bd568c7f09b7bab6..02374c828d157e89efd198d05db28882806ee618 100644 --- a/workflows/archive/Red_River_MPI-ESM-LR_future_create_collection_from_zarr.ipynb +++ b/workflows/archive/Red_River_MPI-ESM-LR_future_create_collection_from_zarr.ipynb @@ -192,22 +192,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -445,59 +432,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -564,7 +498,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -576,7 +510,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -610,7 +544,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'], round_dec=13)\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=13)\n", "print(f'x step: {x_step}')" ] }, @@ -623,7 +557,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -658,7 +592,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'], round_dec=13)\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=13)\n", "print(f'y step: {y_step}')" ] }, @@ -671,7 +605,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -726,10 +660,10 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", " }" ] }, @@ -741,43 +675,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -804,9 +701,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -848,18 +745,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/Red_River_MPI-ESM-LR_historical_create_collection_from_zarr.ipynb b/workflows/archive/Red_River_MPI-ESM-LR_historical_create_collection_from_zarr.ipynb index 00825081e48c6bc1c0bbc1e4a3958efa60a63a2f..210edd5ab44e978073d0f77c4a0e28c5993ad5e3 100644 --- a/workflows/archive/Red_River_MPI-ESM-LR_historical_create_collection_from_zarr.ipynb +++ b/workflows/archive/Red_River_MPI-ESM-LR_historical_create_collection_from_zarr.ipynb @@ -192,22 +192,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -445,59 +432,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -564,7 +498,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -576,7 +510,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -610,7 +544,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'], round_dec=13)\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=13)\n", "print(f'x step: {x_step}')" ] }, @@ -623,7 +557,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -658,7 +592,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'], round_dec=13)\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=13)\n", "print(f'y step: {y_step}')" ] }, @@ -671,7 +605,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -726,10 +660,10 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", " }" ] }, @@ -741,43 +675,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -804,9 +701,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -848,18 +745,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/Red_River_future_create_collection_from_zarr.ipynb b/workflows/archive/Red_River_future_create_collection_from_zarr.ipynb index e75a2459a96cc9ce5d791fc7f15903dc42bee1b7..60bc2ba7cff68909889d78dfb275c7afcac9e94c 100644 --- a/workflows/archive/Red_River_future_create_collection_from_zarr.ipynb +++ b/workflows/archive/Red_River_future_create_collection_from_zarr.ipynb @@ -192,22 +192,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -445,59 +432,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -564,7 +498,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -576,7 +510,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -610,7 +544,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'], round_dec=13)\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=13)\n", "print(f'x step: {x_step}')" ] }, @@ -623,7 +557,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -658,7 +592,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'], round_dec=13)\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=13)\n", "print(f'y step: {y_step}')" ] }, @@ -671,7 +605,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -726,10 +660,10 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", " }" ] }, @@ -741,43 +675,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -804,9 +701,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -848,18 +745,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/Red_River_historical_create_collection_from_zarr.ipynb b/workflows/archive/Red_River_historical_create_collection_from_zarr.ipynb index 19ca03157ebe6aac5a580c89615e377398d5a42a..eb6a317a91dc621635452bbad4cd43af95a76028 100644 --- a/workflows/archive/Red_River_historical_create_collection_from_zarr.ipynb +++ b/workflows/archive/Red_River_historical_create_collection_from_zarr.ipynb @@ -192,22 +192,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -445,59 +432,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -564,7 +498,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -576,7 +510,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -610,7 +544,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'], round_dec=13)\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=13)\n", "print(f'x step: {x_step}')" ] }, @@ -623,7 +557,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -658,7 +592,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'], round_dec=13)\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=13)\n", "print(f'y step: {y_step}')" ] }, @@ -671,7 +605,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -726,10 +660,10 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", " }" ] }, @@ -741,43 +675,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -804,9 +701,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -848,18 +745,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/TTU_2019_rcp45_time_slices_create_collection_from_zarr.ipynb b/workflows/archive/TTU_2019_rcp45_time_slices_create_collection_from_zarr.ipynb index 7cb328b50a059bbf16cb48cde8ed6ff054bb0b69..ab7ccc577a5b983683714062078c5575c9ef7e64 100644 --- a/workflows/archive/TTU_2019_rcp45_time_slices_create_collection_from_zarr.ipynb +++ b/workflows/archive/TTU_2019_rcp45_time_slices_create_collection_from_zarr.ipynb @@ -189,22 +189,9 @@ "outputs": [], "source": [ "# dims_auto_extract = ['X', 'Y', 'T']\n", - "# def extract_dim(ds, d):\n", - "# try:\n", - "# dim_list = ds.cf.axes[d]\n", - "# assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - "# dim = dim_list[0]\n", - "# except KeyError:\n", - "# print(f\"Could not auto-extract {d} dimension name.\")\n", - "# print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - "# dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - "# assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - "# print(f\"name of {d} dimension: {dim}\\n\")\n", - "# return dim\n", - "\n", "# dim_names_dict = {}\n", "# for d in dims_auto_extract:\n", - "# dim_names_dict[d] = extract_dim(ds, d)\n", + "# dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "dim_names_dict = {'X': 'lon', 'Y': 'lat', 'T': 'time'}\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] @@ -443,59 +430,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -577,7 +511,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -609,7 +543,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -622,7 +556,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -655,7 +589,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -668,7 +602,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -723,10 +657,10 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'nv': pystac.extensions.datacube.Dimension({'type': 'count', 'description': get_long_name(ds, 'nv'), 'extent': [ds.nv.min().item(), ds.nv.max().item()]}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'nv': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'nv'), 'extent': [ds.nv.min().item(), ds.nv.max().item()]}),\n", " }" ] }, @@ -738,43 +672,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -801,9 +698,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -845,18 +742,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/TTU_2019_rcp85_gridded_annual_create_collection_from_zarr.ipynb b/workflows/archive/TTU_2019_rcp85_gridded_annual_create_collection_from_zarr.ipynb index 355ca14fbe8a5ffd5225fbb9b9831bbbb06fb29d..c8586ac6bd7da8d4d924d0fe45d3f622177c5e3e 100644 --- a/workflows/archive/TTU_2019_rcp85_gridded_annual_create_collection_from_zarr.ipynb +++ b/workflows/archive/TTU_2019_rcp85_gridded_annual_create_collection_from_zarr.ipynb @@ -187,22 +187,9 @@ "outputs": [], "source": [ "# dims_auto_extract = ['X', 'Y', 'T']\n", - "# def extract_dim(ds, d):\n", - "# try:\n", - "# dim_list = ds.cf.axes[d]\n", - "# assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - "# dim = dim_list[0]\n", - "# except KeyError:\n", - "# print(f\"Could not auto-extract {d} dimension name.\")\n", - "# print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - "# dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - "# assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - "# print(f\"name of {d} dimension: {dim}\\n\")\n", - "# return dim\n", - "\n", "# dim_names_dict = {}\n", "# for d in dims_auto_extract:\n", - "# dim_names_dict[d] = extract_dim(ds, d)\n", + "# dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "dim_names_dict = {'X': 'lon', 'Y': 'lat', 'T': 'time'}\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] @@ -441,59 +428,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -575,7 +509,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -607,7 +541,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -620,7 +554,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -653,7 +587,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -666,7 +600,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -721,10 +655,10 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'nv': pystac.extensions.datacube.Dimension({'type': 'count', 'description': get_long_name(ds, 'nv'), 'extent': [ds.nv.min().item(), ds.nv.max().item()]}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'nv': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'nv'), 'extent': [ds.nv.min().item(), ds.nv.max().item()]}),\n", " }" ] }, @@ -736,43 +670,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -799,9 +696,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -843,18 +740,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/TTU_2019_rcp85_time_slices_create_collection_from_zarr.ipynb b/workflows/archive/TTU_2019_rcp85_time_slices_create_collection_from_zarr.ipynb index 831b1d24bd955d93f068f38944e8ad0f3c032791..3e90059b1adec4c9d5d7e2b94dcc963bb1d6b7ba 100644 --- a/workflows/archive/TTU_2019_rcp85_time_slices_create_collection_from_zarr.ipynb +++ b/workflows/archive/TTU_2019_rcp85_time_slices_create_collection_from_zarr.ipynb @@ -189,22 +189,9 @@ "outputs": [], "source": [ "# dims_auto_extract = ['X', 'Y', 'T']\n", - "# def extract_dim(ds, d):\n", - "# try:\n", - "# dim_list = ds.cf.axes[d]\n", - "# assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - "# dim = dim_list[0]\n", - "# except KeyError:\n", - "# print(f\"Could not auto-extract {d} dimension name.\")\n", - "# print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - "# dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - "# assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - "# print(f\"name of {d} dimension: {dim}\\n\")\n", - "# return dim\n", - "\n", "# dim_names_dict = {}\n", "# for d in dims_auto_extract:\n", - "# dim_names_dict[d] = extract_dim(ds, d)\n", + "# dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "dim_names_dict = {'X': 'lon', 'Y': 'lat', 'T': 'time'}\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] @@ -443,59 +430,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -577,7 +511,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -609,7 +543,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -622,7 +556,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -655,7 +589,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -668,7 +602,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -723,10 +657,10 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'nv': pystac.extensions.datacube.Dimension({'type': 'count', 'description': get_long_name(ds, 'nv'), 'extent': [ds.nv.min().item(), ds.nv.max().item()]}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'nv': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'nv'), 'extent': [ds.nv.min().item(), ds.nv.max().item()]}),\n", " }" ] }, @@ -738,43 +672,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -801,9 +698,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -845,18 +742,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/alaska_et_2020_ccsm4_historical_simulation_create_collection_from_zarr.ipynb b/workflows/archive/alaska_et_2020_ccsm4_historical_simulation_create_collection_from_zarr.ipynb index 57997f52ce849f4a843c52292fd296962cb4a0ae..4248535fdb130df173b2fd31fcedb96fb4dc9b7a 100644 --- a/workflows/archive/alaska_et_2020_ccsm4_historical_simulation_create_collection_from_zarr.ipynb +++ b/workflows/archive/alaska_et_2020_ccsm4_historical_simulation_create_collection_from_zarr.ipynb @@ -199,22 +199,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -466,56 +453,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, debug=False, step_ix=0):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -573,7 +510,7 @@ }, "outputs": [], "source": [ - "time_step = get_step(ds, dim_names_dict['T'])\n", + "time_step = stac_helpers.get_step(ds, dim_names_dict['T'])\n", "print(f'time step: {time_step}')" ] }, @@ -587,7 +524,7 @@ "outputs": [], "source": [ "# debugging for time steps: get all step values and locations\n", - "time_step = get_step(ds, dim_names_dict['T'], debug=True, step_ix=2)" + "time_step = stac_helpers.get_step(ds, dim_names_dict['T'], debug=True, step_ix=2)" ] }, { @@ -623,7 +560,7 @@ }, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -636,7 +573,7 @@ }, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -673,9 +610,9 @@ "# dimension name should come from the coordinates printed above\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': pd.Timedelta(time_step).isoformat()}),\n", - " 'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " 'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': pd.Timedelta(time_step).isoformat()}),\n", + " 'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': stac_helpers.get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " 'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", " }" ] }, @@ -687,45 +624,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -754,9 +652,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -800,18 +698,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/alaska_et_2020_era-interim_reanalysis_create_collection_from_zarr.ipynb b/workflows/archive/alaska_et_2020_era-interim_reanalysis_create_collection_from_zarr.ipynb index b89065402a87400ab61cd900983b9c5fec20b7e4..eb4c969242c0326ede2994fe64a8329481423ac8 100644 --- a/workflows/archive/alaska_et_2020_era-interim_reanalysis_create_collection_from_zarr.ipynb +++ b/workflows/archive/alaska_et_2020_era-interim_reanalysis_create_collection_from_zarr.ipynb @@ -199,22 +199,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -466,56 +453,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, debug=False, step_ix=0):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -573,7 +510,7 @@ }, "outputs": [], "source": [ - "time_step = get_step(ds, dim_names_dict['T'])\n", + "time_step = stac_helpers.get_step(ds, dim_names_dict['T'])\n", "print(f'time step: {time_step}')" ] }, @@ -587,7 +524,7 @@ "outputs": [], "source": [ "# debugging for time steps: get all step values and locations\n", - "time_step = get_step(ds, dim_names_dict['T'], debug=True, step_ix=2)" + "time_step = stac_helpers.get_step(ds, dim_names_dict['T'], debug=True, step_ix=2)" ] }, { @@ -623,7 +560,7 @@ }, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -636,7 +573,7 @@ }, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -673,9 +610,9 @@ "# dimension name should come from the coordinates printed above\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': pd.Timedelta(time_step).isoformat()}),\n", - " 'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " 'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': pd.Timedelta(time_step).isoformat()}),\n", + " 'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': stac_helpers.get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " 'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", " }" ] }, @@ -687,45 +624,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -754,9 +652,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -800,18 +698,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/alaska_et_2020_gfdl_historical_simulation_create_collection_from_zarr.ipynb b/workflows/archive/alaska_et_2020_gfdl_historical_simulation_create_collection_from_zarr.ipynb index b9af80918a2e7c49ce55761055fd04aba7139323..2de45463e65981e90a9ce233663039937eba0bf2 100644 --- a/workflows/archive/alaska_et_2020_gfdl_historical_simulation_create_collection_from_zarr.ipynb +++ b/workflows/archive/alaska_et_2020_gfdl_historical_simulation_create_collection_from_zarr.ipynb @@ -199,22 +199,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -466,56 +453,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, debug=False, step_ix=0):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -573,7 +510,7 @@ }, "outputs": [], "source": [ - "time_step = get_step(ds, dim_names_dict['T'])\n", + "time_step = stac_helpers.get_step(ds, dim_names_dict['T'])\n", "print(f'time step: {time_step}')" ] }, @@ -587,7 +524,7 @@ "outputs": [], "source": [ "# debugging for time steps: get all step values and locations\n", - "time_step = get_step(ds, dim_names_dict['T'], debug=True, step_ix=2)" + "time_step = stac_helpers.get_step(ds, dim_names_dict['T'], debug=True, step_ix=2)" ] }, { @@ -623,7 +560,7 @@ }, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -636,7 +573,7 @@ }, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -673,9 +610,9 @@ "# dimension name should come from the coordinates printed above\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': pd.Timedelta(time_step).isoformat()}),\n", - " 'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " 'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': pd.Timedelta(time_step).isoformat()}),\n", + " 'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': stac_helpers.get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " 'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", " }" ] }, @@ -687,45 +624,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -754,9 +652,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -800,18 +698,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/conus404-daily_create_collection_from_zarr.ipynb b/workflows/archive/conus404-daily_create_collection_from_zarr.ipynb deleted file mode 100644 index 6fd10175c650e92b2dd863975619377c59ed4005..0000000000000000000000000000000000000000 --- a/workflows/archive/conus404-daily_create_collection_from_zarr.ipynb +++ /dev/null @@ -1,784 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "6c10e07b-1e60-4926-af1d-fa75dc78e5d4", - "metadata": { - "tags": [] - }, - "source": [ - "# CONUS404 Daily Zarr -> Collection Workflow\n", - "This is a workflow to build a [STAC collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md) from the zarr asset for the dataset named above. We use the [datacube extension](https://github.com/stac-extensions/datacube) to define the spatial and temporal dimensions of the zarr store, as well as the variables it contains.\n", - "\n", - "To simplify this workflow so that it can scale to many datasets, a few simplifying suggestions and assumptions are made:\n", - "1. For USGS data, we can use the CC0-1.0 license. For all other data we can use Unlicense. Ref: https://spdx.org/licenses/\n", - "2. I am assuming all coordinates are from the WGS84 datum if not specified." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "201e0945-de55-45ff-b095-c2af009a4e62", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import pystac\n", - "from pystac.extensions.datacube import CollectionDatacubeExtension, AssetDatacubeExtension, AdditionalDimension, DatacubeExtension\n", - "import xarray as xr\n", - "import cf_xarray\n", - "import os\n", - "import fsspec\n", - "import cf_xarray\n", - "import hvplot.xarray\n", - "import pandas as pd\n", - "import json\n", - "import numpy as np\n", - "import metpy\n", - "import cartopy.crs as ccrs\n", - "import cfunits\n", - "import json" - ] - }, - { - "cell_type": "markdown", - "id": "20b00e88-5a13-46b3-9787-d9ac2d4e7bd6", - "metadata": {}, - "source": [ - "## Open up NHGF STAC Catalog" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "adf6c59d-58cd-48b1-a5fd-3bb205a3ef56", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# define folder location where your STAC catalog json file is\n", - "catalog_path = os.path.join('..', '..', 'catalog')\n", - "# open catalog\n", - "catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))" - ] - }, - { - "cell_type": "markdown", - "id": "996e60ba-13e4-453a-8534-e62ce747f0fa", - "metadata": {}, - "source": [ - "## Collection Metadata Input" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "482d204d-b5b6-40e5-ac42-55b459be1097", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# name for STAC collection\n", - "collection_id = 'conus404-daily'\n", - "# description of STAC collection\n", - "collection_description = 'CONUS404 40 years of daily values for subset of model output variables derived from hourly values on cloud storage'\n", - "# license for dataset\n", - "collection_license = 'CC0-1.0'" - ] - }, - { - "cell_type": "markdown", - "id": "116b5837-8e85-4ae7-964a-803533ded714", - "metadata": {}, - "source": [ - "## Asset Metadata Input" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dd6fa323-132a-4794-8c80-576933f547a0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# url to zarr store that you want to create a collection for\n", - "zarr_url = 's3://hytest/conus404/conus404_daily.zarr/'\n", - "\n", - "# define keyword arguments needed for opening the dataset with xarray\n", - "# ref: https://github.com/stac-extensions/xarray-assets\n", - "xarray_opendataset_kwargs = {\"xarray:open_kwargs\":{\"chunks\":{},\"engine\":\"zarr\",\"consolidated\":True},\n", - " \"xarray:storage_options\": {\"anon\": True, \"client_kwargs\": {\"endpoint_url\":\"https://usgs.osn.mghpcc.org/\"}}}\n", - "# description for zarr url asset attached to collection (zarr_url)\n", - "asset_description = \"Open Storage Network Pod S3 API access to collection zarr group\"\n", - "# roles to tag zarr url asset with\n", - "asset_roles = [\"data\",\"zarr\",\"s3\"]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e1441cd4-e94c-4902-af46-8f1af470eb6b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# url to zarr store that you want to create a collection for\n", - "zarr_url2 = 's3://nhgf-development/conus404/conus404_daily_202210.zarr/'\n", - "\n", - "# define keyword arguments needed for opening the dataset with xarray\n", - "# ref: https://github.com/stac-extensions/xarray-assets\n", - "xarray_opendataset_kwargs2 = {\"xarray:open_kwargs\":{\"chunks\":{},\"engine\":\"zarr\",\"consolidated\":True},\n", - " \"xarray:storage_options\":{\"requester_pays\":True}}\n", - "# description for zarr url asset attached to collection (zarr_url)\n", - "asset_description2 = \"S3 access to collection zarr group\"\n", - "# roles to tag zarr url asset with\n", - "asset_roles2 = [\"data\",\"zarr\",\"s3\"]" - ] - }, - { - "cell_type": "markdown", - "id": "b213b74f-ad17-4774-93b6-3b62be616b45", - "metadata": { - "tags": [] - }, - "source": [ - "## Data Exploration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "708f2cf5-79ab-49af-8067-de31d0d13ee6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# open and view zarr dataset\n", - "fs2 = fsspec.filesystem('s3', anon=True, endpoint_url='https://usgs.osn.mghpcc.org/')\n", - "ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr', \n", - " backend_kwargs={'consolidated':True}, chunks={})\n", - "ds" - ] - }, - { - "cell_type": "markdown", - "id": "0bc7e9b3-ad62-4b10-a18e-66b7ed2d35dc", - "metadata": {}, - "source": [ - "## Identify x, y, t dimensions of dataset\n", - "May require user input if dimensions cannot be auto-detected." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab91268f-7200-4cb1-979a-c7d75531d2c0", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# dims_auto_extract = ['X', 'Y', 'T']\n", - "# def extract_dim(ds, d):\n", - "# try:\n", - "# dim_list = ds.cf.axes[d]\n", - "# assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - "# dim = dim_list[0]\n", - "# except KeyError:\n", - "# print(f\"Could not auto-extract {d} dimension name.\")\n", - "# print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - "# dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - "# assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - "# print(f\"name of {d} dimension: {dim}\\n\")\n", - "# return dim\n", - "\n", - "# dim_names_dict = {}\n", - "# for d in dims_auto_extract:\n", - "# dim_names_dict[d] = extract_dim(ds, d)\n", - "dim_names_dict = {'X': 'x', 'Y': 'y', 'T': 'time'}\n", - "print(f\"Dimension dictionary: {dim_names_dict}\")" - ] - }, - { - "cell_type": "markdown", - "id": "667cb9e8-c4a5-4d26-8e30-be8c934adc37", - "metadata": {}, - "source": [ - "## Get crs info" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "172d9e6b-de88-4e3a-9184-706110a99659", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ds = ds.metpy.parse_cf()\n", - "crs = ds[list(ds.keys())[0]].metpy.cartopy_crs" - ] - }, - { - "cell_type": "markdown", - "id": "8fbfecfb-9886-4d06-a34c-6471cb0a6053", - "metadata": {}, - "source": [ - "## Plot a map" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4eb4d027-4266-4a0b-8f16-bacfbef06242", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# plot a map of a single variable\n", - "var_to_plot = 'SNOW'\n", - "da = ds[var_to_plot].sel(time='2014-03-01 00:00').load()\n", - "da.hvplot.quadmesh(x='lon', y='lat', rasterize=True,\n", - " geo=True, tiles='OSM', alpha=0.7, cmap='turbo')" - ] - }, - { - "cell_type": "markdown", - "id": "5e057a6c-06fb-4406-823b-e81c58e520e4", - "metadata": {}, - "source": [ - "## Plot a time series at a specific point\n", - "This can help you verify a variable's values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c7de2681-88c2-4597-857c-8f169c596f8b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# enter lat, lon of point you want to plot time series for\n", - "lat,lon = 39.978322,-105.2772194\n", - "time_start = '2013-01-01 00:00'\n", - "time_end = '2013-12-31 00:00'\n", - "x, y = crs.transform_point(lon, lat, src_crs=ccrs.PlateCarree()) # PlateCaree = Lat,Lon\n", - "da = ds[var_to_plot].sel(x=x, y=y, method='nearest').sel(time=slice(time_start,time_end)).load()\n", - "da.hvplot(x=dim_names_dict['T'], grid=True)" - ] - }, - { - "cell_type": "markdown", - "id": "a8c3ed37-8564-400b-a7fb-25bd5e43d21c", - "metadata": {}, - "source": [ - "## Create Collection Extent" - ] - }, - { - "cell_type": "markdown", - "id": "69f0d837-68a5-4fed-9a14-5d75cfbb0da4", - "metadata": {}, - "source": [ - "### Spatial Extent" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d46805e0-8e94-4ebe-aa01-d9a2d7051459", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# pull out lat/lon bbox for data\n", - "# coordinates must be from WGS 84 datum\n", - "# left, bottom, right, top\n", - "# Note: I'm not sure why but I have some trouble getting the data type right here - \n", - "# I've included all the options I've had to run to get it to not be a regular float rather \n", - "# than a numpy float below - switch the commented line if you have this issue\n", - "#coord_bounds = [ds.lon.data.min().compute().astype(float), ds.lat.data.min().compute().astype(float), ds.lon.data.max().compute().astype(float), ds.lat.data.max().compute().astype(float)]\n", - "#coord_bounds = [ds.lon.data.min().compute().astype(float).tolist(), ds.lat.data.min().compute().astype(float).tolist(), ds.lon.data.max().compute().astype(float).tolist(), ds.lat.data.max().compute().astype(float).tolist()]\n", - "coord_bounds = [ds.lon.data.min().astype(float).item(), ds.lat.data.min().astype(float).item(), ds.lon.data.max().astype(float).item(), ds.lat.data.max().astype(float).item()]\n", - "print(coord_bounds)\n", - "# create a spatial extent object \n", - "spatial_extent = pystac.SpatialExtent(bboxes=[coord_bounds])" - ] - }, - { - "cell_type": "markdown", - "id": "a04c8fca-1d33-43ac-9e2b-62d7be2887f7", - "metadata": {}, - "source": [ - "### Temporal Extent" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "41a84995-867c-4152-8c57-85e3758bbb77", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# pull out first and last timestamps\n", - "temporal_extent_lower = pd.Timestamp(ds[dim_names_dict['T']].data.min())\n", - "temporal_extent_upper = pd.Timestamp(ds[dim_names_dict['T']].data.max())\n", - "print(f'min: {temporal_extent_lower} \\nmax: {temporal_extent_upper}')\n", - "# create a temporal extent object\n", - "temporal_extent = pystac.TemporalExtent(intervals=[[temporal_extent_lower, temporal_extent_upper]])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b1e37c4-5348-46ad-abc9-e005b5d6c02b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)" - ] - }, - { - "cell_type": "markdown", - "id": "cfb71202-03df-45b5-ac2f-0dc2ee1ab780", - "metadata": {}, - "source": [ - "## Create pystac collection" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7e96811b-95ae-406a-9728-55fc429d4e1f", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "if catalog.get_child(collection_id):\n", - " collection = catalog.get_child(collection_id)\n", - " print(\"existing collection opened\")\n", - " collection.extent=collection_extent\n", - " collection.description=collection_description\n", - " collection.license=collection_license\n", - "else:\n", - " collection = pystac.Collection(id=collection_id,\n", - " description=collection_description,\n", - " extent=collection_extent,\n", - " license=collection_license)\n", - " print(\"new collection created\")" - ] - }, - { - "cell_type": "markdown", - "id": "a21c76e8-cd57-4eb5-a33f-7c668a3b3205", - "metadata": {}, - "source": [ - "## Add zarr url asset to collection" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "094832af-d22b-4359-b0f6-cf687acce5cc", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "asset_id = \"zarr-s3-osn\"\n", - "asset = pystac.Asset(href=zarr_url,\n", - " description=asset_description,\n", - " media_type=\"application/vnd+zarr\",\n", - " roles=asset_roles,\n", - " extra_fields = xarray_opendataset_kwargs)\n", - "collection.add_asset(asset_id, asset)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c298d07-f234-4a08-986d-87f4a39e9ae6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "asset_id2 = \"zarr-s3\"\n", - "asset2 = pystac.Asset(href=zarr_url2,\n", - " description=asset_description2,\n", - " media_type=\"application/vnd+zarr\",\n", - " roles=asset_roles2,\n", - " extra_fields = xarray_opendataset_kwargs2)\n", - "collection.add_asset(asset_id2, asset2)" - ] - }, - { - "cell_type": "markdown", - "id": "f67cd5c9-db33-45c2-bc21-480cd67354f4", - "metadata": {}, - "source": [ - "## Add datacube extension to collection" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fc00946d-2880-491d-9b3b-3aeeb4414d6c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# instantiate extention on collection\n", - "dc = DatacubeExtension.ext(collection, add_if_missing=True)" - ] - }, - { - "cell_type": "markdown", - "id": "8bdd77a2-7587-485e-afb7-42af3a822241", - "metadata": {}, - "source": [ - "### Add cube dimensions (required field for extension)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "120a4914-3302-44a5-a282-0308ac84f040", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# list out dataset dimensions\n", - "# When writing data to Zarr, Xarray sets this attribute on all variables based on the variable dimensions. When reading a Zarr group, Xarray looks for this attribute on all arrays,\n", - "# raising an error if it can’t be found.\n", - "dims = list(ds.dims)\n", - "print(dims)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "00a18a29-fb9a-4b56-8009-493122997b16", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# get x, y bounds for extent of those dimensions (required)\n", - "xy_bounds = [ds[dim_names_dict['X']].data.min().astype(float).item(), ds[dim_names_dict['Y']].data.min().astype(float).item(), ds[dim_names_dict['X']].data.max().astype(float).item(), ds[dim_names_dict['Y']].data.max().astype(float).item()]\n", - "print(xy_bounds)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def get_step(dim_name):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " unique_steps = np.unique(diffs)\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(unique_steps)==1:\n", - " step = unique_steps[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, - { - "cell_type": "markdown", - "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", - "metadata": {}, - "source": [ - "#### user input needed - you will need to look at the crs information and create a cartopy crs object after identifying the projection type:\n", - "reference list of cartopy projections: https://scitools.org.uk/cartopy/docs/latest/reference/projections.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea452f62-5644-49b6-8a4e-7dc4f649fd1a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# print ot crs information in dataset\n", - "crs_info = ds.crs\n", - "print(crs_info)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b1d05ff-8e43-44a7-8343-178b112c4ad6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# create the appropriate cartopy projection\n", - "lcc = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian, \n", - " central_latitude=crs_info.latitude_of_projection_origin,\n", - " standard_parallels=crs_info.standard_parallel)\n", - "# the datacube extension can accept reference_system information as a numerical EPSG code, \n", - "# WKT2 (ISO 19162) string or PROJJSON object.\n", - "# we will use a projjson, as was done by Microsoft Planetary Computer here:\n", - "# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na\n", - "# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na\n", - "projjson = json.loads(lcc.to_json())\n", - "\n", - "# alternatively, I think we could do this:\n", - "#projjson = crs.to_json()" - ] - }, - { - "cell_type": "markdown", - "id": "00a5e041-081d-428d-ac2e-75d16de205e6", - "metadata": {}, - "source": [ - "#### user input needed - you will need to copy all of the dimensions from above into the dict and fill in the appropriate attributes(type, axis, extent):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a443497-67a9-4dce-a8e9-b08d31a88223", - "metadata": {}, - "outputs": [], - "source": [ - "# create a dictionary of datacube dimensions you would like to assign to this dataset\n", - "# dimension name should come from the coordinates printed above\n", - "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", - "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': pd.Timedelta(get_step('time')).isoformat()}),\n", - " 'x': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': get_long_name(ds, 'x'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': get_step('x'), 'reference_system': projjson}),\n", - " 'y': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': get_step('y'), 'reference_system': projjson}),\n", - " 'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top_stag')}),\n", - " 'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top')}),\n", - " 'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'soil_layers_stag')}),\n", - " 'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, 'x_stag')}),\n", - " 'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y_stag')}),\n", - " 'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snow_layers_stag')}),\n", - " 'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snso_layers_stag')}),\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "0f277883-a3fd-425f-966a-ca2140d0ef2f", - "metadata": {}, - "source": [ - "### Add cube variables (optional field for extension)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9272931-fc0b-4f2a-9546-283033e9cde8", - "metadata": {}, - "outputs": [], - "source": [ - "# pull list of vars from dataset\n", - "vars = list(ds.variables)\n", - "\n", - "# drop metpy_crs coordinate we have added\n", - "if 'metpy_crs' in ds.coords:\n", - " ds = ds.drop_vars('metpy_crs')\n", - "\n", - "# spec says that the keys of cube:dimensions and cube:variables should be unique together; a key like lat should not be both a dimension and a variable.\n", - "# we will drop all values in dims from vars\n", - "vars = [v for v in vars if v not in dims]\n", - "\n", - "# Microsoft Planetary Computer includes coordinates and crs as variables here:\n", - "# https://planetarycomputer.microsoft.com/dataset/daymet-annual-na\n", - "# https://planetarycomputer.microsoft.com/api/stac/v1/collections/daymet-annual-na\n", - "# we will keep those in the var list\n", - "\n", - "# create dictionary of dataset variables and associated dimensions\n", - "vars_dict={}\n", - "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", - " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" - ] - }, - { - "cell_type": "markdown", - "id": "11ad5352-884c-4472-8864-4570a96f66e5", - "metadata": {}, - "source": [ - "### Finalize extension" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "10141fd4-91d6-491d-878b-02653720891d", - "metadata": {}, - "outputs": [], - "source": [ - "# add dimesions and variables to collection extension\n", - "dc.apply(dimensions=dims_dict, variables=vars_dict)" - ] - }, - { - "cell_type": "markdown", - "id": "615ca168-75fb-4135-9941-0ef5fe4fd1cb", - "metadata": {}, - "source": [ - "## Add STAC Collection to Catalog and Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2120a55-3d04-4122-a93f-29afcdb8cb1b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "# # helper to find items of wrong type\n", - "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4b75791b-6b2d-40be-b7c6-330a60888fb5", - "metadata": {}, - "outputs": [], - "source": [ - "if catalog.get_child(collection_id):\n", - " collection.normalize_and_save(root_href=os.path.join(catalog_path, collection_id), catalog_type=pystac.CatalogType.SELF_CONTAINED)\n", - "else:\n", - " catalog.add_child(collection)\n", - " catalog.normalize_and_save(root_href=catalog_path, catalog_type=pystac.CatalogType.SELF_CONTAINED)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6f676b5-e892-4bfb-8d73-2828addd838c", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "global-global-pangeo", - "language": "python", - "name": "conda-env-global-global-pangeo-py" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/workflows/archive/cprep_create_collection_from_zarr.ipynb b/workflows/archive/cprep_create_collection_from_zarr.ipynb index 06cdb24aea85bc7fb02571d23137e3f74164b5a2..38eff15f0e53c9f73413b9ea224ed1319f8e7546 100644 --- a/workflows/archive/cprep_create_collection_from_zarr.ipynb +++ b/workflows/archive/cprep_create_collection_from_zarr.ipynb @@ -255,22 +255,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -524,63 +511,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -646,7 +576,7 @@ }, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -660,7 +590,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -704,7 +634,7 @@ }, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'], round_dec=13)\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=13)\n", "print(f'x step: {x_step}')" ] }, @@ -719,7 +649,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -766,7 +696,7 @@ }, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'], round_dec=13)\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=13)\n", "print(f'y step: {y_step}')" ] }, @@ -781,7 +711,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -836,10 +766,10 @@ "# dimension name should come from the coordinates printed above\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': time_step}),\n", - " 'lon': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': get_long_name(ds, 'lon'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " 'lat': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'lat'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", + "dims_dict = {'time': pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, 'time'), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': time_step}),\n", + " 'lon': pystac.extensions.datacube.Dimension({'type': 'spatial', 'description': stac_helpers.get_long_name(ds, 'lon'), 'axis': 'x', 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " 'lat': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, 'lat'), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'bnds': pystac.extensions.datacube.Dimension({'type': 'count', 'description': stac_helpers.get_long_name(ds, 'bnds'), 'extent': [ds.bnds.min().item(), ds.bnds.max().item()]}),\n", " }" ] }, @@ -851,43 +781,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -914,9 +807,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -958,18 +851,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/dcp_conus_t_create_collection_from_zarr.ipynb b/workflows/archive/dcp_conus_t_create_collection_from_zarr.ipynb index ed830905b6f9ea0c4b4800d55ba80b168ef53def..47dfaebf878cf320e8dba0b934be32ee895028fd 100644 --- a/workflows/archive/dcp_conus_t_create_collection_from_zarr.ipynb +++ b/workflows/archive/dcp_conus_t_create_collection_from_zarr.ipynb @@ -191,22 +191,9 @@ "outputs": [], "source": [ "# dims_auto_extract = ['X', 'Y', 'T']\n", - "# def extract_dim(ds, d):\n", - "# try:\n", - "# dim_list = ds.cf.axes[d]\n", - "# assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - "# dim = dim_list[0]\n", - "# except KeyError:\n", - "# print(f\"Could not auto-extract {d} dimension name.\")\n", - "# print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - "# dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - "# assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - "# print(f\"name of {d} dimension: {dim}\\n\")\n", - "# return dim\n", - "\n", "# dim_names_dict = {}\n", "# for d in dims_auto_extract:\n", - "# dim_names_dict[d] = extract_dim(ds, d)\n", + "# dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "dim_names_dict = {'X': 'lon', 'Y': 'lat', 'T': 'time'}\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] @@ -440,59 +427,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -561,7 +495,7 @@ }, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -575,7 +509,7 @@ "outputs": [], "source": [ "# debugging for time steps: get all step values and locations\n", - "time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -633,7 +567,7 @@ }, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -646,7 +580,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -681,7 +615,7 @@ }, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -694,7 +628,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -749,16 +683,9 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top_stag')}),\n", - " 'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top')}),\n", - " 'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'soil_layers_stag')}),\n", - " 'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, 'x_stag')}),\n", - " 'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y_stag')}),\n", - " 'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snow_layers_stag')}),\n", - " 'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snso_layers_stag')}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", " }" ] }, @@ -770,43 +697,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -833,9 +723,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -877,18 +767,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/iclus_hc_create_collection_from_zarr.ipynb b/workflows/archive/iclus_hc_create_collection_from_zarr.ipynb index 5b7c31c9f22f4c379c8e2a40c68a658066bd85ac..8abe442feb2bdf6c078cfe9332fb6c6f83053f9d 100644 --- a/workflows/archive/iclus_hc_create_collection_from_zarr.ipynb +++ b/workflows/archive/iclus_hc_create_collection_from_zarr.ipynb @@ -192,22 +192,9 @@ "source": [ "# note - no time dimension in this dataset\n", "dims_auto_extract = ['X', 'Y']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -443,59 +430,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -562,7 +496,7 @@ "metadata": {}, "outputs": [], "source": [ - "# time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "# time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "# print(f'time step: {time_step}')" ] }, @@ -574,7 +508,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -606,7 +540,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -619,7 +553,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -654,7 +588,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -667,7 +601,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -722,8 +656,8 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + "dims_dict = {dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", "}" ] }, @@ -735,43 +669,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -798,9 +695,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -842,18 +739,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/maca_vic_future_create_collection_from_zarr.ipynb b/workflows/archive/maca_vic_future_create_collection_from_zarr.ipynb index ef0a15293df3fe6e919b363608e9d3bc89c6fd60..c21c4d41c2d97001e430e2c4bf6aafa83c322bb7 100644 --- a/workflows/archive/maca_vic_future_create_collection_from_zarr.ipynb +++ b/workflows/archive/maca_vic_future_create_collection_from_zarr.ipynb @@ -191,22 +191,9 @@ "outputs": [], "source": [ "# dims_auto_extract = ['X', 'Y', 'T']\n", - "# def extract_dim(ds, d):\n", - "# try:\n", - "# dim_list = ds.cf.axes[d]\n", - "# assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - "# dim = dim_list[0]\n", - "# except KeyError:\n", - "# print(f\"Could not auto-extract {d} dimension name.\")\n", - "# print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - "# dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - "# assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - "# print(f\"name of {d} dimension: {dim}\\n\")\n", - "# return dim\n", - "\n", "# dim_names_dict = {}\n", "# for d in dims_auto_extract:\n", - "# dim_names_dict[d] = extract_dim(ds, d)\n", + "# dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "dim_names_dict = {'X': 'lon', 'Y': 'lat', 'T': 'time'}\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] @@ -443,59 +430,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -562,7 +496,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -574,7 +508,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -606,7 +540,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -619,7 +553,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -652,7 +586,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -665,7 +599,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -720,9 +654,9 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", " }" ] }, @@ -734,43 +668,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -797,9 +694,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -841,18 +738,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/maca_vic_past_create_collection_from_zarr.ipynb b/workflows/archive/maca_vic_past_create_collection_from_zarr.ipynb index 4af0a6eec9cfc7c1d061959a907eae73a349b079..1ceab06196e64ec3e112dba68cc6655d34fb1077 100644 --- a/workflows/archive/maca_vic_past_create_collection_from_zarr.ipynb +++ b/workflows/archive/maca_vic_past_create_collection_from_zarr.ipynb @@ -191,22 +191,9 @@ "outputs": [], "source": [ "# dims_auto_extract = ['X', 'Y', 'T']\n", - "# def extract_dim(ds, d):\n", - "# try:\n", - "# dim_list = ds.cf.axes[d]\n", - "# assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - "# dim = dim_list[0]\n", - "# except KeyError:\n", - "# print(f\"Could not auto-extract {d} dimension name.\")\n", - "# print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - "# dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - "# assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - "# print(f\"name of {d} dimension: {dim}\\n\")\n", - "# return dim\n", - "\n", "# dim_names_dict = {}\n", "# for d in dims_auto_extract:\n", - "# dim_names_dict[d] = extract_dim(ds, d)\n", + "# dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "dim_names_dict = {'X': 'lon', 'Y': 'lat', 'T': 'time'}\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] @@ -443,59 +430,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -562,7 +496,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -574,7 +508,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -606,7 +540,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -619,7 +553,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -652,7 +586,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -665,7 +599,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -720,9 +654,9 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", " }" ] }, @@ -734,43 +668,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -797,9 +694,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -841,18 +738,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/macav2_future_daily_create_collection_from_zarr.ipynb b/workflows/archive/macav2_future_daily_create_collection_from_zarr.ipynb index 7b309613a13bf8ed222a71fe5f0182cec3c1fcab..e6553681c7860c070b6f19288bebc0132c30e103 100644 --- a/workflows/archive/macav2_future_daily_create_collection_from_zarr.ipynb +++ b/workflows/archive/macav2_future_daily_create_collection_from_zarr.ipynb @@ -191,22 +191,9 @@ "outputs": [], "source": [ "# dims_auto_extract = ['X', 'Y', 'T']\n", - "# def extract_dim(ds, d):\n", - "# try:\n", - "# dim_list = ds.cf.axes[d]\n", - "# assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - "# dim = dim_list[0]\n", - "# except KeyError:\n", - "# print(f\"Could not auto-extract {d} dimension name.\")\n", - "# print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - "# dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - "# assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - "# print(f\"name of {d} dimension: {dim}\\n\")\n", - "# return dim\n", - "\n", "# dim_names_dict = {}\n", "# for d in dims_auto_extract:\n", - "# dim_names_dict[d] = extract_dim(ds, d)\n", + "# dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "dim_names_dict = {'X': 'lon', 'Y': 'lat', 'T': 'time'}\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] @@ -440,59 +427,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -559,7 +493,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -571,7 +505,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -607,7 +541,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'], round_dec=4)\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=4)\n", "print(f'x step: {x_step}')" ] }, @@ -620,7 +554,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -657,7 +591,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'], round_dec=4)\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=4)\n", "print(f'y step: {y_step}')" ] }, @@ -670,7 +604,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -725,9 +659,9 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", " }" ] }, @@ -739,43 +673,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -802,9 +699,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -846,18 +743,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/macav2_future_monthly_create_collection_from_zarr.ipynb b/workflows/archive/macav2_future_monthly_create_collection_from_zarr.ipynb index ecc3f0539a5cc7f889d5d32b552ba925e4daae20..2d2c3ab93eea05983ff8706978cb26d48b6f4f55 100644 --- a/workflows/archive/macav2_future_monthly_create_collection_from_zarr.ipynb +++ b/workflows/archive/macav2_future_monthly_create_collection_from_zarr.ipynb @@ -191,22 +191,9 @@ "outputs": [], "source": [ "# dims_auto_extract = ['X', 'Y', 'T']\n", - "# def extract_dim(ds, d):\n", - "# try:\n", - "# dim_list = ds.cf.axes[d]\n", - "# assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - "# dim = dim_list[0]\n", - "# except KeyError:\n", - "# print(f\"Could not auto-extract {d} dimension name.\")\n", - "# print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - "# dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - "# assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - "# print(f\"name of {d} dimension: {dim}\\n\")\n", - "# return dim\n", - "\n", "# dim_names_dict = {}\n", "# for d in dims_auto_extract:\n", - "# dim_names_dict[d] = extract_dim(ds, d)\n", + "# dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "dim_names_dict = {'X': 'lon', 'Y': 'lat', 'T': 'time'}\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] @@ -443,59 +430,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -577,7 +511,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -613,7 +547,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'], round_dec=4)\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=4)\n", "print(f'x step: {x_step}')" ] }, @@ -626,7 +560,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -663,7 +597,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'], round_dec=4)\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=4)\n", "print(f'y step: {y_step}')" ] }, @@ -676,7 +610,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -731,9 +665,9 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", " }" ] }, @@ -745,43 +679,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -808,9 +705,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -852,18 +749,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/macav2_historical_daily_create_collection_from_zarr.ipynb b/workflows/archive/macav2_historical_daily_create_collection_from_zarr.ipynb index f1da95f09d1617b205f2f075ac7ae012104b57a3..419ed588969d16a29805ac59d8a90c7c51a2299e 100644 --- a/workflows/archive/macav2_historical_daily_create_collection_from_zarr.ipynb +++ b/workflows/archive/macav2_historical_daily_create_collection_from_zarr.ipynb @@ -191,22 +191,9 @@ "outputs": [], "source": [ "# dims_auto_extract = ['X', 'Y', 'T']\n", - "# def extract_dim(ds, d):\n", - "# try:\n", - "# dim_list = ds.cf.axes[d]\n", - "# assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - "# dim = dim_list[0]\n", - "# except KeyError:\n", - "# print(f\"Could not auto-extract {d} dimension name.\")\n", - "# print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - "# dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - "# assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - "# print(f\"name of {d} dimension: {dim}\\n\")\n", - "# return dim\n", - "\n", "# dim_names_dict = {}\n", "# for d in dims_auto_extract:\n", - "# dim_names_dict[d] = extract_dim(ds, d)\n", + "# dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "dim_names_dict = {'X': 'lon', 'Y': 'lat', 'T': 'time'}\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] @@ -440,59 +427,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -559,7 +493,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -571,7 +505,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -607,7 +541,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'], round_dec=4)\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=4)\n", "print(f'x step: {x_step}')" ] }, @@ -620,7 +554,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -657,7 +591,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'], round_dec=4)\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=4)\n", "print(f'y step: {y_step}')" ] }, @@ -670,7 +604,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -725,9 +659,9 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", " }" ] }, @@ -739,43 +673,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -802,9 +699,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -846,18 +743,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/archive/macav2_historical_monthly_create_collection_from_zarr.ipynb b/workflows/archive/macav2_historical_monthly_create_collection_from_zarr.ipynb index 8f1e4adba68d488659b2f7263aa08490af31533d..7cfacd9c78f58ca4aee1664ff6a8ecab53ca5306 100644 --- a/workflows/archive/macav2_historical_monthly_create_collection_from_zarr.ipynb +++ b/workflows/archive/macav2_historical_monthly_create_collection_from_zarr.ipynb @@ -191,22 +191,9 @@ "outputs": [], "source": [ "# dims_auto_extract = ['X', 'Y', 'T']\n", - "# def extract_dim(ds, d):\n", - "# try:\n", - "# dim_list = ds.cf.axes[d]\n", - "# assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - "# dim = dim_list[0]\n", - "# except KeyError:\n", - "# print(f\"Could not auto-extract {d} dimension name.\")\n", - "# print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - "# dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - "# assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - "# print(f\"name of {d} dimension: {dim}\\n\")\n", - "# return dim\n", - "\n", "# dim_names_dict = {}\n", "# for d in dims_auto_extract:\n", - "# dim_names_dict[d] = extract_dim(ds, d)\n", + "# dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "dim_names_dict = {'X': 'lon', 'Y': 'lat', 'T': 'time'}\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] @@ -443,59 +430,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -577,7 +511,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -613,7 +547,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'], round_dec=4)\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'], round_dec=4)\n", "print(f'x step: {x_step}')" ] }, @@ -626,7 +560,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -663,7 +597,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'], round_dec=4)\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'], round_dec=4)\n", "print(f'y step: {y_step}')" ] }, @@ -676,7 +610,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -731,9 +665,9 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", " }" ] }, @@ -745,43 +679,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -808,9 +705,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -852,18 +749,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/examples/create_collection_from_zarr_conus404-daily.ipynb b/workflows/examples/create_collection_from_zarr_conus404-daily.ipynb index e92b50901dd55df995f9ca367969a41f289fc7f8..9cc827369fb22371950bbe079ae43be7226e26ea 100644 --- a/workflows/examples/create_collection_from_zarr_conus404-daily.ipynb +++ b/workflows/examples/create_collection_from_zarr_conus404-daily.ipynb @@ -36,7 +36,10 @@ "import metpy\n", "import cartopy.crs as ccrs\n", "import cfunits\n", - "import json" + "import json\n", + "import sys\n", + "sys.path.insert(1, '..')\n", + "import stac_helpers" ] }, { @@ -176,22 +179,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -461,59 +451,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -541,6 +478,7 @@ "metadata": {}, "outputs": [], "source": [ + "crs_info = ds.crs\n", "# create the appropriate cartopy projection\n", "lcc = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian, \n", " central_latitude=crs_info.latitude_of_projection_origin,\n", @@ -580,7 +518,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -592,7 +530,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -624,7 +562,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -637,7 +575,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -670,7 +608,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -683,7 +621,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -738,16 +676,16 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top_stag')}),\n", - " 'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top')}),\n", - " 'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'soil_layers_stag')}),\n", - " 'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, 'x_stag')}),\n", - " 'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y_stag')}),\n", - " 'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snow_layers_stag')}),\n", - " 'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snso_layers_stag')}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step':time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'bottom_top_stag')}),\n", + " 'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'bottom_top')}),\n", + " 'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'soil_layers_stag')}),\n", + " 'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, 'x_stag')}),\n", + " 'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, 'y_stag')}),\n", + " 'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'snow_layers_stag')}),\n", + " 'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'snso_layers_stag')}),\n", " }" ] }, @@ -759,43 +697,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -822,9 +723,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -866,18 +767,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = collection.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, { diff --git a/workflows/examples/create_item_from_zarr_conus404-daily.ipynb b/workflows/examples/create_item_from_zarr_conus404-daily.ipynb index c18ea7037109603d2fb130f27b23934fe1bcec40..ee25742835bfe01d9be056ec7d930ca8db03ec84 100644 --- a/workflows/examples/create_item_from_zarr_conus404-daily.ipynb +++ b/workflows/examples/create_item_from_zarr_conus404-daily.ipynb @@ -40,7 +40,10 @@ "import cfunits\n", "import json\n", "from shapely.geometry import Polygon, mapping\n", - "import orjson" + "import orjson\n", + "import sys\n", + "sys.path.insert(1, '..')\n", + "import stac_helpers" ] }, { @@ -184,22 +187,9 @@ "outputs": [], "source": [ "dims_auto_extract = ['X', 'Y', 'T']\n", - "def extract_dim(ds, d):\n", - " try:\n", - " dim_list = ds.cf.axes[d]\n", - " assert len(dim_list)==1, f'There are too many {d} dimensions in this dataset.'\n", - " dim = dim_list[0]\n", - " except KeyError:\n", - " print(f\"Could not auto-extract {d} dimension name.\")\n", - " print(\"Look at the xarray output above showing the dataset dimensions.\")\n", - " dim = str(input(f\"What is the name of the {d} dimension of this dataset?\"))\n", - " assert dim in ds.dims, \"That is not a valid dimension name for this dataset\"\n", - " print(f\"name of {d} dimension: {dim}\\n\")\n", - " return dim\n", - "\n", "dim_names_dict = {}\n", "for d in dims_auto_extract:\n", - " dim_names_dict[d] = extract_dim(ds, d)\n", + " dim_names_dict[d] = stac_helpers.extract_dim(ds, d)\n", "print(f\"Dimension dictionary: {dim_names_dict}\")" ] }, @@ -471,59 +461,6 @@ "print(xy_bounds)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "0f49d6d7-9e30-4144-909b-fa1238e6c77a", - "metadata": {}, - "outputs": [], - "source": [ - "def get_step(ds, dim_name, time_dim=False, debug=False, step_ix=0, round_dec=None):\n", - " dim_vals = ds[dim_name].values\n", - " diffs = [d2 - d1 for d1, d2 in zip(dim_vals, dim_vals[1:])]\n", - " # option to round number of decimals\n", - " # sometimes there are different steps calculated due to small rounding errors coming out of the diff\n", - " # calculation, rounding these can correct for that\n", - " if round_dec:\n", - " unique_steps = np.unique(np.array(diffs).round(decimals=round_dec), return_counts=True)\n", - " else:\n", - " unique_steps = np.unique(diffs, return_counts=True)\n", - " step_list = unique_steps[0]\n", - " # optional - for invesitgating uneven steps\n", - " if debug:\n", - " print(f'step_list: {step_list}')\n", - " print(f'step_count: {unique_steps[1]}')\n", - " indices = [i for i, x in enumerate(diffs) if x == step_list[step_ix]]\n", - " print(f'index locations of step index {step_ix} in step_list: {indices}')\n", - " # set step - if all steps are the same length\n", - " # datacube spec specifies to use null for irregularly spaced steps\n", - " if len(step_list)==1:\n", - " if time_dim:\n", - " # make sure time deltas are in np timedelta format\n", - " step_list = [np.array([step], dtype=\"timedelta64[ns]\")[0] for step in step_list]\n", - " step = step_list[0].astype(float).item()\n", - " else:\n", - " step = None\n", - " return(step)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a20d12bf-a511-4c5e-84d0-77e2ec551518", - "metadata": {}, - "outputs": [], - "source": [ - "def get_long_name(ds, v):\n", - " # try to get long_name attribute from variable\n", - " try:\n", - " long_name = ds[v].attrs['long_name']\n", - " # otherwise, leave empty\n", - " except:\n", - " long_name = None\n", - " return long_name" - ] - }, { "cell_type": "markdown", "id": "e7dc357c-91ec-49ae-83e5-400f791f9792", @@ -551,6 +488,7 @@ "metadata": {}, "outputs": [], "source": [ + "crs_info = ds.crs\n", "# create the appropriate cartopy projection\n", "lcc = ccrs.LambertConformal(central_longitude=crs_info.longitude_of_central_meridian, \n", " central_latitude=crs_info.latitude_of_projection_origin,\n", @@ -590,7 +528,7 @@ "metadata": {}, "outputs": [], "source": [ - "time_step = pd.Timedelta(get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", + "time_step = pd.Timedelta(stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True)).isoformat()\n", "print(f'time step: {time_step}')" ] }, @@ -602,7 +540,7 @@ "outputs": [], "source": [ "# # debugging for time steps: get all step values and locations\n", - "# time_step = get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" + "# time_step = stac_helpers.get_step(ds, dim_names_dict['T'], time_dim=True, debug=True, step_ix=1)" ] }, { @@ -634,7 +572,7 @@ "metadata": {}, "outputs": [], "source": [ - "x_step = get_step(ds, dim_names_dict['X'])\n", + "x_step = stac_helpers.get_step(ds, dim_names_dict['X'])\n", "print(f'x step: {x_step}')" ] }, @@ -647,7 +585,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# x_dim=dim_names_dict['X']\n", - "# x_step = get_step(ds, x_dim, debug=True, step_ix=1)\n", + "# x_step = stac_helpers.get_step(ds, x_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -680,7 +618,7 @@ "metadata": {}, "outputs": [], "source": [ - "y_step = get_step(ds, dim_names_dict['Y'])\n", + "y_step = stac_helpers.get_step(ds, dim_names_dict['Y'])\n", "print(f'y step: {y_step}')" ] }, @@ -693,7 +631,7 @@ "source": [ "# # debugging for spatial steps: get all step values and locations\n", "# y_dim=dim_names_dict['Y']\n", - "# y_step = get_step(ds, y_dim, debug=True, step_ix=1)\n", + "# y_step = stac_helpers.get_step(ds, y_dim, debug=True, step_ix=1)\n", "# print(f'\\nx dim name (for next cell): {x_dim}')" ] }, @@ -748,16 +686,16 @@ "\n", "# we do not recommend including redundant dimensions (do not include x,y if you have lon,lat)\n", "# note that the extent of each dimension should be pulled from the dataset\n", - "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': time_step}),\n", - " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", - " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", - " 'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top_stag')}),\n", - " 'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'bottom_top')}),\n", - " 'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'soil_layers_stag')}),\n", - " 'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': get_long_name(ds, 'x_stag')}),\n", - " 'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': get_long_name(ds, 'y_stag')}),\n", - " 'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snow_layers_stag')}),\n", - " 'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': get_long_name(ds, 'snso_layers_stag')}),\n", + "dims_dict = {dim_names_dict['T']: pystac.extensions.datacube.Dimension({'type': 'temporal', 'description': stac_helpers.get_long_name(ds, dim_names_dict['T']), 'extent': [temporal_extent_lower.strftime('%Y-%m-%dT%XZ'), temporal_extent_upper.strftime('%Y-%m-%dT%XZ')], 'step': time_step}),\n", + " dim_names_dict['X']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, dim_names_dict['X']), 'extent': [xy_bounds[0], xy_bounds[2]], 'step': x_step, 'reference_system': projjson}),\n", + " dim_names_dict['Y']: pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, dim_names_dict['Y']), 'extent': [xy_bounds[1], xy_bounds[3]], 'step': y_step, 'reference_system': projjson}),\n", + " 'bottom_top_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'bottom_top_stag')}),\n", + " 'bottom_top': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'bottom_top')}),\n", + " 'soil_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'soil_layers_stag')}),\n", + " 'x_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'x', 'description': stac_helpers.get_long_name(ds, 'x_stag')}),\n", + " 'y_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'y', 'description': stac_helpers.get_long_name(ds, 'y_stag')}),\n", + " 'snow_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'snow_layers_stag')}),\n", + " 'snso_layers_stag': pystac.extensions.datacube.Dimension({'type': 'spatial', 'axis': 'z', 'description': stac_helpers.get_long_name(ds, 'snso_layers_stag')}),\n", " }" ] }, @@ -769,43 +707,6 @@ "### Add cube variables (optional field for extension)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "92510876-7853-4d24-8563-c69f9012aeb6", - "metadata": {}, - "outputs": [], - "source": [ - "# define functions to pull out datacube attributes and validate format\n", - "def get_unit(ds, v):\n", - " # check if unit is defined for variable\n", - " try:\n", - " unit = ds[v].attrs['units']\n", - " except:\n", - " unit = None\n", - " # check if unit comes from https://docs.unidata.ucar.edu/udunits/current/#Database\n", - " # datacube extension specifies: The unit of measurement for the data, preferably compliant to UDUNITS-2 units (singular).\n", - " # gdptools expects this format as well\n", - " try:\n", - " cfunits.Units(unit).isvalid\n", - " except:\n", - " print(\"Unit is not valid as a UD unit.\")\n", - " unit = str(input(\"Please enter a valid unit for {v} from here: https://docs.unidata.ucar.edu/udunits/current/#Database\"))\n", - " assert cfunits.Units(unit).isvalid\n", - " return unit\n", - "\n", - "def get_var_type(ds, v):\n", - " if v in ds.coords:\n", - " # type = auxiliary for a variable that contains coordinate data, but isn't a dimension in cube:dimensions.\n", - " # For example, the values of the datacube might be provided in the projected coordinate reference system, \n", - " # but the datacube could have a variable lon with dimensions (y, x), giving the longitude at each point.\n", - " var_type = 'auxiliary'\n", - " # type = data for a variable indicating some measured value, for example \"precipitation\", \"temperature\", etc.\n", - " else:\n", - " var_type = 'data'\n", - " return var_type" - ] - }, { "cell_type": "code", "execution_count": null, @@ -832,9 +733,9 @@ "# create dictionary of dataset variables and associated dimensions\n", "vars_dict={}\n", "for v in vars:\n", - " unit = get_unit(ds, v)\n", - " var_type = get_var_type(ds, v)\n", - " long_name = get_long_name(ds, v)\n", + " unit = stac_helpers.get_unit(ds, v)\n", + " var_type = stac_helpers.get_var_type(ds, v)\n", + " long_name = stac_helpers.get_long_name(ds, v)\n", " vars_dict[v] = pystac.extensions.datacube.Variable({'dimensions':list(ds[v].dims), 'type': var_type, 'description': long_name, 'unit': unit})" ] }, @@ -876,18 +777,7 @@ "source": [ "# # helper to find items of wrong type\n", "# d = item.to_dict()\n", - "# def find_paths(nested_dict, prepath=()):\n", - "# for k, v in nested_dict.items():\n", - "# try:\n", - "# path = prepath + (k,)\n", - "# if type(v) is np.float64: # found value\n", - "# yield path\n", - "# elif hasattr(v, 'items'): # v is a dict\n", - "# yield from find_paths(v, path) \n", - "# except:\n", - "# print(prepath)\n", - "\n", - "# print(*find_paths(d))" + "# print(*stac_helpers.find_paths(d))" ] }, {