Skip to content
Snippets Groups Projects
Commit 48445c86 authored by Snyder, Amelia Marie's avatar Snyder, Amelia Marie
Browse files

fix collection naming and save

parent 07ef3e57
No related branches found
No related tags found
1 merge request!91Aiem
%% Cell type:markdown id:66f9bf84-b0ee-4aae-8877-661b81f59bcc tags:
# AIEM_permafrost Collection Creation
This is a workflow to build a [STAC collection](https://github.com/radiantearth/stac-spec/blob/master/collection-spec/collection-spec.md) that will serve as the parent collection for several Zarr datasets. The Zarr datasets included in this collection are listed in `collection_id_list` below.
%% Cell type:code id:1c2249e2-5d3e-42ba-a584-aca58678232e tags:
``` python
import pystac
from pystac.extensions.datacube import CollectionDatacubeExtension, AssetDatacubeExtension, AdditionalDimension, DatacubeExtension
import xarray as xr
import cf_xarray
import os
import fsspec
import cf_xarray
import hvplot.xarray
import pandas as pd
import json
import numpy as np
import pandas as pd
import pyproj
from pyproj import Transformer
import cartopy.crs as ccrs
import cfunits
import json
import sys
sys.path.insert(1, '../..')
import stac_helpers
```
%% Cell type:code id:0e4491d3-d7a9-4437-9fe8-df2e35dc6ae5 tags:
``` python
# ID of the parent STAC collection that this workflow creates.
overall_collection_id = "AIEM_permafrost"

# IDs of the child STAC collections that belong to the parent collection.
# Each ID must match the name of a zarr dataset, because the zarr store URL
# is built from it later in this workflow.
collection_id_list = [
    "northslope_rcp45_metadata",
    "northslope_rcp85_metadata",
    "selawik_rcp45_metadata",
    "selawik_rcp85_metadata",
    "5m_rcp45_G1.8SW_S0.15m_wMeta",
    "5m_rcp45_G1.8SW_wMeta",
    "5m_rcp85_G1.8SW_wMeta",
    "5m_rcp45_G1.2SW_wMeta",
    "5m_rcp45_G0.6SW_S0.15m_wMeta",
    "5m_rcp45_G1.2SW_S0.15m_wMeta",
    "5m_rcp85_G1.2SW_wMeta",
    "5m_rcp85_G1.8SW_S0.15m_wMeta",
    "5m_rcp85_G0.6SW_S0.15m_wMeta",
    "5m_rcp45_G0.6SW_wMeta",
    "5m_rcp85_G1.2SW_S0.15m_wMeta",
    "5m_rcp85_G0.6SW_wMeta",
]
```
%% Cell type:code id:a98fb989-65bb-49c5-8762-9cccd1cba6bb tags:
``` python
# Filesystem object used to reach the zarr stores: an fsspec 's3' filesystem
# configured with a named credentials profile and a custom endpoint URL.
_s3_options = {
    'profile': 'osn-mdmf-workspace',
    'endpoint_url': 'https://usgs.osn.mghpcc.org/',
}
fs2 = fsspec.filesystem('s3', **_s3_options)
```
%% Cell type:code id:91f8f839-c862-4e15-b67e-00d6bee32d44 tags:
``` python
# Maps the axis keys used in this workflow ('X', 'Y', 'T') to the names of
# the corresponding coordinates in the zarr datasets.
dim_names_dict = dict(X='x', Y='y', T='time')
```
%% Cell type:code id:5ab97501-1d74-422e-96a1-320898269b1a tags:
``` python
# Name of the dataset variable whose attributes hold the CF grid-mapping
# (coordinate reference system) description.
crs_var = "crs"
```
%% Cell type:code id:e772d0e9-20cc-4e2f-923e-866aeb681d00 tags:
``` python
# Build the extent of the parent collection as the union of the extents of
# every child zarr dataset: a lon/lat bounding box plus a time interval.
min_lons = []
min_lats = []
max_lons = []
max_lats = []
min_times = []
max_times = []

for collection_id in collection_id_list:
    zarr_url = f's3://mdmf-workspace/gdp/{collection_id}.zarr/'
    ds = xr.open_dataset(fs2.get_mapper(zarr_url), engine='zarr',
                         backend_kwargs={'consolidated': True}, chunks={})

    # NOTE: description and license are reassigned on every iteration, so the
    # values of the LAST dataset in collection_id_list are the ones used for
    # the parent collection. Uncomment the prints to check that all datasets
    # agree before relying on that.
    collection_description = ds.attrs['title']
    #print(f'possible collection description: {collection_description}')
    collection_license = stac_helpers.license_picker(ds.attrs['license'])
    #print(f'possible collection license: {collection_license}')

    # Reproject every grid node from the dataset's native CRS to lon/lat
    # (EPSG:4326) so the bounding box is expressed in geographic coordinates.
    crs = pyproj.CRS.from_cf(ds[crs_var].attrs)
    XX, YY = np.meshgrid(ds[dim_names_dict['X']].data, ds[dim_names_dict['Y']].data)
    transformer = Transformer.from_crs(crs, "EPSG:4326", always_xy=True)
    lon, lat = transformer.transform(XX.ravel(), YY.ravel())

    # Vectorised reductions: the builtin min()/max() would walk the flattened
    # grid one element at a time in Python.
    min_lons.append(np.min(lon))
    min_lats.append(np.min(lat))
    max_lons.append(np.max(lon))
    max_lats.append(np.max(lat))

    # Convert the time index once per dataset. Only a CFTimeIndex offers
    # to_datetimeindex(); an index that is already a pandas DatetimeIndex is
    # used as-is instead of raising AttributeError.
    time_index = ds.indexes[dim_names_dict['T']]
    if hasattr(time_index, 'to_datetimeindex'):
        time_index = time_index.to_datetimeindex()
    min_times.append(pd.Timestamp(time_index.min()))
    max_times.append(pd.Timestamp(time_index.max()))

# Union of all per-dataset extents; .item() turns NumPy scalars into plain
# Python floats for the bounding box.
spatial_extent = pystac.SpatialExtent(
    bboxes=[[min(min_lons).item(), min(min_lats).item(),
             max(max_lons).item(), max(max_lats).item()]]
)
temporal_extent = pystac.TemporalExtent(intervals=[[min(min_times), max(max_times)]])
collection_extent = pystac.Extent(spatial=spatial_extent, temporal=temporal_extent)
```
%% Output
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
license in dataset attrs: "Freely available"
For USGS data, we can use "CC0-1.0" as the license. For all other data we can use "Unlicense".
Ref: https://spdx.org/licenses/
license automatically chosen: Unlicense
/tmp/ipykernel_959/832730639.py:32: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
min_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().min()))
/tmp/ipykernel_959/832730639.py:33: RuntimeWarning: Converting a CFTimeIndex with dates from a non-standard calendar, 'noleap', to a pandas.DatetimeIndex, which uses dates from the standard calendar. This may lead to subtle errors in operations that depend on the length of time between dates.
max_times.append(pd.Timestamp(ds.indexes[dim_names_dict['T']].to_datetimeindex().max()))
%% Cell type:code id:5eafaa34-e079-4688-826b-383046ccbe2d tags:
``` python
# define folder location where your STAC catalog json file is
catalog_path = os.path.join('..', '..', '..','catalog')
# open catalog
catalog = pystac.Catalog.from_file(os.path.join(catalog_path, 'catalog.json'))
```
%% Cell type:code id:c380222b-b358-4239-9e13-467fbee32b56 tags:
``` python
# Open the parent collection if the catalog already contains it, otherwise
# create it. The lookup uses overall_collection_id (the parent collection),
# not collection_id, which is only the loop variable left over from iterating
# the child datasets.
collection = catalog.get_child(overall_collection_id)
if collection:
    print("existing collection opened")
    # Refresh the metadata of the existing collection with the values
    # computed in this workflow.
    collection.extent = collection_extent
    collection.description = collection_description
    collection.license = collection_license
else:
    collection = pystac.Collection(id=overall_collection_id,
                                   description=collection_description,
                                   extent=collection_extent,
                                   license=collection_license)
    print("new collection created")
```
%% Output
new collection created
%% Cell type:code id:ca6ca927-8b29-4352-85f7-7742ace1610f tags:
``` python
# Write the result to disk as a self-contained catalog.
save_type = pystac.CatalogType.SELF_CONTAINED
already_in_catalog = catalog.get_child(overall_collection_id)

if not already_in_catalog:
    # New collection: attach it to the catalog and save the whole catalog.
    catalog.add_child(collection)
    catalog.normalize_and_save(root_href=catalog_path, catalog_type=save_type)
else:
    # Collection is already a child of the catalog: save only the collection,
    # in its own folder under the catalog path.
    collection_root = os.path.join(catalog_path, overall_collection_id)
    collection.normalize_and_save(root_href=collection_root, catalog_type=save_type)
```
%% Cell type:code id:2c985c59-169b-49ea-9177-02276e802fe2 tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment