Skip to content

Commit 68ea3d3

Browse files
committed
feat: show how hls provided metadata were updated and published to stac
1 parent f6cd153 commit 68ea3d3

File tree

4 files changed

+269
-6
lines changed

4 files changed

+269
-6
lines changed

ingestion-data/production/collections/hls-l30-002-ej-reprocessed.json

+1-3
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,8 @@
3333
},
3434
"license": "MIT",
3535
"stac_extensions": [
36-
"https://stac-extensions.github.io/render/v1.0.0/schema.json",
37-
"https://stac-extensions.github.io/item-assets/v1.0.0/schema.json"
36+
"https://stac-extensions.github.io/render/v1.0.0/schema.json"
3837
],
39-
"item_assets": {},
4038
"dashboard:is_periodic": false,
4139
"dashboard:time_density": "day",
4240
"stac_version": "1.0.0",

ingestion-data/production/collections/hls-s30-002-ej-reprocessed.json

+1-3
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,8 @@
3333
},
3434
"license": "MIT",
3535
"stac_extensions": [
36-
"https://stac-extensions.github.io/render/v1.0.0/schema.json",
37-
"https://stac-extensions.github.io/item-assets/v1.0.0/schema.json"
36+
"https://stac-extensions.github.io/render/v1.0.0/schema.json"
3837
],
39-
"item_assets": {},
4038
"dashboard:is_periodic": false,
4139
"dashboard:time_density": "day",
4240
"stac_version": "1.0.0",
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "969528b3-e2db-462b-a5a0-9f6a469b643c",
6+
"metadata": {},
7+
"source": [
8+
"# Publish reprocessed HLS items using provider-generated metadata in s3"
9+
]
10+
},
11+
{
12+
"cell_type": "markdown",
13+
"id": "7a5dd00e-6577-47ef-833f-d5ac07df78fb",
14+
"metadata": {},
15+
"source": [
16+
"#### Assumptions\n",
17+
"- This notebook follows the [update-hrefs.ipynb](https://github.com/NASA-IMPACT/veda-data/blob/main/transformation-scripts/update-hrefs.ipynb) notebook which updates the provider metadata to use the s3 hrefs for the objects in veda-data-store\n",
18+
"- Assumption: the collection metadata in ingestion-data/production/collections is stac version 1.0.0 and has already been published to the target STAC catalog\n",
19+
"\n",
20+
"#### Update the stac version and store objects in s3\n",
21+
"- Search for all reprocessed item metadata in `s3://veda-data-store/<collection_id>`\n",
22+
"- Update json to stac version 1.0.0, validate, and post back to s3\n",
23+
"- Use target VEDA instance's ingest-api/ingestions endpoint to verify hrefs and publish item metadata to STAC"
24+
]
25+
},
26+
{
27+
"cell_type": "code",
28+
"execution_count": null,
29+
"id": "1a9f8bbd-1b62-404f-8c88-416cfe538575",
30+
"metadata": {},
31+
"outputs": [],
32+
"source": [
33+
"import boto3\n",
34+
"import json\n",
35+
"import requests\n",
36+
"from pystac import Item\n",
37+
"\n",
38+
"# Test\n",
39+
"# TARGET_STAC_API_URL = \"https://test.openveda.cloud/api/stac\"\n",
40+
"# TARGET_INGEST_API_URL = \"https://test.openveda.cloud/api/ingest\"\n",
41+
"\n",
42+
"# Prod\n",
43+
"TARGET_STAC_API_URL = \"https://openveda.cloud/api/stac\"\n",
44+
"TARGET_INGEST_API_URL = \"https://openveda.cloud/api/ingest\"\n",
45+
"\n",
46+
"TOKEN = \"SECRET\"\n",
47+
"authorization_header = f\"Bearer {TOKEN}\"\n",
48+
"headers = {\n",
49+
" \"Authorization\": authorization_header,\n",
50+
" \"content-type\": \"application/json\",\n",
51+
" \"accept\": \"application/json\",\n",
52+
"}\n",
53+
"authme_url = f\"{TARGET_INGEST_API_URL}/auth/me\"\n",
54+
"response = requests.get(authme_url, headers=headers)\n",
55+
"response.reason"
56+
]
57+
},
58+
{
59+
"cell_type": "code",
60+
"execution_count": null,
61+
"id": "0cdc5478-31bf-4c5f-a6c2-86141c6228bb",
62+
"metadata": {},
63+
"outputs": [],
64+
"source": [
65+
"AWS_ACCESS_KEY_ID = \"[CHANGE ME]\"\n",
66+
"AWS_SECRET_ACCESS_KEY = \"[CHANGE ME]\"\n",
67+
"AWS_SESSION_TOKEN = \"[CHANGE ME]\""
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": null,
73+
"id": "7a813b27-ba2b-463d-ac2b-8e2a083768f8",
74+
"metadata": {},
75+
"outputs": [],
76+
"source": [
77+
"s3_client = boto3.client(\n",
78+
" \"s3\",\n",
79+
" aws_access_key_id=AWS_ACCESS_KEY_ID,\n",
80+
" aws_secret_access_key=AWS_SECRET_ACCESS_KEY,\n",
81+
" aws_session_token=AWS_SESSION_TOKEN,\n",
82+
")"
83+
]
84+
},
85+
{
86+
"cell_type": "markdown",
87+
"id": "0e32dff3-0de2-448e-9c56-fbd12985fd02",
88+
"metadata": {},
89+
"source": [
90+
"## Update the json in s3 to stac version 1.0.0\n",
91+
"\n",
92+
"These provided metadata are `stac_version` `1.0.0-beta.2` but we can make a minor modification to how the `stac_extensions` are provided to get them up to the same stac version `1.0.0` used for the rest of the collections in our STAC catalog(s). \n",
93+
"\n",
94+
"\n",
95+
"> **WARNING** this cell replaces an existing file in s3 instead of creating a new version; we are using it for a one-time cleanup of a small known collection of invalid metadata that need to be corrected. "
96+
]
97+
},
98+
{
99+
"cell_type": "code",
100+
"execution_count": null,
101+
"id": "58288478-e37e-4de8-bc37-19cbd8d044af",
102+
"metadata": {},
103+
"outputs": [],
104+
"source": [
105+
"bucket_name = \"veda-data-store\"\n",
106+
"collection_ids = [\"hlsl30-002-ej-reprocessed\", \"hlss30-002-ej-reprocessed\"]\n",
107+
"dry_run = True\n",
108+
"verbose = True\n",
109+
"\n",
110+
"for collection_id in collection_ids:\n",
111+
" s3_prefix = f\"{collection_id}/\"\n",
112+
" \n",
113+
" response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=s3_prefix)\n",
114+
"\n",
115+
" # Filter for the STAC metadata files\n",
116+
" json_keys = [\n",
117+
" obj[\"Key\"] for obj in response[\"Contents\"] if obj[\"Key\"].endswith(\"stac-ej-reprocessed.json\")\n",
118+
" ]\n",
119+
" print(f\"\\n{collection_id=} matched metadata for {len(json_keys)} items\")\n",
120+
" \n",
121+
" for key in json_keys:\n",
122+
"\n",
123+
" # Backup the original version of this metadata\n",
124+
" # deprecated_key = key + \".deprecated\"\n",
125+
" \n",
126+
" # if not dry_run:\n",
127+
" # s3_client.copy_object(\n",
128+
" # CopySource={'Bucket': bucket_name, 'Key': key},\n",
129+
" # Bucket=bucket_name, \n",
130+
" # Key=deprecated_key, \n",
131+
" # )\n",
132+
" # if verbose:\n",
133+
" # print(f\"Copied {key} to {deprecated_key}\")\n",
134+
" \n",
135+
" # Get object to update the metadata\n",
136+
" response = s3_client.get_object(Bucket=bucket_name, Key=key)\n",
137+
" \n",
138+
" item_dict = json.loads(response[\"Body\"].read().decode(\"utf-8\"))\n",
139+
" \n",
140+
" # Add correct collection link\n",
141+
" links = [link for link in item_dict[\"links\"] if link[\"rel\"] != \"collection\"]\n",
142+
" links.append({\n",
143+
" \"rel\": \"collection\",\n",
144+
" \"href\": collection_id,\n",
145+
" \"type\": \"application/json\"\n",
146+
" })\n",
147+
" item_dict[\"links\"] = links\n",
148+
"\n",
149+
" # Update the stac version for these items from \"stac_version\": \"1.0.0-beta.2\" and touch up metadata to meet 1.0.0 spec\n",
150+
" item_dict[\"stac_version\"] = \"1.0.0\"\n",
151+
"\n",
152+
" # Add full extension hrefs https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#stac_extensions\n",
153+
" item_extensions = item_dict[\"stac_extensions\"]\n",
154+
" stac_extensions = []\n",
155+
" for ext in item_extensions:\n",
156+
" if \"https://stac-extensions.github.io\" not in ext:\n",
157+
" stac_extensions.append(f\"https://stac-extensions.github.io/{ext}/v1.0.0/schema.json\")\n",
158+
" else:\n",
159+
" stac_extensions.append(ext)\n",
160+
" item_dict[\"stac_extensions\"] = stac_extensions\n",
161+
"\n",
162+
" # Make sure the asset hrefs are pointed at the correct collection's prefix\n",
163+
" item_assets = item_dict[\"assets\"]\n",
164+
" # Previous location did not have data version number\n",
165+
" old_prefix = collection_id.replace(\"-002-\", \"-\")\n",
166+
" for asset_key in item_assets.keys():\n",
167+
" new_href = item_assets[asset_key][\"href\"].replace(old_prefix, collection_id)\n",
168+
" item_assets[asset_key][\"href\"] = new_href\n",
169+
" \n",
170+
" # Validate the updated item\n",
171+
" item = Item.from_dict(item_dict)\n",
172+
" try:\n",
173+
" item.validate()\n",
174+
" except Exception as e:\n",
175+
" print(f\"Invalid {collection_id=} {item.id=}\")\n",
176+
" \n",
177+
" # Replace the s3 object with the updated metadata for stac version 1.0.0\n",
178+
" if not dry_run:\n",
179+
" s3_client.put_object(Bucket=bucket_name, Key=key, Body=json.dumps(item_dict))\n",
180+
" if verbose:\n",
181+
" print(f\"Updated {key}\")\n",
182+
" "
183+
]
184+
},
185+
{
186+
"cell_type": "markdown",
187+
"id": "0e8ea1ae-1a0b-4656-abf8-dc3f8085a3b5",
188+
"metadata": {},
189+
"source": [
190+
"## Publish item records to STAC"
191+
]
192+
},
193+
{
194+
"cell_type": "code",
195+
"execution_count": null,
196+
"id": "7aea08e4-fb73-48d5-be7c-c3b147f52f63",
197+
"metadata": {},
198+
"outputs": [],
199+
"source": [
200+
"collection_ids = [\"hlsl30-002-ej-reprocessed\", \"hlss30-002-ej-reprocessed\"]\n",
201+
"dry_run = True\n",
202+
"verbose = True\n",
203+
"\n",
204+
"for collection_id in collection_ids:\n",
205+
" s3_prefix = f\"{collection_id}/\"\n",
206+
" \n",
207+
" response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=s3_prefix)\n",
208+
"\n",
209+
" # Filter for the STAC metadata files\n",
210+
" json_keys = [\n",
211+
" obj[\"Key\"] for obj in response[\"Contents\"] if obj[\"Key\"].endswith(\"stac-ej-reprocessed.json\")\n",
212+
" ]\n",
213+
" print(f\"\\n{collection_id=} matched metadata for {len(json_keys)} items\")\n",
214+
" \n",
215+
" for key in json_keys:\n",
216+
"\n",
217+
" response = s3_client.get_object(Bucket=bucket_name, Key=key)\n",
218+
" \n",
219+
" item_dict = json.loads(response[\"Body\"].read().decode(\"utf-8\"))\n",
220+
" \n",
221+
" # Validate the item\n",
222+
" item = Item.from_dict(item_dict)\n",
223+
" try:\n",
224+
" item.validate()\n",
225+
" except Exception as e:\n",
226+
" print(f\"invalid {collection_id=} {item.id=}\")\n",
227+
" \n",
228+
" # Publish to target STAC catalog\n",
229+
" publish_url = f\"{TARGET_INGEST_API_URL}/ingestions\"\n",
230+
" if not dry_run:\n",
231+
" publish_response = requests.post(\n",
232+
" publish_url, \n",
233+
" headers=headers,\n",
234+
" json=item_dict\n",
235+
" )\n",
236+
" if verbose:\n",
237+
" print(f\"POST {publish_url} {collection_id=}\\n{item_dict['id']=} {publish_response.reason=}\")\n",
238+
" if not publish_response.reason == 'Created':\n",
239+
" print(f\"POST {publish_url} {collection_id=}\\n{item_dict['id']=} {publish_response.reason=}\")\n",
240+
" else:\n",
241+
" if verbose:\n",
242+
" print(f\"POST {publish_url} {collection_id=}\\n{item_dict['id']=} {dry_run=}\")\n"
243+
]
244+
}
245+
],
246+
"metadata": {
247+
"kernelspec": {
248+
"display_name": "Python 3 (ipykernel)",
249+
"language": "python",
250+
"name": "python3"
251+
},
252+
"language_info": {
253+
"codemirror_mode": {
254+
"name": "ipython",
255+
"version": 3
256+
},
257+
"file_extension": ".py",
258+
"mimetype": "text/x-python",
259+
"name": "python",
260+
"nbconvert_exporter": "python",
261+
"pygments_lexer": "ipython3",
262+
"version": "3.11.8"
263+
}
264+
},
265+
"nbformat": 4,
266+
"nbformat_minor": 5
267+
}

0 commit comments

Comments
 (0)