Skip to content

Commit cbcf9e1

Browse files
botanicalJennifer Tran
and
Jennifer Tran
authored
Create notebook to update collection hrefs (#112)
* Create notebook to update collection hrefs * Simplify notebook to get json_keys only * Move put object call to outside of loop * Remove unnecessary comment * Add output from cells that were run --------- Co-authored-by: Jennifer Tran <[email protected]>
1 parent 2e1c7bd commit cbcf9e1

File tree

1 file changed

+206
-0
lines changed

1 file changed

+206
-0
lines changed
+206
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,206 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Notebook to update hrefs in particular collections"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 106,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
import json
import os

import boto3
18+
]
19+
},
20+
{
21+
"cell_type": "code",
22+
"execution_count": 107,
23+
"metadata": {},
24+
"outputs": [],
25+
"source": [
# Credentials are read from the environment so that secrets are never typed
# into (and accidentally committed with) the notebook. Export
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_SESSION_TOKEN before starting
# Jupyter. Any variable that is not set is None; when all three are None,
# boto3 falls back to its default credential chain (shared credentials file,
# SSO, instance role, ...).
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
AWS_SESSION_TOKEN = os.environ.get("AWS_SESSION_TOKEN")
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": 108,
34+
"metadata": {},
35+
"outputs": [],
36+
"source": [
37+
"s3_client = boto3.client(\n",
38+
" \"s3\",\n",
39+
" aws_access_key_id=AWS_ACCESS_KEY_ID,\n",
40+
" aws_secret_access_key=AWS_SECRET_ACCESS_KEY,\n",
41+
" aws_session_token=AWS_SESSION_TOKEN,\n",
42+
")"
43+
]
44+
},
45+
{
46+
"cell_type": "markdown",
47+
"metadata": {},
48+
"source": [
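49+
"The `update_json_href` function takes a `bucket_name`, `collection_name`, `old_href_substring` and `new_href_substring`. For every `.json` file under the `collection_name/` prefix of the bucket, it replaces `old_href_substring` with `new_href_substring` in each asset's `href` and writes the file back to S3."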
50+
]
51+
},
52+
{
53+
"cell_type": "code",
54+
"execution_count": 109,
55+
"metadata": {},
56+
"outputs": [],
57+
"source": [
def update_json_href(
    bucket_name,
    collection_name,
    old_href_substring,
    new_href_substring,
    dry_run=False,
):
    """Replace a substring in the asset hrefs of every JSON file in a collection.

    Lists all objects under ``<collection_name>/`` in ``bucket_name`` and, for
    each key ending in ``.json``, replaces ``old_href_substring`` with
    ``new_href_substring`` in ``data["assets"][*]["href"]``. Only files whose
    hrefs actually change are written back (to the same key); unchanged files
    are left untouched, so re-running the function is safe.

    Keyword arguments:
    bucket_name -- the s3 bucket name
    collection_name -- the collection name; objects are looked up under the
        ``<collection_name>/`` prefix
    old_href_substring -- the string to replace in href (must not be empty)
    new_href_substring -- the new href substring
    dry_run -- when True, only print which keys would change and upload
        nothing (default False)

    Raises:
    ValueError -- if old_href_substring is empty
    """
    if not old_href_substring:
        # str.replace("", x) inserts x between every character, which would
        # corrupt every href in the collection.
        raise ValueError("old_href_substring must be a non-empty string")

    s3 = s3_client  # module-level client created in an earlier cell
    s3_prefix = f"{collection_name.rstrip('/')}/"

    # A single list_objects_v2 call returns at most 1000 keys, so walk every
    # page. Pages for an empty prefix carry no "Contents" entry at all.
    paginator = s3.get_paginator("list_objects_v2")
    json_keys = [
        item["Key"]
        for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix)
        for item in page.get("Contents", [])
        if item["Key"].endswith(".json")
    ]

    for key in json_keys:
        response = s3.get_object(Bucket=bucket_name, Key=key)
        data = json.loads(response["Body"].read().decode("utf-8"))

        # Not every JSON document under the prefix necessarily has assets.
        assets = data.get("assets") if isinstance(data, dict) else None
        if not isinstance(assets, dict):
            continue

        changed = False
        for asset in assets.values():
            href = asset.get("href") if isinstance(asset, dict) else None
            if not isinstance(href, str):
                continue
            new_href = href.replace(old_href_substring, new_href_substring)
            if new_href != href:
                asset["href"] = new_href
                changed = True

        if not changed:
            continue

        if dry_run:
            print(f"Would update {key}")
            continue

        # Upload the updated JSON back to the same key. Without an explicit
        # ContentType, S3 stores the object as binary/octet-stream, so carry
        # over the content type of the object that was just read.
        s3.put_object(
            Bucket=bucket_name,
            Key=key,
            Body=json.dumps(data),
            ContentType=response.get("ContentType", "application/json"),
        )
        print(f"Updated {key}")
97+
]
98+
},
99+
{
100+
"cell_type": "markdown",
101+
"metadata": {},
102+
"source": [
103+
"The next two cells call `update_json_href` to update the hlsl30-ej-reprocessed and hlss30-ej-reprocessed collections in the veda-data-store bucket. Specifically, they replace every occurrence of \"covid-eo-data\" in the asset hrefs with \"veda-data-store\"."
104+
]
105+
},
106+
{
107+
"cell_type": "code",
108+
"execution_count": 110,
109+
"metadata": {},
110+
"outputs": [
111+
{
112+
"name": "stdout",
113+
"output_type": "stream",
114+
"text": [
115+
"Updated hlsl30-ej-reprocessed/2017/19QHA/HLS.L30.T19QHA.2017157T144341.v2.0/HLS.L30.T19QHA.2017157T144341.v2.0_stac-ej-reprocessed.json\n",
116+
"Updated hlsl30-ej-reprocessed/2017/19QHA/HLS.L30.T19QHA.2017173T144347.v2.0/HLS.L30.T19QHA.2017173T144347.v2.0_stac-ej-reprocessed.json\n",
117+
"Updated hlsl30-ej-reprocessed/2017/19QHA/HLS.L30.T19QHA.2017205T144356.v2.0/HLS.L30.T19QHA.2017205T144356.v2.0_stac-ej-reprocessed.json\n",
118+
"Updated hlsl30-ej-reprocessed/2017/19QHA/HLS.L30.T19QHA.2017221T144403.v2.0/HLS.L30.T19QHA.2017221T144403.v2.0_stac-ej-reprocessed.json\n",
119+
"Updated hlsl30-ej-reprocessed/2017/19QHA/HLS.L30.T19QHA.2017237T144407.v2.0/HLS.L30.T19QHA.2017237T144407.v2.0_stac-ej-reprocessed.json\n",
120+
"Updated hlsl30-ej-reprocessed/2017/19QHA/HLS.L30.T19QHA.2017269T144414.v2.0/HLS.L30.T19QHA.2017269T144414.v2.0_stac-ej-reprocessed.json\n",
121+
"Updated hlsl30-ej-reprocessed/2017/19QHA/HLS.L30.T19QHA.2017285T144419.v2.0/HLS.L30.T19QHA.2017285T144419.v2.0_stac-ej-reprocessed.json\n",
122+
"Updated hlsl30-ej-reprocessed/2017/19QHA/HLS.L30.T19QHA.2017301T144420.v2.0/HLS.L30.T19QHA.2017301T144420.v2.0_stac-ej-reprocessed.json\n",
123+
"Updated hlsl30-ej-reprocessed/2017/19QHA/HLS.L30.T19QHA.2017317T144417.v2.0/HLS.L30.T19QHA.2017317T144417.v2.0_stac-ej-reprocessed.json\n",
124+
"Updated hlsl30-ej-reprocessed/2017/19QHA/HLS.L30.T19QHA.2017333T144411.v2.0/HLS.L30.T19QHA.2017333T144411.v2.0_stac-ej-reprocessed.json\n",
125+
"Updated hlsl30-ej-reprocessed/2021/15RYP/HLS.L30.T15RYP.2021182T163159.v2.0/HLS.L30.T15RYP.2021182T163159.v2.0_stac-ej-reprocessed.json\n",
126+
"Updated hlsl30-ej-reprocessed/2021/15RYP/HLS.L30.T15RYP.2021198T163201.v2.0/HLS.L30.T15RYP.2021198T163201.v2.0_stac-ej-reprocessed.json\n",
127+
"Updated hlsl30-ej-reprocessed/2021/15RYP/HLS.L30.T15RYP.2021230T163215.v2.0/HLS.L30.T15RYP.2021230T163215.v2.0_stac-ej-reprocessed.json\n",
128+
"Updated hlsl30-ej-reprocessed/2021/15RYP/HLS.L30.T15RYP.2021246T163220.v2.0/HLS.L30.T15RYP.2021246T163220.v2.0_stac-ej-reprocessed.json\n",
129+
"Updated hlsl30-ej-reprocessed/2021/15RYP/HLS.L30.T15RYP.2021262T163223.v2.0/HLS.L30.T15RYP.2021262T163223.v2.0_stac-ej-reprocessed.json\n",
130+
"Updated hlsl30-ej-reprocessed/2021/15RYP/HLS.L30.T15RYP.2021278T163229.v2.0/HLS.L30.T15RYP.2021278T163229.v2.0_stac-ej-reprocessed.json\n",
131+
"Updated hlsl30-ej-reprocessed/2021/15RYP/HLS.L30.T15RYP.2021294T163232.v2.0/HLS.L30.T15RYP.2021294T163232.v2.0_stac-ej-reprocessed.json\n"
132+
]
133+
}
134+
],
135+
"source": [
# Point the HLS L30 collection's asset hrefs at the veda-data-store bucket.
update_json_href(
    bucket_name="veda-data-store",
    collection_name="hlsl30-ej-reprocessed",
    old_href_substring="covid-eo-data",
    new_href_substring="veda-data-store",
)
139+
]
140+
},
141+
{
142+
"cell_type": "code",
143+
"execution_count": 111,
144+
"metadata": {},
145+
"outputs": [
146+
{
147+
"name": "stdout",
148+
"output_type": "stream",
149+
"text": [
150+
"Updated hlss30-ej-reprocessed/2017/19QHA/HLS.S30.T19QHA.2017193T150719.v2.0/HLS.S30.T19QHA.2017193T150719.v2.0_stac-ej-reprocessed.json\n",
151+
"Updated hlss30-ej-reprocessed/2017/19QHA/HLS.S30.T19QHA.2017218T150721.v2.0/HLS.S30.T19QHA.2017218T150721.v2.0_stac-ej-reprocessed.json\n",
152+
"Updated hlss30-ej-reprocessed/2017/19QHA/HLS.S30.T19QHA.2017233T150719.v2.0/HLS.S30.T19QHA.2017233T150719.v2.0_stac-ej-reprocessed.json\n",
153+
"Updated hlss30-ej-reprocessed/2017/19QHA/HLS.S30.T19QHA.2017278T150721.v2.0/HLS.S30.T19QHA.2017278T150721.v2.0_stac-ej-reprocessed.json\n",
154+
"Updated hlss30-ej-reprocessed/2017/19QHA/HLS.S30.T19QHA.2017293T150709.v2.0/HLS.S30.T19QHA.2017293T150709.v2.0_stac-ej-reprocessed.json\n",
155+
"Updated hlss30-ej-reprocessed/2017/19QHA/HLS.S30.T19QHA.2017318T150721.v2.0/HLS.S30.T19QHA.2017318T150721.v2.0_stac-ej-reprocessed.json\n",
156+
"Updated hlss30-ej-reprocessed/2017/19QHA/HLS.S30.T19QHA.2017323T150709.v2.0/HLS.S30.T19QHA.2017323T150709.v2.0_stac-ej-reprocessed.json\n",
157+
"Updated hlss30-ej-reprocessed/2017/19QHA/HLS.S30.T19QHA.2017333T150709.v2.0/HLS.S30.T19QHA.2017333T150709.v2.0_stac-ej-reprocessed.json\n",
158+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021185T163839.v2.0/HLS.S30.T15RYP.2021185T163839.v2.0_stac-ej-reprocessed.json\n",
159+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021195T163839.v2.0/HLS.S30.T15RYP.2021195T163839.v2.0_stac-ej-reprocessed.json\n",
160+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021200T163901.v2.0/HLS.S30.T15RYP.2021200T163901.v2.0_stac-ej-reprocessed.json\n",
161+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021205T163839.v2.0/HLS.S30.T15RYP.2021205T163839.v2.0_stac-ej-reprocessed.json\n",
162+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021210T163901.v2.0/HLS.S30.T15RYP.2021210T163901.v2.0_stac-ej-reprocessed.json\n",
163+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021220T163901.v2.0/HLS.S30.T15RYP.2021220T163901.v2.0_stac-ej-reprocessed.json\n",
164+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021225T163839.v2.0/HLS.S30.T15RYP.2021225T163839.v2.0_stac-ej-reprocessed.json\n",
165+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021235T163839.v2.0/HLS.S30.T15RYP.2021235T163839.v2.0_stac-ej-reprocessed.json\n",
166+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021245T163839.v2.0/HLS.S30.T15RYP.2021245T163839.v2.0_stac-ej-reprocessed.json\n",
167+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021250T163901.v2.0/HLS.S30.T15RYP.2021250T163901.v2.0_stac-ej-reprocessed.json\n",
168+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021265T163909.v2.0/HLS.S30.T15RYP.2021265T163909.v2.0_stac-ej-reprocessed.json\n",
169+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021270T164051.v2.0/HLS.S30.T15RYP.2021270T164051.v2.0_stac-ej-reprocessed.json\n",
170+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021280T164201.v2.0/HLS.S30.T15RYP.2021280T164201.v2.0_stac-ej-reprocessed.json\n",
171+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021285T164139.v2.0/HLS.S30.T15RYP.2021285T164139.v2.0_stac-ej-reprocessed.json\n",
172+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021290T164311.v2.0/HLS.S30.T15RYP.2021290T164311.v2.0_stac-ej-reprocessed.json\n",
173+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021295T164339.v2.0/HLS.S30.T15RYP.2021295T164339.v2.0_stac-ej-reprocessed.json\n",
174+
"Updated hlss30-ej-reprocessed/2021/15RYP/HLS.S30.T15RYP.2021300T164411.v2.0/HLS.S30.T15RYP.2021300T164411.v2.0_stac-ej-reprocessed.json\n"
175+
]
176+
}
177+
],
178+
"source": [
# Point the HLS S30 collection's asset hrefs at the veda-data-store bucket.
update_json_href(
    bucket_name="veda-data-store",
    collection_name="hlss30-ej-reprocessed",
    old_href_substring="covid-eo-data",
    new_href_substring="veda-data-store",
)
182+
]
183+
}
184+
],
185+
"metadata": {
186+
"kernelspec": {
187+
"display_name": "venv",
188+
"language": "python",
189+
"name": "python3"
190+
},
191+
"language_info": {
192+
"codemirror_mode": {
193+
"name": "ipython",
194+
"version": 3
195+
},
196+
"file_extension": ".py",
197+
"mimetype": "text/x-python",
198+
"name": "python",
199+
"nbconvert_exporter": "python",
200+
"pygments_lexer": "ipython3",
201+
"version": "3.11.7"
202+
}
203+
},
204+
"nbformat": 4,
205+
"nbformat_minor": 2
206+
}

0 commit comments

Comments
 (0)