-
Notifications
You must be signed in to change notification settings - Fork 94
252 lines (220 loc) · 9.07 KB
/
upload-to-s3.yml
File metadata and controls
252 lines (220 loc) · 9.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
name: Upload to S3

# Manual trigger only. Every input except dry_run is mandatory.
on:
  workflow_dispatch:
    inputs:
      # Where the file comes from.
      release_tag:
        description: "Tag of the draft release holding the file to upload"
        type: string
        required: true

      # Where the file goes and what it must hash to.
      s3_destination:
        description: "S3 path within the stdpopsim bucket (e.g. annotations/HomSap/file.tar.gz)"
        type: string
        required: true
      expected_sha256:
        description: "Expected SHA256 checksum of the file"
        type: string
        required: true

      # Catalog coordinates of the resource being uploaded.
      species_id:
        description: "Species ID (e.g. HomSap)"
        type: string
        required: true
      resource_type:
        description: "Resource type: genetic_map or annotation"
        type: string
        required: true
      resource_id:
        description: "Resource ID as defined in the catalog (e.g. HapMapII_GRCh38)"
        type: string
        required: true

      # Validation-only switch; defaults to performing the real upload.
      dry_run:
        description: "If true, run all validation but skip the actual S3 upload"
        type: boolean
        required: false
        default: false
# Every run joins the same "s3-upload" group, and a run that is already in
# progress is never cancelled in favour of a newer one.
concurrency:
  group: s3-upload
  cancel-in-progress: false
jobs:
  # Validates the request against the stdpopsim catalog, downloads the single
  # asset attached to the draft release, verifies its checksum, uploads it to
  # the stdpopsim bucket (unless dry_run is set) and finally removes the draft
  # release.
  #
  # Workflow inputs are always handed to scripts through `env:` and read as
  # ordinary variables. They are never expanded directly inside a `run:` body,
  # so quotes, backticks or command substitutions in an input cannot change
  # the script that is executed.
  upload:
    runs-on: ubuntu-latest
    environment: s3-upload
    permissions:
      contents: write  # needed to download release assets and delete releases
    steps:
      # Cheap syntactic checks before anything is installed or downloaded.
      - name: Validate inputs
        env:
          S3_DEST: ${{ inputs.s3_destination }}
          RESOURCE_TYPE: ${{ inputs.resource_type }}
          SPECIES_ID: ${{ inputs.species_id }}
          RESOURCE_ID: ${{ inputs.resource_id }}
        run: |
          # Resource type must be genetic_map or annotation
          if [[ "$RESOURCE_TYPE" != "genetic_map" && "$RESOURCE_TYPE" != "annotation" ]]; then
            echo "ERROR: resource_type must be 'genetic_map' or 'annotation', got: $RESOURCE_TYPE"
            exit 1
          fi
          # S3 destination must start with genetic_maps/ or annotations/
          if [[ ! "$S3_DEST" =~ ^(genetic_maps|annotations)/ ]]; then
            echo "ERROR: S3 destination must start with 'genetic_maps/' or 'annotations/'"
            exit 1
          fi
          # S3 destination must end with .tar.gz or .tgz
          if [[ ! "$S3_DEST" =~ \.(tar\.gz|tgz)$ ]]; then
            echo "ERROR: S3 destination must end with .tar.gz or .tgz"
            exit 1
          fi
          echo "Input validation passed."
          echo " Species: $SPECIES_ID"
          echo " Type: $RESOURCE_TYPE"
          echo " Resource: $RESOURCE_ID"
          echo " S3 dest: $S3_DEST"

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Later steps pass GH_TOKEN explicitly, so the write-scoped token
          # does not need to stay in .git/config while repository code is
          # installed and executed.
          persist-credentials: false

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install stdpopsim
        run: |
          pip install -e .

      # Cross-checks the request against the catalog shipped in the checked-out
      # source: the species and resource must exist, and both the destination
      # and the checksum must equal what the catalog entry records.
      - name: Validate against catalog
        env:
          SPECIES_ID: ${{ inputs.species_id }}
          RESOURCE_TYPE: ${{ inputs.resource_type }}
          RESOURCE_ID: ${{ inputs.resource_id }}
          S3_DEST: ${{ inputs.s3_destination }}
          EXPECTED_SHA256: ${{ inputs.expected_sha256 }}
        run: |
          python3 << 'PYEOF'
          import os
          import re
          import sys

          import stdpopsim

          # Read the inputs from the environment so their content is never
          # parsed as Python source.
          species_id = os.environ["SPECIES_ID"]
          resource_type = os.environ["RESOURCE_TYPE"]
          resource_id = os.environ["RESOURCE_ID"]
          s3_dest = os.environ["S3_DEST"]
          expected_sha256 = os.environ["EXPECTED_SHA256"]

          # Verify species exists
          try:
              species = stdpopsim.get_species(species_id)
          except (ValueError, KeyError):
              available = [s.id for s in stdpopsim.all_species()]
              print(f"ERROR: Species '{species_id}' not found in catalog.")
              print(f"Available species: {available}")
              sys.exit(1)

          # Verify resource exists and extract expected URL/SHA256
          if resource_type == "genetic_map":
              resources = {gm.id: gm for gm in species.genetic_maps}
              if resource_id not in resources:
                  print(f"ERROR: Genetic map '{resource_id}' not found for {species_id}.")
                  print(f"Available genetic maps: {list(resources.keys())}")
                  sys.exit(1)
              resource = resources[resource_id]
              catalog_url = resource.url
              catalog_sha256 = resource.sha256
          elif resource_type == "annotation":
              resources = {a.id: a for a in species.annotations}
              if resource_id not in resources:
                  print(f"ERROR: Annotation '{resource_id}' not found for {species_id}.")
                  print(f"Available annotations: {list(resources.keys())}")
                  sys.exit(1)
              resource = resources[resource_id]
              catalog_url = resource.intervals_url
              catalog_sha256 = resource.intervals_sha256
          else:
              # Fail with a clear message instead of a NameError further down
              # if this step is ever reached with an unexpected type.
              print(f"ERROR: Unsupported resource_type: {resource_type!r}")
              sys.exit(1)

          # Verify the S3 destination matches what the catalog expects
          m = re.match(r"https://stdpopsim\.s3[.-]us-west-2\.amazonaws\.com/(.*)", catalog_url)
          if not m:
              print(f"ERROR: Could not parse S3 URL from catalog: {catalog_url}")
              sys.exit(1)
          catalog_s3_dest = m.group(1)
          if catalog_s3_dest != s3_dest:
              print("ERROR: S3 destination mismatch!")
              print(f" Provided: {s3_dest}")
              print(f" Catalog expects: {catalog_s3_dest}")
              sys.exit(1)

          # Verify SHA256 matches what the catalog expects
          if catalog_sha256 != expected_sha256:
              print("ERROR: SHA256 mismatch with catalog!")
              print(f" Provided: {expected_sha256}")
              print(f" Catalog expects: {catalog_sha256}")
              sys.exit(1)

          print("Catalog validation passed!")
          print(f" Species: {species_id} ({species.name})")
          print(f" Type: {resource_type}")
          print(f" Resource: {resource_id}")
          print(f" URL: {catalog_url}")
          print(f" SHA256: {catalog_sha256}")
          PYEOF

      # Fetches every asset of the release into ./download and exports the path
      # of the single expected file as DOWNLOADED_FILE for the following steps.
      - name: Download asset from draft release
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          RELEASE_TAG: ${{ inputs.release_tag }}
        run: |
          echo "Downloading asset from release: $RELEASE_TAG"
          gh release download "$RELEASE_TAG" \
            --repo "$GITHUB_REPOSITORY" \
            --dir ./download
          # Expect exactly one file
          FILE_COUNT=$(find ./download -type f | wc -l)
          if [ "$FILE_COUNT" -ne 1 ]; then
            echo "ERROR: Expected exactly 1 asset, found $FILE_COUNT"
            exit 1
          fi
          DOWNLOADED_FILE=$(find ./download -type f)
          echo "Downloaded: $DOWNLOADED_FILE"
          echo "DOWNLOADED_FILE=$DOWNLOADED_FILE" >> "$GITHUB_ENV"

      - name: Verify SHA256
        env:
          EXPECTED_SHA256: ${{ inputs.expected_sha256 }}
        run: |
          ACTUAL_SHA256=$(sha256sum "$DOWNLOADED_FILE" | awk '{print $1}')
          echo "Expected SHA256: $EXPECTED_SHA256"
          echo "Actual SHA256: $ACTUAL_SHA256"
          if [ "$ACTUAL_SHA256" != "$EXPECTED_SHA256" ]; then
            echo "ERROR: SHA256 mismatch!"
            exit 1
          fi
          echo "SHA256 verified."

      # Refuses to overwrite an existing object. head-object fails for every
      # kind of error, so the error text is inspected rather than discarded:
      # only a 404 is taken as proof that the key is absent.
      - name: Check if S3 object already exists
        if: ${{ inputs.dry_run != true }}
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: us-west-2
          S3_DEST: ${{ inputs.s3_destination }}
        run: |
          if HEAD_OUTPUT=$(aws s3api head-object --bucket stdpopsim --key "$S3_DEST" 2>&1); then
            echo "ERROR: s3://stdpopsim/$S3_DEST already exists. Refusing to overwrite."
            echo "If you need to replace this file, delete it from S3 first."
            exit 1
          fi
          case "$HEAD_OUTPUT" in
            *"(404)"*)
              echo "Confirmed: s3://stdpopsim/$S3_DEST does not exist yet."
              ;;
            *"(403)"*)
              # S3 answers 403 instead of 404 for a missing key when the
              # credentials lack s3:ListBucket, so this case is ambiguous.
              # Keep the previous behaviour (continue) but make it visible.
              echo "Warning: HeadObject returned 403 for s3://stdpopsim/$S3_DEST."
              echo "Assuming the object does not exist (credentials may lack s3:ListBucket)."
              ;;
            *)
              echo "ERROR: could not check whether s3://stdpopsim/$S3_DEST exists:"
              echo "$HEAD_OUTPUT"
              exit 1
              ;;
          esac

      - name: Upload to S3
        if: ${{ inputs.dry_run != true }}
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_DEFAULT_REGION: us-west-2
          S3_DEST: ${{ inputs.s3_destination }}
          EXPECTED_SHA256: ${{ inputs.expected_sha256 }}
        run: |
          echo "Uploading to s3://stdpopsim/$S3_DEST"
          aws s3 cp "$DOWNLOADED_FILE" "s3://stdpopsim/$S3_DEST"
          echo ""
          echo "Upload complete."
          echo "S3 URL: https://stdpopsim.s3-us-west-2.amazonaws.com/$S3_DEST"
          echo "SHA256: $EXPECTED_SHA256"

      - name: Dry run summary
        if: ${{ inputs.dry_run == true }}
        env:
          SPECIES_ID: ${{ inputs.species_id }}
          RESOURCE_TYPE: ${{ inputs.resource_type }}
          RESOURCE_ID: ${{ inputs.resource_id }}
          S3_DEST: ${{ inputs.s3_destination }}
          EXPECTED_SHA256: ${{ inputs.expected_sha256 }}
        run: |
          echo "=== DRY RUN COMPLETE ==="
          echo "All validation passed. Skipped S3 upload."
          echo ""
          echo " Species: $SPECIES_ID"
          echo " Type: $RESOURCE_TYPE"
          echo " Resource: $RESOURCE_ID"
          echo " S3 dest: s3://stdpopsim/$S3_DEST"
          echo " SHA256: $EXPECTED_SHA256"
          echo ""
          echo "To perform the actual upload, re-run without --dry-run."

      # Runs whatever the outcome of the earlier steps, including failed
      # validation. Deletion is therefore limited to releases that really are
      # drafts, so a mistyped tag can never remove a published release and its
      # git tag. Failures here only produce a warning.
      - name: Clean up draft release
        if: always()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          RELEASE_TAG: ${{ inputs.release_tag }}
        run: |
          echo "Cleaning up draft release: $RELEASE_TAG"
          IS_DRAFT=$(gh release view "$RELEASE_TAG" \
            --repo "$GITHUB_REPOSITORY" \
            --json isDraft \
            --jq '.isDraft' 2>/dev/null || true)
          if [ "$IS_DRAFT" = "true" ]; then
            # Delete the release
            gh release delete "$RELEASE_TAG" \
              --repo "$GITHUB_REPOSITORY" \
              --yes \
              --cleanup-tag || echo "Warning: could not delete release $RELEASE_TAG"
          else
            echo "Warning: $RELEASE_TAG is not a draft release or could not be inspected; skipping deletion."
          fi
          echo "Cleanup complete."