Commit fc7f7b8

feat(worker): Add backup remote for exported data
1 parent 6dc4585

6 files changed: +103 -19 lines

config.env.example

Lines changed: 5 additions & 0 deletions

@@ -39,6 +39,11 @@ AWS_ACCOUNT_ID=
 AWS_BATCH_QUEUE=
 AWS_BATCH_COMPUTE_ENVIRONMENT=
 
+# GCP API keys for S3-compatible access
+GCP_ACCESS_KEY_ID=
+GCP_SECRET_ACCESS_KEY=
+GCP_S3_BACKUP_BUCKET=
+
 # Dataset unpublished/published buckets - used by DataLad service
 AWS_S3_PRIVATE_BUCKET=
 AWS_S3_PUBLIC_BUCKET=

helm/openneuro/templates/secret.yaml

Lines changed: 3 additions & 0 deletions

@@ -27,6 +27,9 @@ stringData:
   AWS_REGION: {{ required "AWS_REGION is required" .Values.secrets.aws.AWS_REGION | quote }}
   AWS_ACCOUNT_ID: {{ required "AWS_ACCOUNT_ID is required" .Values.secrets.aws.AWS_ACCOUNT_ID | quote }}
   AWS_ACCESS_KEY_ID: {{ required "AWS_ACCESS_KEY_ID is required" .Values.secrets.aws.AWS_ACCESS_KEY_ID | quote }}
+  GCP_ACCESS_KEY_ID: {{ required "GCP_ACCESS_KEY_ID is required" .Values.secrets.aws.GCP_ACCESS_KEY_ID | quote }}
+  GCP_SECRET_ACCESS_KEY: {{ required "GCP_SECRET_ACCESS_KEY is required" .Values.secrets.aws.GCP_SECRET_ACCESS_KEY | quote }}
+  GCP_S3_BACKUP_BUCKET: {{ required "GCP_S3_BACKUP_BUCKET is required" .Values.secrets.aws.GCP_S3_BACKUP_BUCKET | quote }}
   AWS_SECRET_ACCESS_KEY: {{ required "AWS_SECRET_ACCESS_KEY is required" .Values.secrets.aws.AWS_SECRET_ACCESS_KEY | quote }}
   AWS_S3_PRIVATE_BUCKET: {{ required "AWS_REGION is required" .Values.secrets.aws.AWS_S3_PRIVATE_BUCKET | quote }}
   AWS_S3_PUBLIC_BUCKET: {{ required "AWS_REGION is required" .Values.secrets.aws.AWS_S3_PUBLIC_BUCKET | quote }}

services/datalad/datalad_service/common/s3.py

Lines changed: 64 additions & 10 deletions

@@ -12,43 +12,88 @@ def get_s3_remote():
     return 's3-PUBLIC'
 
 
+def get_s3_backup_remote():
+    return 's3-BACKUP'
+
+
 def get_s3_bucket():
     return getattr(datalad_service.config, 'AWS_S3_PUBLIC_BUCKET')
 
 
-def generate_s3_annex_options(dataset_path):
+def get_s3_backup_bucket():
+    return getattr(datalad_service.config, 'GCP_S3_BACKUP_BUCKET')
+
+
+def generate_s3_annex_options(dataset_path, backup=False):
     dataset_id = os.path.basename(dataset_path)
     annex_options = [
         'type=S3',
-        f'bucket={get_s3_bucket()}',
-        'exporttree=yes',
-        'versioning=yes',
         'partsize=1GiB',
         'encryption=none',
         f'fileprefix={dataset_id}/',
-        'autoenable=true',
-        f'publicurl=https://s3.amazonaws.com/{get_s3_bucket()}',
         'public=no',
     ]
+    if backup:
+        annex_options += [
+            f'bucket={get_s3_backup_bucket()}',
+            'cost=400',
+            'host=storage.googleapis.com',
+            'storageclass=ARCHIVE',
+        ]
+    else:
+        annex_options += [
+            'exporttree=yes',
+            'versioning=yes',
+            f'bucket={get_s3_bucket()}',
+            'autoenable=true',
+            f'publicurl=https://s3.amazonaws.com/{get_s3_bucket()}',
+        ]
     return annex_options
 
 
+def backup_remote_env():
+    """Copy and modify the environment for setup/modification of backup remote settings."""
+    backup_remote_env = os.environ.copy()
+    # Overwrite the AWS keys with the GCP key
+    backup_remote_env['AWS_ACCESS_KEY_ID'] = datalad_service.config.GCP_ACCESS_KEY_ID
+    backup_remote_env['AWS_SECRET_ACCESS_KEY'] = (
+        datalad_service.config.GCP_SECRET_ACCESS_KEY
+    )
+    return backup_remote_env
+
+
 def setup_s3_sibling(dataset_path):
     """Add a sibling for an S3 bucket publish."""
-    annex_options = generate_s3_annex_options(dataset_path)
+    # Public remote
+    subprocess.run(
+        ['git-annex', 'initremote', get_s3_remote()]
+        + generate_s3_annex_options(dataset_path),
+        cwd=dataset_path,
+    )
+    # Backup remote
     subprocess.run(
-        ['git-annex', 'initremote', get_s3_remote()] + annex_options, cwd=dataset_path
+        ['git-annex', 'initremote', get_s3_backup_remote()]
+        + generate_s3_annex_options(dataset_path, backup=True),
+        cwd=dataset_path,
+        env=backup_remote_env(),
     )
 
 
 def update_s3_sibling(dataset_path):
     """Update S3 remote with latest config."""
-    annex_options = generate_s3_annex_options(dataset_path)
     # note: enableremote command will only upsert config options, none are deleted
     subprocess.run(
-        ['git-annex', 'enableremote', get_s3_remote()] + annex_options,
+        ['git-annex', 'enableremote', get_s3_remote()]
+        + generate_s3_annex_options(dataset_path),
+        check=True,
+        cwd=dataset_path,
+    )
+    subprocess.run(
+        ['git-annex', 'enableremote', get_s3_backup_remote()]
+        + generate_s3_annex_options(dataset_path, backup=True),
        check=True,
        cwd=dataset_path,
+        env=backup_remote_env(),
    )
 
 

@@ -87,3 +132,12 @@ def s3_export(dataset_path, target, treeish):
     subprocess.check_call(
         ['git-annex', 'export', treeish, '--to', target], cwd=dataset_path
     )
+
+
+def s3_backup_push(dataset_path):
+    """Perform an S3 push to the backup remote on a git-annex repo."""
+    subprocess.check_call(
+        ['git-annex', 'push', get_s3_backup_remote()],
+        cwd=dataset_path,
+        env=backup_remote_env(),
+    )

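With backup=True, generate_s3_annex_options builds a second git-annex S3 special remote that targets Google Cloud Storage's S3-compatible endpoint in the ARCHIVE storage class, with a high cost so git-annex still prefers the public remote for retrieval, and without exporttree/versioning, so content is stored as annex keys rather than a browsable tree. For illustration only, roughly the command setup_s3_sibling() ends up running for the backup remote; the dataset id and bucket name below are placeholders for the real values taken from the dataset path and GCP_S3_BACKUP_BUCKET:

    # Illustrative sketch, not part of the commit: placeholder dataset id and bucket.
    backup_options = [
        'type=S3',
        'partsize=1GiB',
        'encryption=none',
        'fileprefix=ds000001/',          # placeholder dataset id
        'public=no',
        'bucket=example-backup-bucket',  # placeholder for GCP_S3_BACKUP_BUCKET
        'cost=400',                      # deprioritized relative to the public remote
        'host=storage.googleapis.com',   # GCS S3-compatible endpoint
        'storageclass=ARCHIVE',
    ]
    print(' '.join(['git-annex', 'initremote', 's3-BACKUP'] + backup_options))

The command runs with the environment from backup_remote_env(), so git-annex authenticates against GCS using the GCP HMAC keys instead of the AWS credentials.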
services/datalad/datalad_service/config.py

Lines changed: 6 additions & 0 deletions

@@ -19,6 +19,12 @@
 AWS_S3_PRIVATE_BUCKET = os.getenv('AWS_S3_PRIVATE_BUCKET')
 AWS_S3_PUBLIC_BUCKET = os.getenv('AWS_S3_PUBLIC_BUCKET')
 
+# GCP S3 compatible object storage
+GCP_ACCESS_KEY_ID = os.getenv('GCP_ACCESS_KEY_ID')
+GCP_SECRET_ACCESS_KEY = os.getenv('GCP_SECRET_ACCESS_KEY')
+GCP_S3_BACKUP_BUCKET = os.getenv('GCP_S3_BACKUP_BUCKET')
+
+
 # GraphQL URL - override if not docker-compose
 GRAPHQL_ENDPOINT = os.getenv('GRAPHQL_ENDPOINT', 'http://server:8111/crn/graphql')

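These settings are read with plain os.getenv, so unlike the Helm template they are not validated at startup; a missing value would only surface once the backup remote is first initialized. A minimal sketch of an optional guard, not part of this commit, assuming it runs somewhere after config is imported:

    # Illustrative sketch only: fail fast if the backup remote is partially configured.
    import datalad_service.config as config

    REQUIRED_GCP = ('GCP_ACCESS_KEY_ID', 'GCP_SECRET_ACCESS_KEY', 'GCP_S3_BACKUP_BUCKET')
    missing = [name for name in REQUIRED_GCP if not getattr(config, name, None)]
    if missing:
        raise RuntimeError(f'Backup remote misconfigured, missing: {", ".join(missing)}')
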
services/datalad/datalad_service/tasks/publish.py

Lines changed: 24 additions & 8 deletions

@@ -15,14 +15,18 @@
 from datalad_service.config import DATALAD_GITHUB_EXPORTS_ENABLED
 from datalad_service.config import AWS_ACCESS_KEY_ID
 from datalad_service.config import AWS_SECRET_ACCESS_KEY
+from datalad_service.config import GCP_ACCESS_KEY_ID
+from datalad_service.config import GCP_SECRET_ACCESS_KEY
 from datalad_service.common.annex import get_tag_info, is_git_annex_remote
 from datalad_service.common.openneuro import clear_dataset_cache
 from datalad_service.common.git import git_show, git_tag, git_tag_tree
 from datalad_service.common.github import github_export
 from datalad_service.common.s3 import (
     s3_export,
+    s3_backup_push,
     get_s3_remote,
     get_s3_bucket,
+    get_s3_backup_bucket,
     update_s3_sibling,
 )
 from datalad_service.broker import broker

@@ -90,6 +94,7 @@ def export_dataset(
         # Push the most recent tag
         if tags:
             s3_export(dataset_path, get_s3_remote(), tags[-1].name)
+            s3_backup_push(dataset_path)
         # Once all S3 tags are exported, update GitHub
         if github_enabled:
             # Perform all GitHub export steps

@@ -129,16 +134,27 @@ def check_remote_has_version(dataset_path, remote, tag):
 def delete_s3_sibling(dataset_id):
     """Run S3 sibling deletion in another process to avoid blocking any callers"""
     delete_executor.submit(delete_s3_sibling_executor, dataset_id)
+    delete_executor.submit(delete_s3_sibling_executor, dataset_id, True)
 
 
-def delete_s3_sibling_executor(dataset_id):
+def delete_s3_sibling_executor(dataset_id, backup=False):
     """Delete all versions of a dataset from S3."""
     try:
-        client = boto3.client(
-            's3',
-            aws_access_key_id=AWS_ACCESS_KEY_ID,
-            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
-        )
+        if backup:
+            s3_bucket = get_s3_backup_bucket()
+            client = boto3.client(
+                's3',
+                aws_access_key_id=GCP_ACCESS_KEY_ID,
+                aws_secret_access_key=GCP_SECRET_ACCESS_KEY,
+                endpoint_url='https://storage.googleapis.com',
+            )
+        else:
+            s3_bucket = get_s3_bucket()
+            client = boto3.client(
+                's3',
+                aws_access_key_id=AWS_ACCESS_KEY_ID,
+                aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+            )
         paginator = client.get_paginator('list_object_versions')
         object_delete_list = []
         for response in paginator.paginate(

@@ -154,12 +170,12 @@ def delete_s3_sibling_executor(dataset_id):
             )
         for i in range(0, len(object_delete_list), 1000):
             client.delete_objects(
-                Bucket=get_s3_bucket(),
+                Bucket=s3_bucket,
                 Delete={'Objects': object_delete_list[i : i + 1000], 'Quiet': True},
             )
     except Exception as e:
         raise Exception(
-            f'Attempt to delete dataset {dataset_id} from {get_s3_remote()} has failed. ({e})'
+            f'Attempt to delete dataset {dataset_id} from {s3_bucket} has failed. ({e})'
         )

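Since the deletion path now reaches Cloud Storage through boto3's S3 client with a custom endpoint_url, the same client construction works for ad-hoc inspection of what the backup bucket holds. A minimal sketch, not part of this commit; 'ds000001' is a placeholder dataset id:

    # Illustrative sketch only: list backed-up objects for one dataset through the
    # same S3-compatible GCS endpoint used by delete_s3_sibling_executor(backup=True).
    import boto3

    from datalad_service.config import (
        GCP_ACCESS_KEY_ID,
        GCP_SECRET_ACCESS_KEY,
        GCP_S3_BACKUP_BUCKET,
    )

    client = boto3.client(
        's3',
        aws_access_key_id=GCP_ACCESS_KEY_ID,
        aws_secret_access_key=GCP_SECRET_ACCESS_KEY,
        endpoint_url='https://storage.googleapis.com',
    )
    paginator = client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket=GCP_S3_BACKUP_BUCKET, Prefix='ds000001/'):
        for obj in page.get('Contents', []):
            print(obj['Key'], obj['Size'])
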
services/datalad/tests/test_publish.py

Lines changed: 1 addition & 1 deletion

@@ -60,7 +60,7 @@ def test_export_snapshots(no_init_remote, client, new_dataset):
         github_enabled=True,
     )
     # Verify export calls were made
-    assert s3_export_mock.call_count == 1
+    assert s3_export_mock.call_count == 2
     expect_calls = [call(new_dataset.path, 's3-PUBLIC', 'refs/tags/2.0.0')]
     s3_export_mock.assert_has_calls(expect_calls)
     assert github_export_mock.call_count == 1

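Beyond the updated call count, a fixture-free unit test of the new environment override could sit alongside this; a minimal sketch, not part of this commit, using only names introduced in s3.py above:

    # Illustrative sketch only: verify backup_remote_env() swaps in the GCP credentials.
    from unittest.mock import patch

    import datalad_service.config
    from datalad_service.common.s3 import backup_remote_env


    def test_backup_remote_env_overrides_aws_keys():
        with (
            patch.object(datalad_service.config, 'GCP_ACCESS_KEY_ID', 'gcp-key'),
            patch.object(datalad_service.config, 'GCP_SECRET_ACCESS_KEY', 'gcp-secret'),
        ):
            env = backup_remote_env()
        assert env['AWS_ACCESS_KEY_ID'] == 'gcp-key'
        assert env['AWS_SECRET_ACCESS_KEY'] == 'gcp-secret'
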