5 changes: 5 additions & 0 deletions config.env.example
@@ -39,6 +39,11 @@ AWS_ACCOUNT_ID=
AWS_BATCH_QUEUE=
AWS_BATCH_COMPUTE_ENVIRONMENT=

# GCP API keys for S3-compatible access
GCP_ACCESS_KEY_ID=
GCP_SECRET_ACCESS_KEY=
GCP_S3_BACKUP_BUCKET=

# Dataset unpublished/published buckets - used by DataLad service
AWS_S3_PRIVATE_BUCKET=
AWS_S3_PUBLIC_BUCKET=
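The new GCP_* values are HMAC credentials for Google Cloud Storage's S3-compatible XML API. A minimal sketch (not part of this PR) of verifying that a set of keys can reach the backup bucket, assuming the variables above are exported into the environment:

    import os
    import boto3

    # GCS accepts S3-style HMAC credentials when boto3 is pointed at its XML API endpoint.
    client = boto3.client(
        's3',
        aws_access_key_id=os.environ['GCP_ACCESS_KEY_ID'],
        aws_secret_access_key=os.environ['GCP_SECRET_ACCESS_KEY'],
        endpoint_url='https://storage.googleapis.com',
    )
    # Raises botocore.exceptions.ClientError if the keys or bucket name are wrong.
    client.head_bucket(Bucket=os.environ['GCP_S3_BACKUP_BUCKET'])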
3 changes: 3 additions & 0 deletions helm/openneuro/templates/secret.yaml
@@ -27,6 +27,9 @@ stringData:
AWS_REGION: {{ required "AWS_REGION is required" .Values.secrets.aws.AWS_REGION | quote }}
AWS_ACCOUNT_ID: {{ required "AWS_ACCOUNT_ID is required" .Values.secrets.aws.AWS_ACCOUNT_ID | quote }}
AWS_ACCESS_KEY_ID: {{ required "AWS_ACCESS_KEY_ID is required" .Values.secrets.aws.AWS_ACCESS_KEY_ID | quote }}
GCP_ACCESS_KEY_ID: {{ required "GCP_ACCESS_KEY_ID is required" .Values.secrets.aws.GCP_ACCESS_KEY_ID | quote }}
GCP_SECRET_ACCESS_KEY: {{ required "GCP_SECRET_ACCESS_KEY is required" .Values.secrets.aws.GCP_SECRET_ACCESS_KEY | quote }}
GCP_S3_BACKUP_BUCKET: {{ required "GCP_S3_BACKUP_BUCKET is required" .Values.secrets.aws.GCP_S3_BACKUP_BUCKET | quote }}
AWS_SECRET_ACCESS_KEY: {{ required "AWS_SECRET_ACCESS_KEY is required" .Values.secrets.aws.AWS_SECRET_ACCESS_KEY | quote }}
AWS_S3_PRIVATE_BUCKET: {{ required "AWS_REGION is required" .Values.secrets.aws.AWS_S3_PRIVATE_BUCKET | quote }}
AWS_S3_PUBLIC_BUCKET: {{ required "AWS_REGION is required" .Values.secrets.aws.AWS_S3_PUBLIC_BUCKET | quote }}
75 changes: 65 additions & 10 deletions services/datalad/datalad_service/common/s3.py
@@ -12,43 +12,88 @@ def get_s3_remote():
return 's3-PUBLIC'


def get_s3_backup_remote():
return 's3-BACKUP'


def get_s3_bucket():
return getattr(datalad_service.config, 'AWS_S3_PUBLIC_BUCKET')


def generate_s3_annex_options(dataset_path):
def get_s3_backup_bucket():
return getattr(datalad_service.config, 'GCP_S3_BACKUP_BUCKET')


def generate_s3_annex_options(dataset_path, backup=False):
dataset_id = os.path.basename(dataset_path)
annex_options = [
'type=S3',
f'bucket={get_s3_bucket()}',
'exporttree=yes',
'versioning=yes',
'partsize=1GiB',
'encryption=none',
f'fileprefix={dataset_id}/',
'autoenable=true',
f'publicurl=https://s3.amazonaws.com/{get_s3_bucket()}',
'public=no',
]
if backup:
annex_options += [
f'bucket={get_s3_backup_bucket()}',
'cost=400',
'host=storage.googleapis.com',
'storageclass=ARCHIVE',
]
else:
annex_options += [
'exporttree=yes',
'versioning=yes',
f'bucket={get_s3_bucket()}',
'autoenable=true',
f'publicurl=https://s3.amazonaws.com/{get_s3_bucket()}',
]
return annex_options


def backup_remote_env():
"""Copy and modify the environment for setup/modification of backup remote settings."""
backup_remote_env = os.environ.copy()
# Overwrite the AWS keys with the GCP key
backup_remote_env['AWS_ACCESS_KEY_ID'] = backup_remote_env['GCP_ACCESS_KEY_ID']
backup_remote_env['AWS_SECRET_ACCESS_KEY'] = backup_remote_env[
'GCP_SECRET_ACCESS_KEY'
]
return backup_remote_env


def setup_s3_sibling(dataset_path):
"""Add a sibling for an S3 bucket publish."""
annex_options = generate_s3_annex_options(dataset_path)
# Public remote
subprocess.run(
['git-annex', 'initremote', get_s3_remote()]
+ generate_s3_annex_options(dataset_path),
cwd=dataset_path,
)
# Backup remote
subprocess.run(
['git-annex', 'initremote', get_s3_remote()] + annex_options, cwd=dataset_path
['git-annex', 'initremote', get_s3_backup_remote()]
+ generate_s3_annex_options(dataset_path, backup=True),
cwd=dataset_path,
env=backup_remote_env(),
)


def update_s3_sibling(dataset_path):
"""Update S3 remote with latest config."""
annex_options = generate_s3_annex_options(dataset_path)
# note: enableremote command will only upsert config options, none are deleted
subprocess.run(
['git-annex', 'enableremote', get_s3_remote()] + annex_options,
['git-annex', 'enableremote', get_s3_remote()]
+ generate_s3_annex_options(dataset_path),
check=True,
cwd=dataset_path,
)
subprocess.run(
['git-annex', 'enableremote', get_s3_backup_remote()]
+ generate_s3_annex_options(dataset_path, backup=True),
check=True,
cwd=dataset_path,
env=backup_remote_env(),
)
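To see what the backup branch produces, a sketch that assumes this PR's module layout and a hypothetical dataset path can build the initremote command without running it:

    from datalad_service.common.s3 import (
        generate_s3_annex_options,
        get_s3_backup_remote,
    )

    dataset_path = '/datasets/ds000001'  # hypothetical path
    cmd = ['git-annex', 'initremote', get_s3_backup_remote()] + generate_s3_annex_options(
        dataset_path, backup=True
    )
    print(' '.join(cmd))
    # Expected to resemble (bucket name comes from GCP_S3_BACKUP_BUCKET):
    #   git-annex initremote s3-BACKUP type=S3 partsize=1GiB encryption=none
    #   fileprefix=ds000001/ public=no bucket=<backup-bucket> cost=400
    #   host=storage.googleapis.com storageclass=ARCHIVE

Before git-annex actually runs, backup_remote_env() swaps the AWS_* credentials in the child environment for the GCP HMAC keys.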


@@ -87,3 +132,13 @@ def s3_export(dataset_path, target, treeish):
subprocess.check_call(
['git-annex', 'export', treeish, '--to', target], cwd=dataset_path
)


def s3_backup_push(dataset_path):
"""Perform an S3 push to the backup remote on a git-annex repo."""
print(backup_remote_env())
subprocess.check_call(
['git-annex', 'push', get_s3_backup_remote()],
cwd=dataset_path,
env=backup_remote_env(),
)
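One way to sanity-check a backup push, sketched here as a hypothetical helper rather than anything shipped in this PR, is to ask git-annex which remotes hold a given annexed file after s3_backup_push() has run:

    import subprocess

    def on_backup_remote(dataset_path, annexed_file, remote='s3-BACKUP'):
        """Return True if `git annex whereis` lists the backup remote for the file."""
        result = subprocess.run(
            ['git', 'annex', 'whereis', annexed_file],
            cwd=dataset_path,
            capture_output=True,
            text=True,
            check=True,
        )
        return remote in result.stdout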
6 changes: 6 additions & 0 deletions services/datalad/datalad_service/config.py
@@ -19,6 +19,12 @@
AWS_S3_PRIVATE_BUCKET = os.getenv('AWS_S3_PRIVATE_BUCKET')
AWS_S3_PUBLIC_BUCKET = os.getenv('AWS_S3_PUBLIC_BUCKET')

# GCP S3 compatible object storage
GCP_ACCESS_KEY_ID = os.getenv('GCP_ACCESS_KEY_ID')
GCP_SECRET_ACCESS_KEY = os.getenv('GCP_SECRET_ACCESS_KEY')
GCP_S3_BACKUP_BUCKET = os.getenv('GCP_S3_BACKUP_BUCKET')


# GraphQL URL - override if not docker-compose
GRAPHQL_ENDPOINT = os.getenv('GRAPHQL_ENDPOINT', 'http://server:8111/crn/graphql')

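Because the backup settings are optional environment variables, a deployment that omits them leaves these module attributes set to None. A hedged sketch of a guard (hypothetical helper, not part of this PR) that callers could use before touching the backup remote:

    import datalad_service.config as config

    def backup_configured():
        """True when all GCP backup settings are present."""
        return all(
            getattr(config, name, None)
            for name in ('GCP_ACCESS_KEY_ID', 'GCP_SECRET_ACCESS_KEY', 'GCP_S3_BACKUP_BUCKET')
        )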
32 changes: 24 additions & 8 deletions services/datalad/datalad_service/tasks/publish.py
@@ -15,14 +15,18 @@
from datalad_service.config import DATALAD_GITHUB_EXPORTS_ENABLED
from datalad_service.config import AWS_ACCESS_KEY_ID
from datalad_service.config import AWS_SECRET_ACCESS_KEY
from datalad_service.config import GCP_ACCESS_KEY_ID
from datalad_service.config import GCP_SECRET_ACCESS_KEY
from datalad_service.common.annex import get_tag_info, is_git_annex_remote
from datalad_service.common.openneuro import clear_dataset_cache
from datalad_service.common.git import git_show, git_tag, git_tag_tree
from datalad_service.common.github import github_export
from datalad_service.common.s3 import (
s3_export,
s3_backup_push,
get_s3_remote,
get_s3_bucket,
get_s3_backup_bucket,
update_s3_sibling,
)
from datalad_service.broker import broker
@@ -90,6 +94,7 @@ def export_dataset(
# Push the most recent tag
if tags:
s3_export(dataset_path, get_s3_remote(), tags[-1].name)
s3_backup_push(dataset_path)
# Once all S3 tags are exported, update GitHub
if github_enabled:
# Perform all GitHub export steps
@@ -129,16 +134,27 @@ def check_remote_has_version(dataset_path, remote, tag):
def delete_s3_sibling(dataset_id):
"""Run S3 sibling deletion in another process to avoid blocking any callers"""
delete_executor.submit(delete_s3_sibling_executor, dataset_id)
delete_executor.submit(delete_s3_sibling_executor, dataset_id, True)


def delete_s3_sibling_executor(dataset_id):
def delete_s3_sibling_executor(dataset_id, backup=False):
"""Delete all versions of a dataset from S3."""
try:
client = boto3.client(
's3',
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
if backup:
s3_bucket = get_s3_backup_bucket()
client = boto3.client(
's3',
aws_access_key_id=GCP_ACCESS_KEY_ID,
aws_secret_access_key=GCP_SECRET_ACCESS_KEY,
endpoint_url='https://storage.googleapis.com',
)
else:
s3_bucket = get_s3_bucket()
client = boto3.client(
's3',
aws_access_key_id=AWS_ACCESS_KEY_ID,
aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)
paginator = client.get_paginator('list_object_versions')
object_delete_list = []
for response in paginator.paginate(
@@ -154,12 +170,12 @@ def delete_s3_sibling_executor(dataset_id):
)
for i in range(0, len(object_delete_list), 1000):
client.delete_objects(
Bucket=get_s3_bucket(),
Bucket=s3_bucket,
Delete={'Objects': object_delete_list[i : i + 1000], 'Quiet': True},
)
except Exception as e:
raise Exception(
f'Attempt to delete dataset {dataset_id} from {get_s3_remote()} has failed. ({e})'
f'Attempt to delete dataset {dataset_id} from {s3_bucket} has failed. ({e})'
)


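For reference, the two executor submissions in delete_s3_sibling() amount to calling the worker once per bucket; a hypothetical direct invocation (outside the executor, dataset ID is a placeholder) would be:

    from datalad_service.tasks.publish import delete_s3_sibling_executor

    delete_s3_sibling_executor('ds000001')               # public AWS bucket
    delete_s3_sibling_executor('ds000001', backup=True)  # GCS backup bucket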
23 changes: 23 additions & 0 deletions services/datalad/tests/conftest.py
@@ -136,6 +136,21 @@ def mock_s3_remote_setup(dataset_path):
check=True,
cwd=dataset_path,
)
path = tmpdir_factory.mktemp('fake_s3_backup_remote')
subprocess.run(
[
'git',
'annex',
'initremote',
's3-BACKUP',
'type=directory',
f'directory={path}',
'encryption=none',
'exporttree=no',
],
check=True,
cwd=dataset_path,
)

def mock_github_remote_setup(dataset_path, dataset_id):
path = tmpdir_factory.mktemp('fake_github_remote')
@@ -184,6 +199,14 @@ def s3_creds(monkeypatch):
monkeypatch.setenv('AWS_S3_PRIVATE_BUCKET', 'a-fake-test-private-bucket')


@pytest.fixture(autouse=True)
def access_keys(monkeypatch):
monkeypatch.setenv('AWS_ACCESS_KEY_ID', 'aws-id')
monkeypatch.setenv('AWS_SECRET_ACCESS_KEY', 'aws-secret')
monkeypatch.setenv('GCP_ACCESS_KEY_ID', 'gcp-id')
monkeypatch.setenv('GCP_SECRET_ACCESS_KEY', 'gcp-secret')


@pytest.fixture(autouse=True)
def mock_jwt_secret(monkeypatch):
monkeypatch.setenv('JWT_SECRET', 'test-secret-please-ignore')
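With the env fixtures above in place, a possible unit test (not included in this PR) for the backup option list could monkeypatch the bucket setting and assert on the generated options:

    from datalad_service.common.s3 import generate_s3_annex_options

    def test_backup_annex_options(monkeypatch):
        # Hypothetical test: the bucket name is faked via config, matching the fixture style above.
        monkeypatch.setattr(
            'datalad_service.config.GCP_S3_BACKUP_BUCKET', 'a-fake-backup-bucket'
        )
        options = generate_s3_annex_options('/tmp/ds000001', backup=True)
        assert 'host=storage.googleapis.com' in options
        assert 'storageclass=ARCHIVE' in options
        assert 'bucket=a-fake-backup-bucket' in options
        assert 'exporttree=yes' not in options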