diff --git a/config.env.example b/config.env.example
index 1b826fced..af59b333b 100644
--- a/config.env.example
+++ b/config.env.example
@@ -39,6 +39,11 @@
 AWS_ACCOUNT_ID=
 AWS_BATCH_QUEUE=
 AWS_BATCH_COMPUTE_ENVIRONMENT=
+# GCP API keys for S3-compatible access
+GCP_ACCESS_KEY_ID=
+GCP_SECRET_ACCESS_KEY=
+GCP_S3_BACKUP_BUCKET=
+
 # Dataset unpublished/published buckets - used by DataLad service
 AWS_S3_PRIVATE_BUCKET=
 AWS_S3_PUBLIC_BUCKET=
diff --git a/helm/openneuro/templates/secret.yaml b/helm/openneuro/templates/secret.yaml
index d8eb7cd4f..7c126dba9 100644
--- a/helm/openneuro/templates/secret.yaml
+++ b/helm/openneuro/templates/secret.yaml
@@ -27,6 +27,9 @@ stringData:
   AWS_REGION: {{ required "AWS_REGION is required" .Values.secrets.aws.AWS_REGION | quote }}
   AWS_ACCOUNT_ID: {{ required "AWS_ACCOUNT_ID is required" .Values.secrets.aws.AWS_ACCOUNT_ID | quote }}
   AWS_ACCESS_KEY_ID: {{ required "AWS_ACCESS_KEY_ID is required" .Values.secrets.aws.AWS_ACCESS_KEY_ID | quote }}
+  GCP_ACCESS_KEY_ID: {{ required "GCP_ACCESS_KEY_ID is required" .Values.secrets.aws.GCP_ACCESS_KEY_ID | quote }}
+  GCP_SECRET_ACCESS_KEY: {{ required "GCP_SECRET_ACCESS_KEY is required" .Values.secrets.aws.GCP_SECRET_ACCESS_KEY | quote }}
+  GCP_S3_BACKUP_BUCKET: {{ required "GCP_S3_BACKUP_BUCKET is required" .Values.secrets.aws.GCP_S3_BACKUP_BUCKET | quote }}
   AWS_SECRET_ACCESS_KEY: {{ required "AWS_SECRET_ACCESS_KEY is required" .Values.secrets.aws.AWS_SECRET_ACCESS_KEY | quote }}
   AWS_S3_PRIVATE_BUCKET: {{ required "AWS_REGION is required" .Values.secrets.aws.AWS_S3_PRIVATE_BUCKET | quote }}
   AWS_S3_PUBLIC_BUCKET: {{ required "AWS_REGION is required" .Values.secrets.aws.AWS_S3_PUBLIC_BUCKET | quote }}
diff --git a/services/datalad/datalad_service/common/s3.py b/services/datalad/datalad_service/common/s3.py
index 112097f0e..7860d8b5c 100644
--- a/services/datalad/datalad_service/common/s3.py
+++ b/services/datalad/datalad_service/common/s3.py
@@ -12,43 +12,88 @@ def get_s3_remote():
     return 's3-PUBLIC'
 
 
+def get_s3_backup_remote():
+    return 's3-BACKUP'
+
+
 def get_s3_bucket():
     return getattr(datalad_service.config, 'AWS_S3_PUBLIC_BUCKET')
 
 
-def generate_s3_annex_options(dataset_path):
+def get_s3_backup_bucket():
+    return getattr(datalad_service.config, 'GCP_S3_BACKUP_BUCKET')
+
+
+def generate_s3_annex_options(dataset_path, backup=False):
     dataset_id = os.path.basename(dataset_path)
     annex_options = [
         'type=S3',
-        f'bucket={get_s3_bucket()}',
-        'exporttree=yes',
-        'versioning=yes',
         'partsize=1GiB',
         'encryption=none',
         f'fileprefix={dataset_id}/',
-        'autoenable=true',
-        f'publicurl=https://s3.amazonaws.com/{get_s3_bucket()}',
         'public=no',
     ]
+    if backup:
+        annex_options += [
+            f'bucket={get_s3_backup_bucket()}',
+            'cost=400',
+            'host=storage.googleapis.com',
+            'storageclass=ARCHIVE',
+        ]
+    else:
+        annex_options += [
+            'exporttree=yes',
+            'versioning=yes',
+            f'bucket={get_s3_bucket()}',
+            'autoenable=true',
+            f'publicurl=https://s3.amazonaws.com/{get_s3_bucket()}',
+        ]
     return annex_options
 
 
+def backup_remote_env():
+    """Copy and modify the environment for setup/modification of backup remote settings."""
+    backup_remote_env = os.environ.copy()
+    # Overwrite the AWS keys with the GCP key
+    backup_remote_env['AWS_ACCESS_KEY_ID'] = backup_remote_env['GCP_ACCESS_KEY_ID']
+    backup_remote_env['AWS_SECRET_ACCESS_KEY'] = backup_remote_env[
+        'GCP_SECRET_ACCESS_KEY'
+    ]
+    return backup_remote_env
+
+
 def setup_s3_sibling(dataset_path):
     """Add a sibling for an S3 bucket publish."""
-    annex_options = generate_s3_annex_options(dataset_path)
+    # Public remote
+    subprocess.run(
+        ['git-annex', 'initremote', get_s3_remote()]
+        + generate_s3_annex_options(dataset_path),
+        cwd=dataset_path,
+    )
+    # Backup remote
     subprocess.run(
-        ['git-annex', 'initremote', get_s3_remote()] + annex_options, cwd=dataset_path
+        ['git-annex', 'initremote', get_s3_backup_remote()]
+        + generate_s3_annex_options(dataset_path, backup=True),
+        cwd=dataset_path,
+        env=backup_remote_env(),
     )
 
 
 def update_s3_sibling(dataset_path):
     """Update S3 remote with latest config."""
-    annex_options = generate_s3_annex_options(dataset_path)
     # note: enableremote command will only upsert config options, none are deleted
     subprocess.run(
-        ['git-annex', 'enableremote', get_s3_remote()] + annex_options,
+        ['git-annex', 'enableremote', get_s3_remote()]
+        + generate_s3_annex_options(dataset_path),
+        check=True,
+        cwd=dataset_path,
+    )
+    subprocess.run(
+        ['git-annex', 'enableremote', get_s3_backup_remote()]
+        + generate_s3_annex_options(dataset_path, backup=True),
         check=True,
         cwd=dataset_path,
+        env=backup_remote_env(),
     )
 
 
@@ -87,3 +132,13 @@ def s3_export(dataset_path, target, treeish):
     subprocess.check_call(
         ['git-annex', 'export', treeish, '--to', target], cwd=dataset_path
     )
+
+
+def s3_backup_push(dataset_path):
+    """Perform an S3 push to the backup remote on a git-annex repo."""
+    print(backup_remote_env())
+    subprocess.check_call(
+        ['git-annex', 'push', get_s3_backup_remote()],
+        cwd=dataset_path,
+        env=backup_remote_env(),
+    )
diff --git a/services/datalad/datalad_service/config.py b/services/datalad/datalad_service/config.py
index 9cbf0b957..94e39ef48 100644
--- a/services/datalad/datalad_service/config.py
+++ b/services/datalad/datalad_service/config.py
@@ -19,6 +19,12 @@
 AWS_S3_PRIVATE_BUCKET = os.getenv('AWS_S3_PRIVATE_BUCKET')
 AWS_S3_PUBLIC_BUCKET = os.getenv('AWS_S3_PUBLIC_BUCKET')
 
+# GCP S3 compatible object storage
+GCP_ACCESS_KEY_ID = os.getenv('GCP_ACCESS_KEY_ID')
+GCP_SECRET_ACCESS_KEY = os.getenv('GCP_SECRET_ACCESS_KEY')
+GCP_S3_BACKUP_BUCKET = os.getenv('GCP_S3_BACKUP_BUCKET')
+
+
 # GraphQL URL - override if not docker-compose
 GRAPHQL_ENDPOINT = os.getenv('GRAPHQL_ENDPOINT', 'http://server:8111/crn/graphql')
 
diff --git a/services/datalad/datalad_service/tasks/publish.py b/services/datalad/datalad_service/tasks/publish.py
index 68032456d..1f3e95e14 100644
--- a/services/datalad/datalad_service/tasks/publish.py
+++ b/services/datalad/datalad_service/tasks/publish.py
@@ -15,14 +15,18 @@
 from datalad_service.config import DATALAD_GITHUB_EXPORTS_ENABLED
 from datalad_service.config import AWS_ACCESS_KEY_ID
 from datalad_service.config import AWS_SECRET_ACCESS_KEY
+from datalad_service.config import GCP_ACCESS_KEY_ID
+from datalad_service.config import GCP_SECRET_ACCESS_KEY
 from datalad_service.common.annex import get_tag_info, is_git_annex_remote
 from datalad_service.common.openneuro import clear_dataset_cache
 from datalad_service.common.git import git_show, git_tag, git_tag_tree
 from datalad_service.common.github import github_export
 from datalad_service.common.s3 import (
     s3_export,
+    s3_backup_push,
     get_s3_remote,
     get_s3_bucket,
+    get_s3_backup_bucket,
     update_s3_sibling,
 )
 from datalad_service.broker import broker
@@ -90,6 +94,7 @@ def export_dataset(
     # Push the most recent tag
     if tags:
         s3_export(dataset_path, get_s3_remote(), tags[-1].name)
+        s3_backup_push(dataset_path)
     # Once all S3 tags are exported, update GitHub
     if github_enabled:
         # Perform all GitHub export steps
@@ -129,16 +134,27 @@ def check_remote_has_version(dataset_path, remote, tag):
 def delete_s3_sibling(dataset_id):
     """Run S3 sibling deletion in another process to avoid blocking any callers"""
     delete_executor.submit(delete_s3_sibling_executor, dataset_id)
+    delete_executor.submit(delete_s3_sibling_executor, dataset_id, True)
 
 
-def delete_s3_sibling_executor(dataset_id):
+def delete_s3_sibling_executor(dataset_id, backup=False):
     """Delete all versions of a dataset from S3."""
     try:
-        client = boto3.client(
-            's3',
-            aws_access_key_id=AWS_ACCESS_KEY_ID,
-            aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
-        )
+        if backup:
+            s3_bucket = get_s3_backup_bucket()
+            client = boto3.client(
+                's3',
+                aws_access_key_id=GCP_ACCESS_KEY_ID,
+                aws_secret_access_key=GCP_SECRET_ACCESS_KEY,
+                endpoint_url='https://storage.googleapis.com',
+            )
+        else:
+            s3_bucket = get_s3_bucket()
+            client = boto3.client(
+                's3',
+                aws_access_key_id=AWS_ACCESS_KEY_ID,
+                aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+            )
         paginator = client.get_paginator('list_object_versions')
         object_delete_list = []
         for response in paginator.paginate(
@@ -154,12 +170,12 @@ def delete_s3_sibling_executor(dataset_id):
             )
         for i in range(0, len(object_delete_list), 1000):
             client.delete_objects(
-                Bucket=get_s3_bucket(),
+                Bucket=s3_bucket,
                 Delete={'Objects': object_delete_list[i : i + 1000], 'Quiet': True},
             )
     except Exception as e:
         raise Exception(
-            f'Attempt to delete dataset {dataset_id} from {get_s3_remote()} has failed. ({e})'
+            f'Attempt to delete dataset {dataset_id} from {s3_bucket} has failed. ({e})'
         )
 
 
diff --git a/services/datalad/tests/conftest.py b/services/datalad/tests/conftest.py
index 88a4dfb09..1e8685e8e 100644
--- a/services/datalad/tests/conftest.py
+++ b/services/datalad/tests/conftest.py
@@ -136,6 +136,21 @@ def mock_s3_remote_setup(dataset_path):
             check=True,
             cwd=dataset_path,
         )
+        path = tmpdir_factory.mktemp('fake_s3_backup_remote')
+        subprocess.run(
+            [
+                'git',
+                'annex',
+                'initremote',
+                's3-BACKUP',
+                'type=directory',
+                f'directory={path}',
+                'encryption=none',
+                'exporttree=no',
+            ],
+            check=True,
+            cwd=dataset_path,
+        )
 
     def mock_github_remote_setup(dataset_path, dataset_id):
         path = tmpdir_factory.mktemp('fake_github_remote')
@@ -184,6 +199,14 @@ def s3_creds(monkeypatch):
     monkeypatch.setenv('AWS_S3_PRIVATE_BUCKET', 'a-fake-test-private-bucket')
 
 
+@pytest.fixture(autouse=True)
+def access_keys(monkeypatch):
+    monkeypatch.setenv('AWS_ACCESS_KEY_ID', 'aws-id')
+    monkeypatch.setenv('AWS_SECRET_ACCESS_KEY', 'aws-secret')
+    monkeypatch.setenv('GCP_ACCESS_KEY_ID', 'gcp-id')
+    monkeypatch.setenv('GCP_SECRET_ACCESS_KEY', 'gcp-secret')
+
+
 @pytest.fixture(autouse=True)
 def mock_jwt_secret(monkeypatch):
     monkeypatch.setenv('JWT_SECRET', 'test-secret-please-ignore')
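Editor's sketch (not part of the patch): the backup path works by pointing git-annex's S3 special remote at Google Cloud Storage's S3-compatible endpoint and swapping the GCP HMAC keys into the AWS_* variables that git-annex reads. The minimal standalone Python illustration below assumes a hypothetical bucket name, dataset ID, and dataset path; the real helpers are backup_remote_env(), generate_s3_annex_options(..., backup=True), and s3_backup_push() in services/datalad/datalad_service/common/s3.py above.

import os
import subprocess


def gcs_annex_env():
    # Mirrors backup_remote_env(): git-annex's S3 special remote only reads
    # AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY, so the GCP HMAC keys are copied
    # into those variables for calls that target the backup remote.
    env = os.environ.copy()
    env['AWS_ACCESS_KEY_ID'] = env['GCP_ACCESS_KEY_ID']
    env['AWS_SECRET_ACCESS_KEY'] = env['GCP_SECRET_ACCESS_KEY']
    return env


# Option set the patch builds for the backup remote: an S3-compatible GCS bucket in
# ARCHIVE storage, a higher cost than s3-PUBLIC, and no export/versioning/autoenable.
backup_options = [
    'type=S3',
    'bucket=openneuro-gcp-backup',  # hypothetical GCP_S3_BACKUP_BUCKET value
    'host=storage.googleapis.com',
    'storageclass=ARCHIVE',
    'cost=400',
    'partsize=1GiB',
    'encryption=none',
    'fileprefix=ds000001/',  # hypothetical dataset ID
    'public=no',
]

dataset_path = '/datasets/ds000001'  # hypothetical checkout

# Register the backup remote once per dataset (setup_s3_sibling does this alongside s3-PUBLIC).
subprocess.run(
    ['git-annex', 'initremote', 's3-BACKUP'] + backup_options,
    cwd=dataset_path,
    env=gcs_annex_env(),
    check=True,
)

# After each tag is exported to s3-PUBLIC, annexed objects are pushed (not exported)
# to the backup remote, matching s3_backup_push().
subprocess.run(
    ['git-annex', 'push', 's3-BACKUP'],
    cwd=dataset_path,
    env=gcs_annex_env(),
    check=True,
)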