diff --git a/services/datalad/datalad_service/common/annex.py b/services/datalad/datalad_service/common/annex.py
index a07b7330d..9c0f765b4 100644
--- a/services/datalad/datalad_service/common/annex.py
+++ b/services/datalad/datalad_service/common/annex.py
@@ -13,6 +13,7 @@
 
 import datalad_service.config
 from datalad_service.common.git import git_show
+from datalad_service.common.s3_client import presign_remote_url, get_s3_remote
 
 SERVICE_EMAIL = 'git@openneuro.org'
 SERVICE_USER = 'Git Worker'
@@ -21,6 +22,7 @@
     'openneuro-dev-datalad-public',
     'openneuro-derivatives',
     'bobsrepository',
+    'openneuro-datalad-public-nell-test',
 ]
 
 
@@ -147,8 +149,16 @@ def parse_rmet_line(remote, rmetLine):
         remoteContext, remoteData = rmetLine.split('V +')
         slash = '' if remote['url'][-1] == '/' else '/'
         s3version, path = remoteData.split('#')
-        return '{}{}{}?versionId={}'.format(remote['url'], slash, path, s3version)
+        if remote['name'] == get_s3_remote():
+            # Presigned via OpenNeuro's credentials
+            url = presign_remote_url(path, s3version)
+            return url
+        else:
+            # Anonymous access for any other buckets
+            return encode_remote_url(
+                '{}{}{}?versionId={}'.format(remote['url'], slash, path, s3version)
+            )
     except:
         return None
 
 
@@ -260,8 +271,7 @@ def get_repo_urls(path, files):
     for path in rmetPaths:
         url = read_rmet_file(remote, catFile)
         if url:
-            encoded_url = encode_remote_url(url)
-            rmetFiles[path]['urls'].append(encoded_url)
+            rmetFiles[path]['urls'].append(url)
     return files
 
 
@@ -378,9 +388,9 @@ def test_key_remote(dataset_path, key, remote_name='s3-PUBLIC'):
         remote_log = git_show(repo, 'git-annex', 'remote.log')
     except KeyError:
         return None
-    for line in remote_log:
+    for line in remote_log.splitlines():
         remote = parse_remote_line(line)
-        if remote['name'] == remote_name:
+        if remote and remote['name'] == remote_name:
             rmet_path = compute_rmet(key)
             try:
                 rmet = git_show(repo, 'git-annex', rmet_path)
diff --git a/services/datalad/datalad_service/common/s3.py b/services/datalad/datalad_service/common/s3.py
index 1d9ee7d48..390b4f997 100644
--- a/services/datalad/datalad_service/common/s3.py
+++ b/services/datalad/datalad_service/common/s3.py
@@ -7,28 +7,18 @@
 import datalad_service.config
 from datalad_service.common.annex import annex_initremote, is_git_annex_remote
 from datalad_service.common.asyncio import run_check
+from datalad_service.common.s3_client import (
+    get_s3_bucket,
+    get_s3_remote,
+    get_s3_backup_bucket,
+    get_s3_backup_remote,
+)
 
 
 class S3ConfigException(Exception):
     pass
 
 
-def get_s3_remote():
-    return 's3-PUBLIC'
-
-
-def get_s3_backup_remote():
-    return 's3-BACKUP'
-
-
-def get_s3_bucket():
-    return getattr(datalad_service.config, 'AWS_S3_PUBLIC_BUCKET')
-
-
-def get_s3_backup_bucket():
-    return getattr(datalad_service.config, 'GCP_S3_BACKUP_BUCKET')
-
-
 def generate_s3_annex_options(dataset_path, backup=False):
     dataset_id = os.path.basename(dataset_path)
     annex_options = [
diff --git a/services/datalad/datalad_service/common/s3_client.py b/services/datalad/datalad_service/common/s3_client.py
new file mode 100644
index 000000000..48cd82de7
--- /dev/null
+++ b/services/datalad/datalad_service/common/s3_client.py
@@ -0,0 +1,52 @@
+import boto3
+from botocore.client import Config
+
+import datalad_service.config
+
+boto3_session = None
+boto3_s3_client = None
+
+
+def get_s3_client():
+    """Setup a reusable boto3 session and S3 client."""
+    global boto3_session
+    if not boto3_session:
+        aws_access_key_id = getattr(datalad_service.config, 'AWS_ACCESS_KEY_ID')
+        aws_secret_access_key = getattr(datalad_service.config, 'AWS_SECRET_ACCESS_KEY')
+        boto3_session = boto3.session.Session(aws_access_key_id, aws_secret_access_key)
+    global boto3_s3_client
+    if not boto3_s3_client:
+        boto3_config = Config(s3={'addressing_style': 'path'})
+        boto3_s3_client = boto3_session.client('s3', config=boto3_config)
+    return boto3_s3_client
+
+
+def presign_remote_url(key, version, expiration=604800):
+    """Presign URLs for the public bucket on S3."""
+    bucket = get_s3_bucket()
+    s3_client = get_s3_client()
+    return s3_client.generate_presigned_url(
+        ClientMethod='get_object',
+        Params={
+            'Bucket': bucket,
+            'Key': key,
+            'VersionId': version,
+        },
+        ExpiresIn=expiration,
+    )
+
+
+def get_s3_remote():
+    return 's3-PUBLIC'
+
+
+def get_s3_backup_remote():
+    return 's3-BACKUP'
+
+
+def get_s3_bucket():
+    return getattr(datalad_service.config, 'AWS_S3_PUBLIC_BUCKET')
+
+
+def get_s3_backup_bucket():
+    return getattr(datalad_service.config, 'GCP_S3_BACKUP_BUCKET')
diff --git a/services/datalad/tests/conftest.py b/services/datalad/tests/conftest.py
index f8ea80d7d..65e2267e7 100644
--- a/services/datalad/tests/conftest.py
+++ b/services/datalad/tests/conftest.py
@@ -200,10 +200,12 @@ def no_publish(monkeypatch):
     )
 
 
-@pytest.fixture
+@pytest.fixture(autouse=True)
 def s3_creds(monkeypatch):
     monkeypatch.setenv('AWS_S3_PUBLIC_BUCKET', 'a-fake-test-public-bucket')
-    monkeypatch.setenv('AWS_S3_PRIVATE_BUCKET', 'a-fake-test-private-bucket')
+    monkeypatch.setattr(
+        datalad_service.config, 'AWS_S3_PUBLIC_BUCKET', 'a-fake-test-public-bucket'
+    )
 
 
 @pytest.fixture(autouse=True)
@@ -212,6 +214,10 @@ def access_keys(monkeypatch):
     monkeypatch.setenv('AWS_SECRET_ACCESS_KEY', 'aws-secret')
     monkeypatch.setenv('GCP_ACCESS_KEY_ID', 'gcp-id')
     monkeypatch.setenv('GCP_SECRET_ACCESS_KEY', 'gcp-secret')
+    monkeypatch.setattr(datalad_service.config, 'AWS_ACCESS_KEY_ID', 'aws-id')
+    monkeypatch.setattr(datalad_service.config, 'AWS_SECRET_ACCESS_KEY', 'aws-secret')
+    monkeypatch.setattr(datalad_service.config, 'GCP_ACCESS_KEY_ID', 'gcp-id')
+    monkeypatch.setattr(datalad_service.config, 'GCP_SECRET_ACCESS_KEY', 'gcp-secret')
 
 
 @pytest.fixture(autouse=True)
diff --git a/services/datalad/tests/test_annex.py b/services/datalad/tests/test_annex.py
index d91647b69..8226963a8 100644
--- a/services/datalad/tests/test_annex.py
+++ b/services/datalad/tests/test_annex.py
@@ -156,6 +156,7 @@ def test_parse_remote_line():
 
 def test_parse_rmet_line():
     remote = {
+        'name': 's3-PUBLIC',
         'url': 'http://openneuro.org.s3.amazonaws.com/',
         'uuid': '57894849-d0c8-4c62-8418-3627be18a196',
     }
@@ -162,15 +163,17 @@ def test_parse_rmet_line():
     url = parse_rmet_line(
         remote,
         """1590213748.042921433s 57894849-d0c8-4c62-8418-3627be18a196:V +iVcEk18e3J2WQys4zr_ANaTPfpUufW4Y#ds002778/dataset_description.json""",
     )
     assert (
-        url
-        == 'http://openneuro.org.s3.amazonaws.com/ds002778/dataset_description.json?versionId=iVcEk18e3J2WQys4zr_ANaTPfpUufW4Y'
+        'https://s3.amazonaws.com/a-fake-test-public-bucket/ds002778/dataset_description.json?versionId=iVcEk18e3J2WQys4zr_ANaTPfpUufW4Y&AWSAccessKeyId=aws-id'
+        in url
     )
+    assert 'Signature=' in url
 
 
 def test_parse_rmet_line_https():
     remote = {
+        'name': 's3-PUBLIC',
         'url': 'https://s3.amazonaws.com/openneuro.org',
         'uuid': '57894849-d0c8-4c62-8418-3627be18a196',
     }
@@ -177,15 +180,17 @@ def test_parse_rmet_line_https():
     url = parse_rmet_line(
         remote,
         """1590213748.042921433s 57894849-d0c8-4c62-8418-3627be18a196:V +iVcEk18e3J2WQys4zr_ANaTPfpUufW4Y#ds002778/dataset_description.json""",
     )
     assert (
-        url
-        == 'https://s3.amazonaws.com/openneuro.org/ds002778/dataset_description.json?versionId=iVcEk18e3J2WQys4zr_ANaTPfpUufW4Y'
+        'https://s3.amazonaws.com/a-fake-test-public-bucket/ds002778/dataset_description.json?versionId=iVcEk18e3J2WQys4zr_ANaTPfpUufW4Y&AWSAccessKeyId=aws-id'
+        in url
     )
+    assert 'Signature=' in url
 
 
 def test_read_rmet_file():
     remote = {
+        'name': 's3-PUBLIC',
         'url': 'http://openneuro.org.s3.amazonaws.com/',
         'uuid': '57894849-d0c8-4c62-8418-3627be18a196',
     }
@@ -193,9 +198,10 @@ def test_read_rmet_file():
 1590213748.042921433s 57894849-d0c8-4c62-8418-3627be18a196:V +iVcEk18e3J2WQys4zr_ANaTPfpUufW4Y#ds002778/dataset_description.json""")
     url = read_rmet_file(remote, catFile)
     assert (
-        url
-        == 'http://openneuro.org.s3.amazonaws.com/ds002778/dataset_description.json?versionId=iVcEk18e3J2WQys4zr_ANaTPfpUufW4Y'
+        'https://s3.amazonaws.com/a-fake-test-public-bucket/ds002778/dataset_description.json?versionId=iVcEk18e3J2WQys4zr_ANaTPfpUufW4Y&AWSAccessKeyId=aws-id'
+        in url
     )
+    assert 'Signature=' in url
 
 
 def test_remote_url_encoding():
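Reviewer note: a minimal sketch of the new `parse_rmet_line` dispatch, not part of the change set. It reuses the remote dict and rmet line from the test fixtures above and assumes `datalad_service.config` carries AWS credentials and `AWS_S3_PUBLIC_BUCKET` (in tests the autouse fixtures arrange this); the presigned query string shown is illustrative, with actual values coming from boto3.

```python
from datalad_service.common.annex import parse_rmet_line

# 'name' is now required in the remote dict: parse_rmet_line presigns
# only when the remote matches get_s3_remote() ('s3-PUBLIC').
remote = {
    'name': 's3-PUBLIC',
    'url': 'http://openneuro.org.s3.amazonaws.com/',
    'uuid': '57894849-d0c8-4c62-8418-3627be18a196',
}
rmet_line = (
    '1590213748.042921433s 57894849-d0c8-4c62-8418-3627be18a196:V '
    '+iVcEk18e3J2WQys4zr_ANaTPfpUufW4Y#ds002778/dataset_description.json'
)

url = parse_rmet_line(remote, rmet_line)
# For the s3-PUBLIC remote this is a presigned URL (valid 7 days by default,
# since presign_remote_url uses ExpiresIn=604800), e.g. illustratively:
#   https://s3.amazonaws.com/<AWS_S3_PUBLIC_BUCKET>/ds002778/dataset_description.json
#       ?versionId=iVcEk...&AWSAccessKeyId=...&Signature=...&Expires=...
# Any other remote falls back to the anonymous URL built from remote['url']
# and passed through encode_remote_url(), as before this change.
```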