
Commit 6148042

Merge pull request #3643 from OpenNeuroOrg/private-exports
Export all datasets and manage object tags for access control
2 parents 4b97814 + 4e26666 commit 6148042

9 files changed, +126 -8 lines changed

services/datalad/datalad_service/common/openneuro.py

Lines changed: 27 additions & 0 deletions
@@ -73,3 +73,30 @@ def update_file_check(dataset_path, commit, references, bad_files, remote=None):
     except requests.exceptions.HTTPError as e:
         logging.error(e)
         logging.error(req.text)
+
+
+def is_public_dataset(dataset_id):
+    """Ask the OpenNeuro API if this dataset is public."""
+    try:
+        logger = logging.getLogger(__name__)
+        response = requests.post(
+            GRAPHQL_ENDPOINT,
+            json={
+                'query': 'query($datasetId: ID!) { dataset(id: $datasetId) { public } }',
+                'variables': {'datasetId': dataset_id},
+            },
+            headers={'authorization': f'Bearer {generate_service_token(dataset_id)}'},
+        )
+        response.raise_for_status()
+        data = response.json()
+        if 'errors' in data:
+            logger.error(
+                f'GraphQL error checking public status for {dataset_id}: {data["errors"]}'
+            )
+            raise Exception(
+                f'GraphQL error checking public status for {dataset_id}: {data["errors"]}'
+            )
+        return data.get('data', {}).get('dataset', {}).get('public', False)
+    except requests.exceptions.RequestException as e:
+        logger.error(f'Failed to check public status for {dataset_id}: {e}')
+        raise

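For reference, a minimal sketch (not part of this commit) of unit-testing is_public_dataset by stubbing the token helper and the GraphQL request with pytest's monkeypatch; the dataset id, test name, and fake response class are illustrative assumptions:

import datalad_service.common.openneuro as openneuro


def test_is_public_dataset_true(monkeypatch):
    class FakeResponse:
        # Mimics only the parts of requests.Response that is_public_dataset uses
        def raise_for_status(self):
            pass

        def json(self):
            return {'data': {'dataset': {'public': True}}}

    # Avoid real token generation and any network access
    monkeypatch.setattr(openneuro, 'generate_service_token', lambda dataset_id: 'token')
    monkeypatch.setattr(openneuro.requests, 'post', lambda *args, **kwargs: FakeResponse())
    assert openneuro.is_public_dataset('ds000001') is True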
services/datalad/datalad_service/common/s3.py

Lines changed: 1 addition & 0 deletions
@@ -72,6 +72,7 @@ def backup_remote_env():
 def setup_s3_sibling(dataset_path):
     """Add a sibling for an S3 bucket publish."""
     # Public remote
+    # TODO set ['x-amz-tagging=access=private'] if tagging is supported in git-annex
     subprocess.run(
         ['git-annex', 'initremote', get_s3_remote()]
         + generate_s3_annex_options(dataset_path),

services/datalad/datalad_service/handlers/dataset.py

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ async def on_post(self, req, resp, dataset):
             author = pygit2.Signature(name, email)
         else:
             author = None
-        hexsha = await create_dataset(self.store, dataset, author)
+        hexsha = await create_dataset(ds_path, author)
         resp.media = {'hexsha': hexsha}
         resp.status = falcon.HTTP_OK

services/datalad/datalad_service/handlers/publish.py

Lines changed: 13 additions & 2 deletions
@@ -1,6 +1,12 @@
 import falcon
 
-from datalad_service.tasks.publish import create_remotes_and_export
+from taskiq_pipelines import Pipeline
+
+from datalad_service.broker import broker
+from datalad_service.tasks.publish import (
+    create_remotes_and_export,
+    set_s3_access_tag,
+)
 
 
 class PublishResource:
@@ -11,6 +17,11 @@ def __init__(self, store):
 
     async def on_post(self, req, resp, dataset):
         dataset_path = self.store.get_dataset_path(dataset)
-        await create_remotes_and_export.kiq(dataset_path)
+        # Pipeline create and export -> set access tag to public
+        await (
+            Pipeline(broker, create_remotes_and_export)
+            .call_after(set_s3_access_tag, dataset=dataset, value='public')
+            .kiq(dataset_path)  # create_remotes_and_export
+        )
         resp.media = {}
         resp.status = falcon.HTTP_OK

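Because set_s3_access_tag is a broker task, the tagging step could in principle also be queued on its own, outside the publish pipeline. A minimal sketch under that assumption (the wrapper function name and calling context are illustrative):

from datalad_service.tasks.publish import set_s3_access_tag


async def retag_dataset_public(dataset_id):
    # Enqueue only the tagging step, without re-running create_remotes_and_export
    await set_s3_access_tag.kiq(dataset=dataset_id, value='public')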
services/datalad/datalad_service/tasks/dataset.py

Lines changed: 3 additions & 2 deletions
@@ -13,6 +13,7 @@
 
 from datalad_service.common.annex import init_annex
 from datalad_service.common.git import git_commit, COMMITTER_EMAIL, COMMITTER_NAME
+from datalad_service.tasks.publish import create_remotes
 
 # A list of patterns to avoid annexing in BIDS datasets
 GIT_ATTRIBUTES = """* annex.backend=SHA256E
@@ -44,12 +45,11 @@ def create_datalad_config(dataset_path):
         configfile.write(config)
 
 
-async def create_dataset(store, dataset, author=None, initial_head='main'):
+async def create_dataset(dataset_path, author=None, initial_head='main'):
     """Create a DataLad git-annex repo for a new dataset.
 
     initial_head is only meant for tests and is overridden by the implementation of git_commit
     """
-    dataset_path = store.get_dataset_path(dataset)
     if os.path.isdir(dataset_path):
         raise Exception('Dataset already exists')
     if not author:
@@ -70,6 +70,7 @@ async def create_dataset(store, dataset, author=None, initial_head='main'):
         '[OpenNeuro] Dataset created',
         parents=[],
     )
+    create_remotes(dataset_path)
     return str(repo.head.target)
 
 

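With the store argument removed, callers resolve the dataset path themselves before calling create_dataset, as the handler and test changes below show. A brief sketch of that calling convention (the wrapper function and author email are illustrative assumptions):

import os

import pygit2

from datalad_service.tasks.dataset import create_dataset


async def create(store, dataset_id):
    # Resolve the path from the store, then pass only the path to create_dataset
    ds_path = os.path.join(store.annex_path, dataset_id)
    author = pygit2.Signature('test author', 'author@example.com')
    return await create_dataset(ds_path, author)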
services/datalad/datalad_service/tasks/publish.py

Lines changed: 48 additions & 1 deletion
@@ -21,7 +21,7 @@
 from datalad_service.config import GCP_ACCESS_KEY_ID
 from datalad_service.config import GCP_SECRET_ACCESS_KEY
 from datalad_service.common.annex import get_tag_info, is_git_annex_remote
-from datalad_service.common.openneuro import clear_dataset_cache
+from datalad_service.common.openneuro import clear_dataset_cache, is_public_dataset
 from datalad_service.common.git import git_show, git_tag, git_tag_tree
 from datalad_service.common.github import github_export
 from datalad_service.common.s3 import (
@@ -126,6 +126,8 @@ async def export_dataset(
     if tags:
         new_tag = tags[-1].name
         await s3_export(dataset_path, get_s3_remote(), new_tag)
+        if not is_public_dataset(dataset_id):
+            await set_s3_access_tag(dataset_id, 'private')
     await s3_backup_push(dataset_path)
     # Once all S3 tags are exported, update GitHub
     if github_enabled:
@@ -274,3 +276,48 @@ async def annex_drop(fsck_success, dataset_path, branch):
     await run_check(
         ['git-annex', 'drop', '--branch', branch], dataset_path, env=env
     )
+
+
+async def set_remote_public(dataset):
+    """Clear x-amz-meta-access when a dataset is made public."""
+    # If git-annex supports tags in the future, we'd modify this here.
+    # await run_check(
+    #     ['git-annex', 'enableremote', get_s3_remote(), 'x-amz-tagging=access=public'],
+    #     dataset_path,
+    # )
+    await set_s3_access_tag(dataset, 'public')
+
+
+@broker.task
+async def set_s3_access_tag(dataset, value='private'):
+    """Set access tag on all versions of all files."""
+    client = boto3.client(
+        's3',
+        aws_access_key_id=AWS_ACCESS_KEY_ID,
+        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
+    )
+    s3_bucket = get_s3_bucket()
+    paginator = client.get_paginator('list_object_versions')
+    for page in paginator.paginate(Bucket=s3_bucket, Prefix=f'{dataset}/'):
+        for version in page.get('Versions', []):
+            key = version['Key']
+            version_id = version['VersionId']
+            try:
+                response = client.get_object_tagging(
+                    Bucket=s3_bucket, Key=key, VersionId=version_id
+                )
+                tag_set = response.get('TagSet', [])
+            except client.exceptions.ClientError as e:
+                if e.response['Error']['Code'] == 'NoSuchTagSet':
+                    tag_set = []
+                else:
+                    raise
+            # Remove any existing access tag and add the new one
+            new_tags = [tag for tag in tag_set if tag['Key'] != 'access']
+            new_tags.append({'Key': 'access', 'Value': value})
+            client.put_object_tagging(
+                Bucket=s3_bucket,
+                Key=key,
+                VersionId=version_id,
+                Tagging={'TagSet': new_tags},
+            )

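One sketch (an assumption, not part of this commit) of how the access tag could enforce access control on the export bucket: an S3 bucket policy that allows anonymous GetObject only on objects tagged access=public. The bucket name reuses the fake value from the tests and the policy wording is illustrative; s3:ExistingObjectTag is a standard S3 condition key:

import json

import boto3

policy = {
    'Version': '2012-10-17',
    'Statement': [
        {
            'Sid': 'PublicReadOnlyForPublicTaggedObjects',
            'Effect': 'Allow',
            'Principal': '*',
            'Action': 's3:GetObject',
            'Resource': 'arn:aws:s3:::a-fake-test-public-bucket/*',
            # Objects tagged access=private by set_s3_access_tag fall outside this grant
            'Condition': {'StringEquals': {'s3:ExistingObjectTag/access': 'public'}},
        }
    ],
}

client = boto3.client('s3')
client.put_bucket_policy(Bucket='a-fake-test-public-bucket', Policy=json.dumps(policy))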
services/datalad/tests/conftest.py

Lines changed: 9 additions & 0 deletions
@@ -4,6 +4,7 @@
 import json
 import subprocess
 import random
+from unittest import mock
 
 import pytest
 from falcon import testing
@@ -249,3 +250,11 @@ async def async_noop_validator(*args, **kwargs):
     monkeypatch.setattr(
         'datalad_service.tasks.files.validate_dataset', async_noop_validator
     )
+
+
+@pytest.fixture(autouse=True)
+def is_public_dataset_mock(monkeypatch):
+    def _mock(dataset_id):
+        return True
+
+    monkeypatch.setattr('datalad_service.tasks.publish.is_public_dataset', _mock)

services/datalad/tests/test_common_s3.py

Lines changed: 20 additions & 0 deletions
@@ -37,6 +37,26 @@ def test_s3_annex_options(monkeypatch):
     # Check prefix and bucket strings are interpolated right
     assert 'fileprefix=test00001/' in options
     assert 'bucket=a-fake-test-public-bucket' in options
+    # Check that the private tag is applied by default
+    # assert 'x-amz-tagging=access=private' in options
+
+
+def test_s3_annex_backup_options(monkeypatch):
+    monkeypatch.setattr(
+        datalad_service.config, 'AWS_S3_PUBLIC_BUCKET', 'a-fake-test-public-bucket'
+    )
+    monkeypatch.setattr(datalad_service.config, 'GCP_S3_BACKUP_BUCKET', 'backup-bucket')
+    options = generate_s3_annex_options(
+        '/tmp/dataset/does/not/exist/test00001', backup=True
+    )
+    assert 'type=S3' in options
+    # Verify public=no (ACL deprecation)
+    assert 'public=no' in options
+    # Verify autoenable=true is not present
+    assert 'autoenable=true' not in options
+    # Check prefix and bucket strings are interpolated right
+    assert 'fileprefix=test00001/' in options
+    assert 'bucket=backup-bucket' in options
 
 
 def test_update_s3_sibling(monkeypatch, no_init_remote, new_dataset):

services/datalad/tests/test_datalad.py

Lines changed: 4 additions & 2 deletions
@@ -16,7 +16,8 @@
 async def test_create_dataset(datalad_store):
     ds_id = 'ds000002'
     author = pygit2.Signature('test author', '[email protected]')
-    await create_dataset(datalad_store, ds_id, author)
+    ds_path = os.path.join(datalad_store.annex_path, ds_id)
+    await create_dataset(ds_path, author)
     ds = Dataset(os.path.join(datalad_store.annex_path, ds_id))
     assert ds.repo is not None
     # Verify the dataset is created with datalad config
@@ -52,10 +53,11 @@ async def test_create_dataset_master(datalad_store):
 async def test_create_dataset_unusual_default_branch(datalad_store):
     ds_id = 'ds000026'
     author = pygit2.Signature('test author', '[email protected]')
+    ds_path = os.path.join(datalad_store.annex_path, ds_id)
     # Create dataset will commit data and this should fail since HEAD is something 'unusual'
     # (such as the git-annex branch as a plausible example)
     with pytest.raises(OpenNeuroGitError) as e:
-        await create_dataset(datalad_store, ds_id, author, 'unusual')
+        await create_dataset(ds_path, author, 'unusual')
 
 
 async def test_delete_dataset(datalad_store, new_dataset):
