
Commit 9388b30

fix(worker): Implement workaround for git-annex initremote on GCP object storage with fileprefix
1 parent 4280ab7 commit 9388b30
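
For context: `git-annex initremote` against GCP object storage (GCS's S3-compatible endpoint) fails when a `fileprefix` is configured, so this commit records the remote manually instead of calling initremote. A minimal sketch of the kind of invocation being worked around; every option value below is an illustrative placeholder, with only `type=S3` and `fileprefix` taken from this commit:

import subprocess

# Sketch of the direct initremote call this commit avoids (hypothetical
# remote name, bucket, and dataset path).
subprocess.run(
    [
        'git-annex', 'initremote', 's3-BACKUP',
        'type=S3',
        'encryption=none',
        'host=storage.googleapis.com',
        'bucket=backup-bucket',
        'fileprefix=ds000001/',
    ],
    cwd='/datasets/ds000001',
    check=True,
)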

File tree

3 files changed: +106 −1 lines changed


services/datalad/datalad_service/common/annex.py

Lines changed: 57 additions & 0 deletions
@@ -5,6 +5,8 @@
 import subprocess
 import os
 import urllib.parse
+import time
+import uuid
 
 import aiofiles
 import pygit2
@@ -22,6 +24,12 @@
 ]
 
 
+class InitRemoteException(Exception):
+    """Raised when git-annex initremote fails."""
+
+    pass
+
+
 class EditAnnexedFileException(Exception):
     """Snapshot conflicts with existing name."""
 
@@ -380,3 +388,52 @@ def test_key_remote(dataset_path, key, remote_name='s3-PUBLIC'):
         if remote['uuid'] in line:
             return parse_rmet_line(remote, line)
     return None
+
+
+def annex_initremote(dataset_path, remote_name, remote_options):
+    """Initialize a git-annex remote manually."""
+    remote_uuid = str(uuid.uuid4())
+    repo = pygit2.Repository(dataset_path)
+    branch = repo.branches.get('git-annex')
+    if not branch:
+        raise InitRemoteException('git-annex branch not found')
+    commit = branch.peel()
+    tree = commit.tree
+    # Read remote.log
+    file_content = ''
+    log_entry = None
+    try:
+        # Check if 'remote.log' exists in the tree
+        log_entry = tree['remote.log']
+        blob = repo.get(log_entry.id)
+
+        # Ensure the file is not binary
+        if not blob.is_binary:
+            file_content = blob.data.decode('utf-8')
+        else:
+            raise InitRemoteException("'remote.log' is a binary file, cannot read.")
+    except KeyError:
+        # 'remote.log' doesn't exist yet, we'll create it.
+        file_content = ''  # Ensure it's an empty string
+
+    new_content = (
+        file_content
+        + f'{remote_uuid} type=S3 name={remote_name} timestamp={int(time.time())}s '
+        + ' '.join(remote_options)
+    )
+    new_blob_oid = repo.create_blob(new_content.encode('utf-8'))
+    builder = repo.TreeBuilder(tree)
+    builder.insert('remote.log', new_blob_oid, pygit2.GIT_FILEMODE_BLOB)
+    new_tree_oid = builder.write()
+    author = pygit2.Signature(SERVICE_USER, SERVICE_EMAIL)
+    committer = pygit2.Signature(SERVICE_USER, SERVICE_EMAIL)
+    commit_message = f'[OpenNeuro] Initialize git-annex remote {remote_name}'
+    repo.create_commit(
+        branch.name,  # The ref to update (our branch)
+        author,  # The commit author
+        committer,  # The commit committer
+        commit_message,  # The commit message
+        new_tree_oid,  # The OID of the new tree
+        [commit.id],  # List of parent commit OIDs
+    )
+    return remote_uuid
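
For illustration, a hypothetical call to `annex_initremote` and the `remote.log` line it appends on the `git-annex` branch. The remote name, options, uuid, and timestamp below are placeholders; the real options come from `generate_s3_annex_options` in s3.py:

# Hypothetical usage; all values are illustrative only.
uuid = annex_initremote(
    '/datasets/ds000001',
    's3-BACKUP',
    ['encryption=none', 'bucket=backup-bucket', 'fileprefix=ds000001/'],
)
# remote.log on the git-annex branch then gains one entry shaped like:
# 3f1c9d2e-... type=S3 name=s3-BACKUP timestamp=1717000000s encryption=none bucket=backup-bucket fileprefix=ds000001/

Committing this line directly mirrors what initremote itself would record, without ever contacting the storage endpoint.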

services/datalad/datalad_service/common/s3.py

Lines changed: 48 additions & 0 deletions
@@ -1,7 +1,11 @@
 import os
 import subprocess
 
+import boto3
+from botocore.config import Config
+
 import datalad_service.config
+from datalad_service.common.annex import annex_initremote
 
 
 class S3ConfigException(Exception):
@@ -59,6 +63,13 @@ def backup_remote_env():
     backup_remote_env['AWS_SECRET_ACCESS_KEY'] = backup_remote_env[
         'GCP_SECRET_ACCESS_KEY'
     ]
+    # Overwrite the AWS keys with the GCP keys from config
+    backup_remote_env['AWS_ACCESS_KEY_ID'] = getattr(
+        datalad_service.config, 'GCP_ACCESS_KEY_ID'
+    )
+    backup_remote_env['AWS_SECRET_ACCESS_KEY'] = getattr(
+        datalad_service.config, 'GCP_SECRET_ACCESS_KEY'
+    )
     return backup_remote_env
@@ -83,6 +94,43 @@ def setup_s3_backup_sibling(dataset_path):
     )
 
 
+def setup_s3_backup_sibling_workaround(dataset_path):
+    """setup_s3_backup_sibling with a workaround for a git-annex bug."""
+    dataset_id = os.path.basename(dataset_path)
+    uuid = annex_initremote(
+        dataset_path,
+        get_s3_backup_remote(),
+        generate_s3_annex_options(dataset_path, backup=True),
+    )
+    # Manually upload the remote uuid file to the bucket
+    aws_access_key_id = getattr(datalad_service.config, 'GCP_ACCESS_KEY_ID')
+    aws_secret_access_key = getattr(datalad_service.config, 'GCP_SECRET_ACCESS_KEY')
+    gcp_config = Config(
+        region_name='auto',
+        signature_version='s3v4',
+        # This is required for GCP compatibility with boto3
+        request_checksum_calculation='when_required',
+    )
+    s3 = boto3.client(
+        's3',
+        aws_access_key_id=aws_access_key_id,
+        aws_secret_access_key=aws_secret_access_key,
+        endpoint_url='https://storage.googleapis.com',
+        config=gcp_config,
+    )
+    s3.put_object(
+        Bucket=get_s3_backup_bucket(),
+        Key=f'{dataset_id}/annex-uuid',
+        Body=uuid.encode('utf-8'),
+    )
+    # Run enableremote afterwards to finish setting up the remote
+    subprocess.run(
+        ['git-annex', 'enableremote', get_s3_backup_remote()],
+        cwd=dataset_path,
+        env=backup_remote_env(),
+    )
+
+
 def update_s3_sibling(dataset_path):
     """Update S3 remote with latest config."""
     # note: enableremote command will only upsert config options, none are deleted
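
Because `annex_initremote` never contacts the bucket, the `{dataset_id}/annex-uuid` marker that initremote would normally write is uploaded by hand above. A sketch of how one might verify the marker landed, reusing the endpoint and checksum settings from this diff; the bucket name, key, and credentials are placeholders:

import boto3
from botocore.config import Config

s3 = boto3.client(
    's3',
    aws_access_key_id='GOOG...',  # placeholder GCP HMAC key
    aws_secret_access_key='...',  # placeholder GCP HMAC secret
    endpoint_url='https://storage.googleapis.com',
    config=Config(
        region_name='auto',
        signature_version='s3v4',
        request_checksum_calculation='when_required',
    ),
)
obj = s3.get_object(Bucket='backup-bucket', Key='ds000001/annex-uuid')
print(obj['Body'].read().decode('utf-8'))  # should print the remote uuid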

services/datalad/datalad_service/tasks/publish.py

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ def s3_sibling(dataset_path):
     if not is_git_annex_remote(dataset_path, get_s3_remote()):
         datalad_service.common.s3.setup_s3_sibling(dataset_path)
     if not is_git_annex_remote(dataset_path, get_s3_backup_remote()):
-        datalad_service.common.s3.setup_s3_backup_sibling(dataset_path)
+        datalad_service.common.s3.setup_s3_backup_sibling_workaround(dataset_path)
 
 
 @broker.task
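
A quick sanity check one might run after publishing, to confirm the workaround left the remote usable (not part of this commit; the remote name and dataset path are placeholders):

import subprocess

# 'git-annex info <remote>' reports the remote's configuration, including
# the uuid recorded in remote.log, once enableremote has succeeded.
result = subprocess.run(
    ['git-annex', 'info', 's3-BACKUP'],
    cwd='/datasets/ds000001',
    capture_output=True,
    text=True,
    check=True,
)
print(result.stdout)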
