Commit 0d3634f

Merge pull request #3658 from OpenNeuroOrg/multiple-fsck-drop
refactor(worker): Run one fsck/drop on all branches
2 parents: 6534d8c + 66105e9

2 files changed (+16, -14 lines)

services/datalad/datalad_service/tasks/fsck.py
4 additions, 4 deletions

@@ -58,7 +58,7 @@ def git_annex_fsck_local(dataset_path):


 @broker.task
-async def git_annex_fsck_remote(dataset_path, branch, remote='s3-PUBLIC'):
+async def git_annex_fsck_remote(dataset_path, branches, remote='s3-PUBLIC'):
     """
     Run incremental fsck for one branch (tag) and remote.

@@ -71,16 +71,16 @@ async def git_annex_fsck_remote(dataset_path, branch, remote='s3-PUBLIC'):
         logging.error(f'Could not open git repository for {dataset_path}')
         return False

-    annex_command = (
+    annex_command = [
         'git-annex',
         'fsck',
         '-J4',
-        f'--branch={branch}',
         f'--from={remote}',
         '--json',
         '--json-error-messages',
         '--incremental-schedule=365d',
-    )
+    ]
+    annex_command += [f'--branch={branch}' for branch in branches]
     annex_process = subprocess.Popen(
         annex_command, cwd=dataset_path, stdout=subprocess.PIPE
     )
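
For context on the fsck.py change: building the command as a list and appending one --branch option per entry means a single git-annex fsck invocation now covers every branch, rather than one run per tag. Below is a minimal standalone sketch of that construction; the flags and remote default mirror the diff above, but the branch names are hypothetical examples, not values from the commit.

# Sketch only: reproduces the list-building pattern from the diff above.
# The tag names are hypothetical; the flags match those in the commit.
branches = ['1.0.0', '1.0.1', '1.1.0']
remote = 's3-PUBLIC'

annex_command = [
    'git-annex',
    'fsck',
    '-J4',
    f'--from={remote}',
    '--json',
    '--json-error-messages',
    '--incremental-schedule=365d',
]
annex_command += [f'--branch={branch}' for branch in branches]

print(annex_command)
# Expected argv ends with one --branch flag per tag:
# [..., '--incremental-schedule=365d',
#  '--branch=1.0.0', '--branch=1.0.1', '--branch=1.1.0']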

services/datalad/datalad_service/tasks/publish.py
12 additions, 10 deletions

@@ -80,15 +80,15 @@ def create_remotes(dataset_path):
     github_sibling(dataset_path, dataset)


-async def fsck_and_drop(dataset_path, branch):
+async def fsck_and_drop(dataset_path, branches):
     # Check and clean local annexed files once export is complete
-    fsck_success = await git_annex_fsck_remote(dataset_path, branch, get_s3_remote())
+    fsck_success = await git_annex_fsck_remote(dataset_path, branches, get_s3_remote())
     if fsck_success:
-        logger.info(f'{dataset_path} remote fsck passed for {branch}')
-        await annex_drop(dataset_path, branch)
+        logger.info(f'{dataset_path} remote fsck passed')
+        await annex_drop(dataset_path, branches)
         logger.info(f'{dataset_path} drop complete')
     else:
-        logger.error(f'{dataset_path} remote fsck failed for {branch}')
+        logger.error(f'{dataset_path} remote fsck failed')


 @broker.task
@@ -112,8 +112,8 @@ async def export_backup_and_drop(dataset_path):
         if tag == tags[-1] and export_ran:
             # Always export the most recent tag again if any export ran
             await s3_export(dataset_path, get_s3_remote(), tag.name)
-            await fsck_and_drop(dataset_path, tag.name)
-            logger.info(f'Exporting/dropping tag {dataset_id}@{tag.name} complete')
+            await fsck_and_drop(dataset_path, [tag.name for tag in tags])
+            logger.info(f'Exporting/dropping tags for {dataset_id} complete')
     if not public_dataset:
         logger.info(f'Setting access tag for {dataset_id}')
         await set_s3_access_tag(dataset_id, 'private')
@@ -167,7 +167,7 @@ async def export_dataset(
         # Drop cache once all exports are complete
         clear_dataset_cache(dataset_id)
         # Check and clean local annexed files once export is complete
-        await fsck_and_drop(dataset_path, new_tag)
+        await fsck_and_drop(dataset_path, [new_tag])
     else:
         # Clear the cache even if only sibling updates occurred
         clear_dataset_cache(dataset_id)
@@ -277,7 +277,7 @@ def monitor_remote_configs(dataset_path):


 @broker.task
-async def annex_drop(dataset_path, branch):
+async def annex_drop(dataset_path, branches):
     """Drop local contents from the annex."""
     # Ensure numcopies is set to 2 before running drop
     await run_check(['git-annex', 'numcopies', '2'], dataset_path)
@@ -293,7 +293,9 @@ async def annex_drop(dataset_path, branch):
     # Force git-annex to use cached credentials for this
     del env['AWS_ACCESS_KEY_ID']
     del env['AWS_SECRET_ACCESS_KEY']
-    await run_check(['git-annex', 'drop', '--branch', branch], dataset_path, env=env)
+    command = ['git-annex', 'drop']
+    command += [f'--branch={branch}' for branch in branches]
+    await run_check(command, dataset_path, env=env)


 async def set_remote_public(dataset):
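
On the publish.py side, callers now hand fsck_and_drop and annex_drop a list of branch names rather than a single tag, and annex_drop assembles one git-annex drop call covering all of them. A small illustrative sketch of the new calling convention, assuming hypothetical tag names (not values from the commit):

# Sketch only: shows the list-based call pattern introduced by this commit.
# The tag names below are made up for illustration.
tags = ['1.0.0', '1.0.1', '2.0.0']

# Previously: one fsck_and_drop(dataset_path, tag.name) call per exported tag.
# Now: a single call with every tag name, as in the diff above:
#   await fsck_and_drop(dataset_path, [tag.name for tag in tags])

# Inside annex_drop, the drop command is built the same way as the fsck command:
command = ['git-annex', 'drop']
command += [f'--branch={branch}' for branch in tags]
print(command)
# ['git-annex', 'drop', '--branch=1.0.0', '--branch=1.0.1', '--branch=2.0.0']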
