From c9afcd27592ed43e76914673f922a6bc113ecbc0 Mon Sep 17 00:00:00 2001 From: Nell Hardcastle Date: Wed, 19 Nov 2025 12:22:20 -0800 Subject: [PATCH] fix(worker): Skip re-export on drop if remote contains all files for a branch or tag --- .../datalad/datalad_service/common/asyncio.py | 1 + .../datalad/datalad_service/tasks/publish.py | 21 +++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/services/datalad/datalad_service/common/asyncio.py b/services/datalad/datalad_service/common/asyncio.py index 91bdc292a..a13acb995 100644 --- a/services/datalad/datalad_service/common/asyncio.py +++ b/services/datalad/datalad_service/common/asyncio.py @@ -22,3 +22,4 @@ async def run_check(command, dataset_path, env=None): stdout, stderr = await process.communicate() if process.returncode != 0: raise subprocess.CalledProcessError(process.returncode, command, stdout, stderr) + return stdout.decode('utf-8') diff --git a/services/datalad/datalad_service/tasks/publish.py b/services/datalad/datalad_service/tasks/publish.py index d37a63110..401271ed0 100644 --- a/services/datalad/datalad_service/tasks/publish.py +++ b/services/datalad/datalad_service/tasks/publish.py @@ -105,8 +105,12 @@ async def export_backup_and_drop(dataset_path): await s3_backup_push(dataset_path) for tag in tags: logger.info(f'Exporting/dropping tag {dataset_id}@{tag.name}') - # Private datasets need to export each tag for this step - if not public_dataset: + export_ran = False + if await find_in_remote(dataset_path, tag.name, get_s3_remote()): + export_ran = True + await s3_export(dataset_path, get_s3_remote(), tag.name) + if tag == tags[-1] and export_ran: + # Always export the most recent tag again if any export ran await s3_export(dataset_path, get_s3_remote(), tag.name) await fsck_and_drop(dataset_path, tag.name) logger.info(f'Exporting/dropping tag {dataset_id}@{tag.name} complete') @@ -116,6 +120,19 @@ async def export_backup_and_drop(dataset_path): logger.info(f'{dataset_id} export_backup_and_drop complete') +async def find_in_remote(dataset_path, tag, remote): + """Check if any git-annex objects available locally for a branch are not present in a remote.""" + output = await run_check( + ['git-annex', 'find', f'--branch={tag}', '--not', f'--in={remote}'], + dataset_path, + ) + if len(output) > 0: + # Some keys are missing + return False + # All keys are present + return True + + @broker.task async def export_dataset( dataset_path,