diff --git a/services/datalad/datalad_service/handlers/drop.py b/services/datalad/datalad_service/handlers/drop.py
index d636128a8..41f98aa35 100644
--- a/services/datalad/datalad_service/handlers/drop.py
+++ b/services/datalad/datalad_service/handlers/drop.py
@@ -1,6 +1,6 @@
 import falcon
 
-from datalad_service.tasks.publish import annex_drop
+from datalad_service.tasks.publish import export_backup_and_drop
 
 
 class DropResource:
@@ -11,6 +11,6 @@ def __init__(self, store):
 
     async def on_post(self, req, resp, dataset):
         dataset_path = self.store.get_dataset_path(dataset)
-        await annex_drop.kiq(dataset_path)
+        await export_backup_and_drop.kiq(dataset_path)
         resp.media = {}
         resp.status = falcon.HTTP_OK
diff --git a/services/datalad/datalad_service/tasks/publish.py b/services/datalad/datalad_service/tasks/publish.py
index 6da59c347..aae7320ed 100644
--- a/services/datalad/datalad_service/tasks/publish.py
+++ b/services/datalad/datalad_service/tasks/publish.py
@@ -79,6 +79,31 @@ def create_remotes(dataset_path):
     github_sibling(dataset_path, dataset)
 
 
+@broker.task
+async def export_backup_and_drop(dataset_path):
+    """
+    Export the dataset to the S3 backup remote, verify s3-PUBLIC, and drop local data.
+    """
+    repo = pygit2.Repository(dataset_path)
+    tags = sorted(git_tag(repo), key=lambda tag: tag.name)
+    if tags:
+        s3_backup_push(dataset_path)
+        for tag in tags:
+            # Check and clean local annexed files once export is complete
+            pipeline = Pipeline(broker, git_annex_fsck_remote).call_next(
+                annex_drop,
+                dataset_path=dataset_path,
+                branch=tag.name,
+                remote=get_s3_remote(),
+            )
+            # Call the pipeline (arguments for git_annex_fsck_remote)
+            await pipeline.kiq(
+                dataset_path,
+                branch=tag.name,  # Check the history at each exported tag
+                remote=get_s3_remote(),
+            )
+
+
 @broker.task
 async def export_dataset(
     dataset_path,
@@ -111,7 +136,10 @@ async def export_dataset(
         clear_dataset_cache(dataset_id)
         # Check and clean local annexed files once export is complete
         pipeline = Pipeline(broker, git_annex_fsck_remote).call_next(
-            annex_drop, dataset_path=dataset_path, branch=new_tag
+            annex_drop,
+            dataset_path=dataset_path,
+            branch=new_tag,
+            remote=get_s3_remote(),
         )
         # Call the pipeline (arguments for git_annex_fsck_remote)
         await pipeline.kiq(
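
For reviewers unfamiliar with taskiq-pipelines, below is a minimal, self-contained sketch of the fsck-then-drop chaining used in both hunks. The broker choice, task bodies, dataset path, tag, and the 's3-PUBLIC' remote name are illustrative stand-ins, not the OpenNeuro implementations; the point is the chaining semantics, where call_next schedules the next task only after the previous one completes and passes its return value along as the next task's first positional argument.

import asyncio

from taskiq import InMemoryBroker
from taskiq_pipelines import Pipeline, PipelineMiddleware

# Pipelines require the middleware to be registered on the broker.
broker = InMemoryBroker().with_middlewares(PipelineMiddleware())


@broker.task
async def verify_remote(dataset_path: str, branch: str, remote: str) -> bool:
    # Stand-in for git_annex_fsck_remote: confirm the remote holds valid
    # copies before anything local is dropped.
    print(f'fsck {dataset_path} at {branch} against {remote}')
    return True


@broker.task
async def drop_local(verified: bool, dataset_path: str, branch: str, remote: str) -> None:
    # Stand-in for annex_drop: the previous step's return value arrives as
    # the first positional argument; the rest come from call_next kwargs.
    if verified:
        print(f'dropping local annexed files in {dataset_path} for {branch}')


async def main() -> None:
    await broker.startup()
    # Mirrors the production call shape: kwargs on call_next configure the
    # second step, kiq arguments start the first step.
    pipeline = Pipeline(broker, verify_remote).call_next(
        drop_local,
        dataset_path='/datasets/ds000001',
        branch='1.0.0',
        remote='s3-PUBLIC',
    )
    task = await pipeline.kiq('/datasets/ds000001', branch='1.0.0', remote='s3-PUBLIC')
    await task.wait_result()
    await broker.shutdown()


asyncio.run(main())

Chaining the drop behind the fsck this way means a failed or incomplete export never triggers a local drop, since the second task is only dispatched once the first finishes successfully.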