Skip to content

Commit 37dc4de

Browse files
authored
Merge pull request #3534 from OpenNeuroOrg/annex-fsck-tasks
Add git-annex fsck checks for datasets
2 parents 38d0344 + c5fb5ae commit 37dc4de

File tree

14 files changed

+174
-38
lines changed

14 files changed

+174
-38
lines changed

services/datalad/datalad_service/common/git.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,8 @@
1010
import pygit2
1111
from charset_normalizer import from_bytes
1212

13+
from datalad_service.common.onchange import on_head
14+
1315
tag_ref = re.compile('^refs/tags/')
1416

1517
COMMITTER_NAME = 'Git Worker'
@@ -130,7 +132,7 @@ def git_rename_master_to_main(repo):
130132
repo.references['HEAD'].set_target('refs/heads/main')
131133

132134

133-
def git_commit(
135+
async def git_commit(
134136
repo, file_paths, author=None, message='[OpenNeuro] Recorded changes', parents=None
135137
):
136138
"""Commit array of paths at HEAD."""
@@ -161,10 +163,10 @@ def git_commit(
161163
sentry_sdk.capture_exception(e)
162164
logger.error(f'Failed to read index after git-annex add: {e}')
163165
raise OpenNeuroGitError(f'Failed to read index: {e}') from e
164-
return git_commit_index(repo, author, message, parents)
166+
return await git_commit_index(repo, author, message, parents)
165167

166168

167-
def git_commit_index(
169+
async def git_commit_index(
168170
repo, author=None, message='[OpenNeuro] Recorded changes', parents=None
169171
):
170172
"""Commit any existing index changes."""
@@ -180,4 +182,5 @@ def git_commit_index(
180182
'refs/heads/main', author, committer, message, tree, parent_commits
181183
)
182184
repo.head.set_target(commit)
185+
await on_head(repo.workdir)
183186
return commit
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
from datalad_service.tasks.fsck import git_annex_fsck_local, git_annex_fsck_remote
2+
3+
4+
async def on_head(dataset_path):
    """Called after any change to HEAD.

    :param dataset_path: filesystem path of the dataset's git working
        directory (callers pass ``repo.workdir`` — a str from pygit2)
    :returns: None

    NOTE(review): ``.kiq`` appears to be a taskiq-style enqueue — this
    presumably only awaits scheduling of the background fsck task, not
    completion of the fsck itself. Confirm against the task broker setup.
    """
    # Schedule a local git-annex fsck for this dataset after every commit.
    await git_annex_fsck_local.kiq(dataset_path)
7+
8+
9+
async def on_tag(dataset_path, tag):
    """Called after any new tag.

    :param dataset_path: filesystem path of the dataset's git working
        directory
    :param tag: the newly created tag (forwarded verbatim to the task)
    :returns: None

    NOTE(review): ``.kiq`` appears to enqueue a background task — this
    awaits scheduling of the remote fsck, not its completion. Confirm
    against the task broker setup.
    """
    # Schedule a git-annex fsck against the remote for the tagged version.
    await git_annex_fsck_remote.kiq(dataset_path, tag)

services/datalad/datalad_service/handlers/dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ async def on_post(self, req, resp, dataset):
4141
author = pygit2.Signature(name, email)
4242
else:
4343
author = None
44-
hexsha = create_dataset(self.store, dataset, author)
44+
hexsha = await create_dataset(self.store, dataset, author)
4545
resp.media = {'hexsha': hexsha}
4646
resp.status = falcon.HTTP_OK
4747

services/datalad/datalad_service/handlers/draft.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,9 @@ async def on_post(self, req, resp, dataset):
5353
# Add all changes to the index
5454
if name and email:
5555
author = pygit2.Signature(name, email)
56-
media_dict['ref'] = str(git_commit(repo, ['.'], author))
56+
media_dict['ref'] = str(await git_commit(repo, ['.'], author))
5757
else:
58-
media_dict['ref'] = str(git_commit(repo, ['.']))
58+
media_dict['ref'] = str(await git_commit(repo, ['.']))
5959
resp.media = media_dict
6060
resp.status = falcon.HTTP_OK
6161
except:

services/datalad/datalad_service/handlers/files.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import pygit2
77

88
from datalad_service.common.git import (
9-
git_show,
109
git_show_content,
1110
git_tree,
1211
OpenNeuroGitError,
@@ -125,7 +124,7 @@ async def on_delete(self, req, resp, dataset):
125124
media_dict['email'] = email
126125
try:
127126
if len(dirs_to_delete) > 0:
128-
remove_files(
127+
await remove_files(
129128
self.store,
130129
dataset,
131130
dirs_to_delete,
@@ -135,7 +134,7 @@ async def on_delete(self, req, resp, dataset):
135134
)
136135
resp.status = falcon.HTTP_INTERNAL_SERVER_ERROR
137136
if len(files_to_delete) > 0:
138-
remove_files(
137+
await remove_files(
139138
self.store,
140139
dataset,
141140
files_to_delete,

services/datalad/datalad_service/handlers/git.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from datalad_service.common.events import log_git_event
88
from datalad_service.common.const import CHUNK_SIZE_BYTES
9+
from datalad_service.common.onchange import on_head
910

1011

1112
cache_control = ['no-cache', 'max-age=0', 'must-revalidate']
@@ -137,11 +138,12 @@ async def on_post(self, req, resp, worker, dataset):
137138
resp.status = falcon.HTTP_OK
138139

139140
# After this request finishes successfully, log it to the OpenNeuro API
140-
def schedule_git_event():
141+
async def schedule_git_event():
142+
await on_head(dataset_path)
141143
for new_commit, new_ref in refs_updated:
142144
log_git_event(dataset, new_commit, new_ref, req.context['token'])
143145

144-
resp.schedule_sync(schedule_git_event)
146+
resp.schedule(schedule_git_event)
145147
else:
146148
resp.status = falcon.HTTP_UNPROCESSABLE_ENTITY
147149

services/datalad/datalad_service/handlers/upload.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ async def move_files_into_repo(
3434
await move_files(upload_path, dataset_path)
3535
if name and email:
3636
author = pygit2.Signature(name, email)
37-
hexsha = str(git_commit(repo, unlock_files, author))
37+
hexsha = str(await git_commit(repo, unlock_files, author))
3838
else:
39-
hexsha = str(git_commit(repo, unlock_files))
39+
hexsha = str(await git_commit(repo, unlock_files))
4040

4141

4242
class UploadResource:

services/datalad/datalad_service/tasks/dataset.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def create_datalad_config(dataset_path):
4242
configfile.write(config)
4343

4444

45-
def create_dataset(store, dataset, author=None, initial_head='main'):
45+
async def create_dataset(store, dataset, author=None, initial_head='main'):
4646
"""Create a DataLad git-annex repo for a new dataset.
4747
4848
initial_head is only meant for tests and is overridden by the implementation of git_commit
@@ -61,7 +61,7 @@ def create_dataset(store, dataset, author=None, initial_head='main'):
6161
# Set a datalad UUID
6262
create_datalad_config(dataset_path)
6363
repo.index.add('.datalad/config')
64-
git_commit(
64+
await git_commit(
6565
repo,
6666
['.gitattributes', '.datalad/config'],
6767
author,

services/datalad/datalad_service/tasks/description.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ async def update_description(store, dataset, description_fields, name=None, emai
3232
path, description, json.dumps(updated, indent=4, ensure_ascii=False)
3333
)
3434
# Commit new content, run validator
35-
commit_files(store, dataset, ['dataset_description.json'])
35+
await commit_files(store, dataset, ['dataset_description.json'])
3636
return updated
3737
else:
3838
return description_json

services/datalad/datalad_service/tasks/files.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
from datalad_service.config import AWS_S3_PUBLIC_BUCKET
2222

2323

24-
def commit_files(store, dataset, files, name=None, email=None, cookies=None):
24+
async def commit_files(store, dataset, files, name=None, email=None, cookies=None):
2525
"""
2626
Commit a list of files with the email and name provided.
2727
@@ -35,9 +35,9 @@ def commit_files(store, dataset, files, name=None, email=None, cookies=None):
3535
and pygit2.Signature(name, email)
3636
or pygit2.Signature(COMMITTER_NAME, COMMITTER_EMAIL)
3737
)
38-
ref = git_commit(repo, files, author)
38+
ref = await git_commit(repo, files, author)
3939
# Run the validator but don't block on the request
40-
asyncio.create_task(validate_dataset.kiq(dataset, dataset_path, str(ref), cookies))
40+
await validate_dataset.kiq(dataset, dataset_path, str(ref), cookies)
4141
return ref
4242

4343

@@ -47,7 +47,7 @@ def get_tree(store, dataset, tree):
4747
return get_repo_files(dataset, dataset_path, tree)
4848

4949

50-
def remove_files(store, dataset, paths, name=None, email=None, cookies=None):
50+
async def remove_files(store, dataset, paths, name=None, email=None, cookies=None):
5151
dataset_path = store.get_dataset_path(dataset)
5252
repo = pygit2.Repository(dataset_path)
5353
if name and email:
@@ -57,7 +57,9 @@ def remove_files(store, dataset, paths, name=None, email=None, cookies=None):
5757
repo.index.remove_all(paths)
5858
repo.index.write()
5959
repo.checkout_index()
60-
hexsha = str(git_commit_index(repo, author, message='[OpenNeuro] Files removed'))
60+
hexsha = str(
61+
await git_commit_index(repo, author, message='[OpenNeuro] Files removed')
62+
)
6163

6264

6365
def parse_s3_annex_url(url, bucket_name=AWS_S3_PUBLIC_BUCKET):

0 commit comments

Comments
 (0)