3 changes: 3 additions & 0 deletions package.json
@@ -54,6 +54,9 @@
"lint-staged": {
"*.{js,jsx,ts,tsx,json}": [
"deno fmt *"
],
"*.py": [
"uv --directory services/datalad run ruff format"
]
},
"dependenciesMeta": {
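Note (not part of the diff): the new *.py rule hands the staged Python paths to ruff through uv before each commit. Below is a rough sketch of the equivalent manual invocation, assuming lint-staged's default behavior of appending matched file paths; the example path is illustrative.

# Sketch only: roughly what the pre-commit hook runs for staged Python files.
# lint-staged appends the matched paths to the configured command by default.
import subprocess

staged_python_files = ['services/datalad/datalad_service/app.py']  # example path
subprocess.run(
    ['uv', '--directory', 'services/datalad', 'run', 'ruff', 'format']
    + staged_python_files,
    check=True,  # a non-zero exit aborts the commit
)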
61 changes: 33 additions & 28 deletions services/datalad/datalad_service/app.py
@@ -22,20 +22,26 @@
from datalad_service.handlers.upload import UploadResource
from datalad_service.handlers.upload_file import UploadFileResource
from datalad_service.handlers.validation import ValidationResource
from datalad_service.handlers.git import GitRefsResource, GitReceiveResource, GitUploadResource
from datalad_service.handlers.git import (
GitRefsResource,
GitReceiveResource,
GitUploadResource,
)
from datalad_service.handlers.annex import GitAnnexResource
from datalad_service.handlers.reexporter import ReexporterResource
from datalad_service.handlers.reset import ResetResource
from datalad_service.handlers.remote_import import RemoteImportResource
from datalad_service.middleware.auth import AuthenticateMiddleware
from datalad_service.middleware.error import CustomErrorHandlerMiddleware


def before_send(event):
"""Drop transactions that are marked as excluded."""
if event.get("transaction") == "excluded":
if event.get('transaction') == 'excluded':
return None
return event


sentry_sdk.init(
# Set traces_sample_rate to 1.0 to capture 100%
# of transactions for performance monitoring.
@@ -44,11 +50,12 @@
# of sampled transactions.
# We recommend adjusting this value in production.
profiles_sample_rate=1.0,
release=f"openneuro-datalad-service@{datalad_service.version.get_version()}",
release=f'openneuro-datalad-service@{datalad_service.version.get_version()}',
server_name=socket.gethostname(),
before_send=before_send,
)


class PathConverter(falcon.routing.converters.BaseConverter):
""": is used because it is human readable as a separator, disallowed in filenames on Windows, and very rare in Unix filenames."""

@@ -58,7 +65,9 @@

def create_app():
if not datalad_service.config.DATALAD_DATASET_PATH:
raise Exception("Required DATALAD_DATASET_PATH environment variable is not defined")
raise Exception(
'Required DATALAD_DATASET_PATH environment variable is not defined'
)

middleware = [AuthenticateMiddleware(), CustomErrorHandlerMiddleware()]

@@ -106,41 +115,37 @@
app.add_route('/datasets/{dataset}/objects/{obj}', dataset_objects)

app.add_route('/datasets/{dataset}/snapshots', dataset_snapshots)
app.add_route('/datasets/{dataset}/snapshots/{snapshot}', dataset_snapshots)
app.add_route('/datasets/{dataset}/snapshots/{snapshot}/files', dataset_files)
app.add_route(
'/datasets/{dataset}/snapshots/{snapshot}', dataset_snapshots)
app.add_route(
'/datasets/{dataset}/snapshots/{snapshot}/files', dataset_files)
app.add_route(
'/datasets/{dataset}/snapshots/{snapshot}/files/{filename:path}', dataset_files)
'/datasets/{dataset}/snapshots/{snapshot}/files/{filename:path}', dataset_files
)
app.add_route(
'/datasets/{dataset}/snapshots/{snapshot}/annex-key/{annex_key}', dataset_annex_objects)
'/datasets/{dataset}/snapshots/{snapshot}/annex-key/{annex_key}',
dataset_annex_objects,
)

app.add_route('/datasets/{dataset}/publish', dataset_publish)
app.add_route('/datasets/{dataset}/reexport-remotes', dataset_reexporter_resources)

app.add_route('/datasets/{dataset}/upload/{upload}', dataset_upload)
app.add_route(
'/datasets/{dataset}/publish', dataset_publish
'/uploads/{worker}/{dataset}/{upload}/{filename:path}', dataset_upload_file
)
app.add_route('/datasets/{dataset}/reexport-remotes',
dataset_reexporter_resources)

app.add_route('/git/{worker}/{dataset}/info/refs', dataset_git_refs_resource)
app.add_route(
'/datasets/{dataset}/upload/{upload}', dataset_upload
'/git/{worker}/{dataset}/git-receive-pack', dataset_git_receive_resource
)
app.add_route(
'/uploads/{worker}/{dataset}/{upload}/{filename:path}', dataset_upload_file
'/git/{worker}/{dataset}/git-upload-pack', dataset_git_upload_resource
)

app.add_route('/git/{worker}/{dataset}/info/refs',
dataset_git_refs_resource)
app.add_route('/git/{worker}/{dataset}/git-receive-pack',
dataset_git_receive_resource)
app.add_route('/git/{worker}/{dataset}/git-upload-pack',
dataset_git_upload_resource)
app.add_route('/git/{worker}/{dataset}/annex/{key}',
dataset_git_annex_resource)
app.add_route('/git/{worker}/{dataset}/annex/{key}', dataset_git_annex_resource)
# Serving keys internally as well (read only)
app.add_route('/datasets/{dataset}/annex/{key}',
dataset_git_annex_resource)
app.add_route('/datasets/{dataset}/annex/{key}', dataset_git_annex_resource)

app.add_route('/datasets/{dataset}/import/{import_id}',
dataset_remote_import_resource)
app.add_route(
'/datasets/{dataset}/import/{import_id}', dataset_remote_import_resource
)

return app
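Note (not part of the diff): several routes above use the custom path converter ({filename:path}). Its convert body is collapsed in this diff, so the sketch below is only an assumption based on the PathConverter docstring: clients send nested paths with ':' as the separator and the converter maps them back to '/'.

# Assumed behavior, not the actual PathConverter code (which is collapsed above):
# ':' in a URL segment stands in for '/' in the real file path.
def convert_path_segment(value: str) -> str:
    return value.replace(':', '/')

print(convert_path_segment('sub-01:anat:sub-01_T1w.nii.gz'))
# sub-01/anat/sub-01_T1w.nii.gz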
117 changes: 86 additions & 31 deletions services/datalad/datalad_service/common/annex.py
@@ -13,17 +13,23 @@

SERVICE_EMAIL = '[email protected]'
SERVICE_USER = 'Git Worker'
S3_BUCKETS_WHITELIST = ['openneuro.org', 'openneuro-dev-datalad-public', 'openneuro-derivatives', 'bobsrepository']
S3_BUCKETS_WHITELIST = [
'openneuro.org',
'openneuro-dev-datalad-public',
'openneuro-derivatives',
'bobsrepository',
]


class EditAnnexedFileException(Exception):
"""Snapshot conflicts with existing name."""

pass


def init_annex(dataset_path):
"""Setup git-annex within an existing git repo"""
subprocess.run(['git-annex', 'init', 'OpenNeuro'],
check=True, cwd=dataset_path)
subprocess.run(['git-annex', 'init', 'OpenNeuro'], check=True, cwd=dataset_path)


def compute_git_hash(path, size):
@@ -55,8 +61,7 @@

def read_ls_tree_line(gitTreeLine, files, symlinkFilenames, symlinkObjects):
"""Read one line of `git ls-tree` and append to the correct buckets of files, symlinks, and objects."""
filename, mode, obj_type, obj_hash, size = parse_ls_tree_line(
gitTreeLine)
filename, mode, obj_type, obj_hash, size = parse_ls_tree_line(gitTreeLine)
# Skip git / datalad files
if filename.startswith('.git'):
return
@@ -65,23 +70,40 @@
if filename == '.gitattributes':
return
# Check if the file is annexed or a submodule
if (mode == '120000'):
if mode == '120000':
# Save annexed file symlinks for batch processing
symlinkFilenames.append(filename)
symlinkObjects.append(obj_hash)
elif (mode == '160000'):
elif mode == '160000':
# Skip submodules
return
else:
# Immediately append regular files
if (size == '-'):
if size == '-':
# Tree objects do not have sizes and are never annexed
files.append(
{'id': obj_hash, 'filename': filename, 'directory': True, 'annexed': False, 'size': 0, 'urls': []})
{
'id': obj_hash,
'filename': filename,
'directory': True,
'annexed': False,
'size': 0,
'urls': [],
}
)
else:
file_id = compute_file_hash(obj_hash, filename)
files.append({'filename': filename, 'size': int(size),
'id': file_id, 'key': obj_hash, 'directory': False, 'urls': [], 'annexed': False})
files.append(
{
'filename': filename,
'size': int(size),
'id': file_id,
'key': obj_hash,
'directory': False,
'urls': [],
'annexed': False,
}
)
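Note (not part of the diff): read_ls_tree_line above consumes lines in the `git ls-tree -l` format. parse_ls_tree_line itself is not shown in this PR, so the parser below is only an assumed sketch of the tuple it needs to return.

# Assumed sketch; the real parse_ls_tree_line is not part of this diff.
# Mode 120000 marks annexed symlinks, 160000 submodules; trees report '-' for size.
def parse_ls_tree_line_sketch(line):
    metadata, filename = line.split('\t', 1)
    mode, obj_type, obj_hash, size = metadata.split()
    return filename, mode, obj_type, obj_hash, size

print(parse_ls_tree_line_sketch(
    '100644 blob dc9dde956f6f28e425a412a4123526e330668e7e     140\tREADME'
))
# ('README', '100644', 'blob', 'dc9dde956f6f28e425a412a4123526e330668e7e', '140')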


def compute_rmet(key, legacy=False):
@@ -95,8 +117,7 @@


def parse_remote_line(remoteLine):
remoteConfig = dict(item.split('=')
for item in remoteLine[37:].split(' '))
remoteConfig = dict(item.split('=') for item in remoteLine[37:].split(' '))
if remoteConfig['type'] == 'S3' and remoteConfig['bucket'] in S3_BUCKETS_WHITELIST:
remoteUuid = remoteLine[0:36]
remoteUrl = remoteConfig['publicurl'] if 'publicurl' in remoteConfig else None
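Note (not part of the diff): a git-annex remote.log line pairs a 36-character remote UUID with space-separated key=value settings, which is what the [0:36] and [37:] slicing above relies on. The line below is invented for illustration.

# Illustrative remote.log line (values made up) showing what the slicing yields.
remote_line = (
    '7da396aa-2161-4541-ab1b-d79bdf18f5a4 '
    'name=s3-PUBLIC type=S3 bucket=openneuro.org '
    'publicurl=https://s3.amazonaws.com/openneuro.org timestamp=1600000000s'
)
remote_uuid = remote_line[0:36]  # '7da396aa-2161-4541-ab1b-d79bdf18f5a4'
remote_config = dict(item.split('=') for item in remote_line[37:].split(' '))
print(remote_config['type'], remote_config['bucket'])
# S3 openneuro.org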
@@ -133,18 +154,21 @@

def encode_remote_url(url):
"""S3 requires some characters to be encoded"""
return urllib.parse.quote_plus(url, safe="/:?=")
return urllib.parse.quote_plus(url, safe='/:?=')


def get_repo_urls(path, files):
"""For each file provided, obtain the rmet data and append URLs if possible."""
# First obtain the git-annex branch objects
gitAnnexBranch = subprocess.Popen(
['git', 'ls-tree', '-l', '-r', 'git-annex'], cwd=path, stdout=subprocess.PIPE, encoding='utf-8')
['git', 'ls-tree', '-l', '-r', 'git-annex'],
cwd=path,
stdout=subprocess.PIPE,
encoding='utf-8',
)
rmetObjects = {}
for line in gitAnnexBranch.stdout:
filename, mode, obj_type, obj_hash, size = parse_ls_tree_line(
line.rstrip())
filename, mode, obj_type, obj_hash, size = parse_ls_tree_line(line.rstrip())
rmetObjects[filename] = obj_hash
if 'remote.log' not in rmetObjects:
# Skip searching for URLs if no remote.log is present
@@ -170,8 +194,15 @@
gitObjects += rmetObjects['trust.log'] + '\n'
gitObjects += rmetObjects['remote.log'] + '\n'
gitObjects += '\n'.join(rmetObjects[rmetPath] for rmetPath in rmetPaths)
catFileProcess = subprocess.run(['git', 'cat-file', '--batch=:::%(objectname)', '--buffer'],
cwd=path, stdout=subprocess.PIPE, input=gitObjects, encoding='utf-8', bufsize=0, text=True)
catFileProcess = subprocess.run(
['git', 'cat-file', '--batch=:::%(objectname)', '--buffer'],
cwd=path,
stdout=subprocess.PIPE,
input=gitObjects,
encoding='utf-8',
bufsize=0,
text=True,
)
catFile = io.StringIO(catFileProcess.stdout)
# Read in trust.log and remote.log first
trustLog = {}
@@ -193,7 +224,11 @@
else:
matchedRemote = parse_remote_line(line)
# X remotes are dead
if matchedRemote and matchedRemote['uuid'] in trustLog and trustLog[matchedRemote['uuid']] == "X":
if (
matchedRemote
and matchedRemote['uuid'] in trustLog
and trustLog[matchedRemote['uuid']] == 'X'
):
continue
if matchedRemote:
remote = matchedRemote
@@ -216,19 +251,27 @@
def get_repo_files(dataset, dataset_path, tree):
"""Read all files in a repo at a given branch, tag, or commit hash."""
gitProcess = subprocess.Popen(
['git', 'ls-tree', '-l', tree], cwd=dataset_path, stdout=subprocess.PIPE, encoding='utf-8')
['git', 'ls-tree', '-l', tree],
cwd=dataset_path,
stdout=subprocess.PIPE,
encoding='utf-8',
)
files = []
symlinkFilenames = []
symlinkObjects = []
for line in gitProcess.stdout:
gitTreeLine = line.rstrip()
read_ls_tree_line(gitTreeLine, files,
symlinkFilenames, symlinkObjects)
read_ls_tree_line(gitTreeLine, files, symlinkFilenames, symlinkObjects)
# After regular files, process all symlinks with one git cat-file --batch call
# This is about 100x faster than one call per file for annexed file heavy datasets
catFileInput = '\n'.join(symlinkObjects)
catFileProcess = subprocess.run(['git', 'cat-file', '--batch', '--buffer'],
cwd=dataset_path, stdout=subprocess.PIPE, input=catFileInput, encoding='utf-8')
catFileProcess = subprocess.run(
['git', 'cat-file', '--batch', '--buffer'],
cwd=dataset_path,
stdout=subprocess.PIPE,
input=catFileInput,
encoding='utf-8',
)
# Output looks like this:
# dc9dde956f6f28e425a412a4123526e330668e7e blob 140
# ../../.git/annex/objects/Q0/VP/MD5E-s1618574--43762c4310549dcc8c5c25567f42722d.nii.gz/MD5E-s1618574--43762c4310549dcc8c5c25567f42722d.nii.gz
@@ -240,8 +283,17 @@
size = int(key.split('-', 2)[1].lstrip('s'))
filename = symlinkFilenames[(index - 1) // 2]
file_id = compute_file_hash(key, filename)
files.append({'filename': filename, 'size': int(
size), 'id': file_id, 'key': key, 'urls': [], 'annexed': True, 'directory': False})
files.append(
{
'filename': filename,
'size': int(size),
'id': file_id,
'key': key,
'urls': [],
'annexed': True,
'directory': False,
}
)
# Now find URLs for each file if available
files = get_repo_urls(dataset_path, files)
# Provide fallbacks for any URLs that did not match rmet exports
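Note (not part of the diff): the size extraction above works because a git-annex key embeds the byte size as the s<bytes> field between its first two '-' separators. A worked example using the key from the sample output above:

key = 'MD5E-s1618574--43762c4310549dcc8c5c25567f42722d.nii.gz'
size = int(key.split('-', 2)[1].lstrip('s'))
print(size)  # 1618574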
@@ -256,8 +308,9 @@

def get_tag_info(dataset_path, tag):
"""`git annex info <tag>`"""
git_process = subprocess.run(['git-annex', 'info', '--json', tag],
cwd=dataset_path, capture_output=True)
git_process = subprocess.run(
['git-annex', 'info', '--json', tag], cwd=dataset_path, capture_output=True
)
return json.loads(git_process.stdout)


@@ -276,7 +329,7 @@
return False


async def edit_annexed_file(path, expected_content, new_content, encoding="utf-8"):
async def edit_annexed_file(path, expected_content, new_content, encoding='utf-8'):
"""
Edit a git or annexed file by correctly removing the symlink and writing the new content.

Expand All @@ -286,7 +339,9 @@
# Test if the annexed target exists
if os.path.exists(real_path):
# Open the working tree or annexed path to verify contents
async with aiofiles.open(real_path, 'r', encoding='utf-8', newline='') as annexed_file:
async with aiofiles.open(
real_path, 'r', encoding='utf-8', newline=''
) as annexed_file:
annexed_file_contents = await annexed_file.read()
if expected_content != annexed_file_contents:
raise EditAnnexedFileException('unexpected {path} content')
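Note (not part of the diff): a hypothetical usage sketch of edit_annexed_file; the dataset path and JSON contents are invented. Judging from the check above, if the annexed target exists and its content does not match expected_content, EditAnnexedFileException is raised instead of overwriting.

# Hypothetical usage only; path and contents are invented for illustration.
import asyncio

from datalad_service.common.annex import edit_annexed_file


async def rename_dataset():
    await edit_annexed_file(
        '/datasets/ds000001/dataset_description.json',
        expected_content='{"Name": "Old name"}',
        new_content='{"Name": "New name"}',
    )

# asyncio.run(rename_dataset())  # run against a real dataset working tree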
14 changes: 11 additions & 3 deletions services/datalad/datalad_service/common/bids.py
@@ -8,8 +8,7 @@
def read_dataset_description(dataset_path, commit):
try:
repo = pygit2.Repository(dataset_path)
raw_description = git_show(
repo, commit, 'dataset_description.json')
raw_description = git_show(repo, commit, 'dataset_description.json')
return json.loads(raw_description)
except json.decoder.JSONDecodeError:
return None
@@ -21,4 +20,13 @@ def read_dataset_description(dataset_path, commit):
def dataset_sort(file):
"""BIDS aware sorting of dataset file listings"""
filename = file.get('filename')
return (file.get('directory'), not (filename == 'dataset_description.json' or filename == 'CHANGES' or filename == 'README' or filename == 'LICENSE'), filename)
return (
file.get('directory'),
not (
filename == 'dataset_description.json'
or filename == 'CHANGES'
or filename == 'README'
or filename == 'LICENSE'
),
filename,
)
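Note (not part of the diff): a quick worked example of the sort key; the file dicts are invented. Non-directories sort before directories, the listed BIDS metadata files float to the top, and remaining entries fall back to byte-wise filename order.

# Invented file listing to show the ordering produced by dataset_sort.
from datalad_service.common.bids import dataset_sort

files = [
    {'filename': 'sub-01', 'directory': True},
    {'filename': 'task-rest_bold.json', 'directory': False},
    {'filename': 'dataset_description.json', 'directory': False},
    {'filename': 'README', 'directory': False},
]
print([f['filename'] for f in sorted(files, key=dataset_sort)])
# ['README', 'dataset_description.json', 'task-rest_bold.json', 'sub-01']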
2 changes: 1 addition & 1 deletion services/datalad/datalad_service/common/const.py
@@ -1,2 +1,2 @@
# Default size for Linux is 65536 but other factors may require tuning this
CHUNK_SIZE_BYTES = 65536
CHUNK_SIZE_BYTES = 65536