3 changes: 3 additions & 0 deletions package.json
@@ -54,6 +54,9 @@
"lint-staged": {
"*.{js,jsx,ts,tsx,json}": [
"deno fmt *"
],
"*.py": [
"uv --directory services/datalad run ruff format"
]
},
"dependenciesMeta": {
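Note (not part of the diff): the new *.py rule hands the staged Python paths to ruff through uv before each commit. Below is a rough sketch of the equivalent manual invocation, assuming lint-staged's default behavior of appending matched file paths; the example path is illustrative.

# Sketch only: roughly what the pre-commit hook runs for staged Python files.
# lint-staged appends the matched paths to the configured command by default.
import subprocess

staged_python_files = ['services/datalad/datalad_service/app.py']  # example path
subprocess.run(
    ['uv', '--directory', 'services/datalad', 'run', 'ruff', 'format']
    + staged_python_files,
    check=True,  # a non-zero exit aborts the commit
)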
61 changes: 33 additions & 28 deletions services/datalad/datalad_service/app.py
@@ -22,20 +22,26 @@
from datalad_service.handlers.upload import UploadResource
from datalad_service.handlers.upload_file import UploadFileResource
from datalad_service.handlers.validation import ValidationResource
from datalad_service.handlers.git import GitRefsResource, GitReceiveResource, GitUploadResource
from datalad_service.handlers.git import (
GitRefsResource,
GitReceiveResource,
GitUploadResource,
)
from datalad_service.handlers.annex import GitAnnexResource
from datalad_service.handlers.reexporter import ReexporterResource
from datalad_service.handlers.reset import ResetResource
from datalad_service.handlers.remote_import import RemoteImportResource
from datalad_service.middleware.auth import AuthenticateMiddleware
from datalad_service.middleware.error import CustomErrorHandlerMiddleware


def before_send(event):
"""Drop transactions that are marked as excluded."""
if event.get("transaction") == "excluded":
if event.get('transaction') == 'excluded':
return None
return event


sentry_sdk.init(
# Set traces_sample_rate to 1.0 to capture 100%
# of transactions for performance monitoring.
@@ -44,11 +50,12 @@
# of sampled transactions.
# We recommend adjusting this value in production.
profiles_sample_rate=1.0,
release=f"openneuro-datalad-service@{datalad_service.version.get_version()}",
release=f'openneuro-datalad-service@{datalad_service.version.get_version()}',
server_name=socket.gethostname(),
before_send=before_send,
)


class PathConverter(falcon.routing.converters.BaseConverter):
""": is used because it is human readable as a separator, disallowed in filenames on Windows, and very rare in Unix filenames."""

@@ -58,7 +65,9 @@

def create_app():
if not datalad_service.config.DATALAD_DATASET_PATH:
raise Exception("Required DATALAD_DATASET_PATH environment variable is not defined")
raise Exception(
'Required DATALAD_DATASET_PATH environment variable is not defined'
)

middleware = [AuthenticateMiddleware(), CustomErrorHandlerMiddleware()]

@@ -106,41 +115,37 @@
app.add_route('/datasets/{dataset}/objects/{obj}', dataset_objects)

app.add_route('/datasets/{dataset}/snapshots', dataset_snapshots)
app.add_route('/datasets/{dataset}/snapshots/{snapshot}', dataset_snapshots)
app.add_route('/datasets/{dataset}/snapshots/{snapshot}/files', dataset_files)
app.add_route(
'/datasets/{dataset}/snapshots/{snapshot}', dataset_snapshots)
app.add_route(
'/datasets/{dataset}/snapshots/{snapshot}/files', dataset_files)
app.add_route(
'/datasets/{dataset}/snapshots/{snapshot}/files/{filename:path}', dataset_files)
'/datasets/{dataset}/snapshots/{snapshot}/files/{filename:path}', dataset_files
)
app.add_route(
'/datasets/{dataset}/snapshots/{snapshot}/annex-key/{annex_key}', dataset_annex_objects)
'/datasets/{dataset}/snapshots/{snapshot}/annex-key/{annex_key}',
dataset_annex_objects,
)

app.add_route('/datasets/{dataset}/publish', dataset_publish)
app.add_route('/datasets/{dataset}/reexport-remotes', dataset_reexporter_resources)

app.add_route('/datasets/{dataset}/upload/{upload}', dataset_upload)
app.add_route(
'/datasets/{dataset}/publish', dataset_publish
'/uploads/{worker}/{dataset}/{upload}/{filename:path}', dataset_upload_file
)
app.add_route('/datasets/{dataset}/reexport-remotes',
dataset_reexporter_resources)

app.add_route('/git/{worker}/{dataset}/info/refs', dataset_git_refs_resource)
app.add_route(
'/datasets/{dataset}/upload/{upload}', dataset_upload
'/git/{worker}/{dataset}/git-receive-pack', dataset_git_receive_resource
)
app.add_route(
'/uploads/{worker}/{dataset}/{upload}/{filename:path}', dataset_upload_file
'/git/{worker}/{dataset}/git-upload-pack', dataset_git_upload_resource
)

app.add_route('/git/{worker}/{dataset}/info/refs',
dataset_git_refs_resource)
app.add_route('/git/{worker}/{dataset}/git-receive-pack',
dataset_git_receive_resource)
app.add_route('/git/{worker}/{dataset}/git-upload-pack',
dataset_git_upload_resource)
app.add_route('/git/{worker}/{dataset}/annex/{key}',
dataset_git_annex_resource)
app.add_route('/git/{worker}/{dataset}/annex/{key}', dataset_git_annex_resource)
# Serving keys internally as well (read only)
app.add_route('/datasets/{dataset}/annex/{key}',
dataset_git_annex_resource)
app.add_route('/datasets/{dataset}/annex/{key}', dataset_git_annex_resource)

app.add_route('/datasets/{dataset}/import/{import_id}',
dataset_remote_import_resource)
app.add_route(
'/datasets/{dataset}/import/{import_id}', dataset_remote_import_resource
)

return app
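Note (not part of the diff): several routes above use the custom path converter ({filename:path}). Its convert body is collapsed in this diff, so the sketch below is only an assumption based on the PathConverter docstring: clients send nested paths with ':' as the separator and the converter maps them back to '/'.

# Assumed behavior, not the actual PathConverter code (which is collapsed above):
# ':' in a URL segment stands in for '/' in the real file path.
def convert_path_segment(value: str) -> str:
    return value.replace(':', '/')

print(convert_path_segment('sub-01:anat:sub-01_T1w.nii.gz'))
# sub-01/anat/sub-01_T1w.nii.gz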
117 changes: 86 additions & 31 deletions services/datalad/datalad_service/common/annex.py
@@ -13,17 +13,23 @@

SERVICE_EMAIL = '[email protected]'
SERVICE_USER = 'Git Worker'
S3_BUCKETS_WHITELIST = ['openneuro.org', 'openneuro-dev-datalad-public', 'openneuro-derivatives', 'bobsrepository']
S3_BUCKETS_WHITELIST = [
'openneuro.org',
'openneuro-dev-datalad-public',
'openneuro-derivatives',
'bobsrepository',
]


class EditAnnexedFileException(Exception):
"""Snapshot conflicts with existing name."""

pass


def init_annex(dataset_path):
"""Setup git-annex within an existing git repo"""
subprocess.run(['git-annex', 'init', 'OpenNeuro'],
check=True, cwd=dataset_path)
subprocess.run(['git-annex', 'init', 'OpenNeuro'], check=True, cwd=dataset_path)


def compute_git_hash(path, size):
@@ -55,8 +61,7 @@

def read_ls_tree_line(gitTreeLine, files, symlinkFilenames, symlinkObjects):
"""Read one line of `git ls-tree` and append to the correct buckets of files, symlinks, and objects."""
filename, mode, obj_type, obj_hash, size = parse_ls_tree_line(
gitTreeLine)
filename, mode, obj_type, obj_hash, size = parse_ls_tree_line(gitTreeLine)
# Skip git / datalad files
if filename.startswith('.git'):
return
@@ -65,23 +70,40 @@
if filename == '.gitattributes':
return
# Check if the file is annexed or a submodule
if (mode == '120000'):
if mode == '120000':
# Save annexed file symlinks for batch processing
symlinkFilenames.append(filename)
symlinkObjects.append(obj_hash)
elif (mode == '160000'):
elif mode == '160000':
# Skip submodules
return
else:
# Immediately append regular files
if (size == '-'):
if size == '-':
# Tree objects do not have sizes and are never annexed
files.append(
{'id': obj_hash, 'filename': filename, 'directory': True, 'annexed': False, 'size': 0, 'urls': []})
{
'id': obj_hash,
'filename': filename,
'directory': True,
'annexed': False,
'size': 0,
'urls': [],
}
)
else:
file_id = compute_file_hash(obj_hash, filename)
files.append({'filename': filename, 'size': int(size),
'id': file_id, 'key': obj_hash, 'directory': False, 'urls': [], 'annexed': False})
files.append(
{
'filename': filename,
'size': int(size),
'id': file_id,
'key': obj_hash,
'directory': False,
'urls': [],
'annexed': False,
}
)
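Note (not part of the diff): read_ls_tree_line above consumes lines in the `git ls-tree -l` format. parse_ls_tree_line itself is not shown in this PR, so the parser below is only an assumed sketch of the tuple it needs to return.

# Assumed sketch; the real parse_ls_tree_line is not part of this diff.
# Mode 120000 marks annexed symlinks, 160000 submodules; trees report '-' for size.
def parse_ls_tree_line_sketch(line):
    metadata, filename = line.split('\t', 1)
    mode, obj_type, obj_hash, size = metadata.split()
    return filename, mode, obj_type, obj_hash, size

print(parse_ls_tree_line_sketch(
    '100644 blob dc9dde956f6f28e425a412a4123526e330668e7e     140\tREADME'
))
# ('README', '100644', 'blob', 'dc9dde956f6f28e425a412a4123526e330668e7e', '140')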


def compute_rmet(key, legacy=False):
@@ -95,8 +117,7 @@


def parse_remote_line(remoteLine):
remoteConfig = dict(item.split('=')
for item in remoteLine[37:].split(' '))
remoteConfig = dict(item.split('=') for item in remoteLine[37:].split(' '))
if remoteConfig['type'] == 'S3' and remoteConfig['bucket'] in S3_BUCKETS_WHITELIST:
remoteUuid = remoteLine[0:36]
remoteUrl = remoteConfig['publicurl'] if 'publicurl' in remoteConfig else None
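Note (not part of the diff): a git-annex remote.log line pairs a 36-character remote UUID with space-separated key=value settings, which is what the [0:36] and [37:] slicing above relies on. The line below is invented for illustration.

# Illustrative remote.log line (values made up) showing what the slicing yields.
remote_line = (
    '7da396aa-2161-4541-ab1b-d79bdf18f5a4 '
    'name=s3-PUBLIC type=S3 bucket=openneuro.org '
    'publicurl=https://s3.amazonaws.com/openneuro.org timestamp=1600000000s'
)
remote_uuid = remote_line[0:36]  # '7da396aa-2161-4541-ab1b-d79bdf18f5a4'
remote_config = dict(item.split('=') for item in remote_line[37:].split(' '))
print(remote_config['type'], remote_config['bucket'])
# S3 openneuro.org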
@@ -133,18 +154,21 @@

def encode_remote_url(url):
"""S3 requires some characters to be encoded"""
return urllib.parse.quote_plus(url, safe="/:?=")
return urllib.parse.quote_plus(url, safe='/:?=')


def get_repo_urls(path, files):
"""For each file provided, obtain the rmet data and append URLs if possible."""
# First obtain the git-annex branch objects
gitAnnexBranch = subprocess.Popen(
['git', 'ls-tree', '-l', '-r', 'git-annex'], cwd=path, stdout=subprocess.PIPE, encoding='utf-8')
['git', 'ls-tree', '-l', '-r', 'git-annex'],
cwd=path,
stdout=subprocess.PIPE,
encoding='utf-8',
)
rmetObjects = {}
for line in gitAnnexBranch.stdout:
filename, mode, obj_type, obj_hash, size = parse_ls_tree_line(
line.rstrip())
filename, mode, obj_type, obj_hash, size = parse_ls_tree_line(line.rstrip())
rmetObjects[filename] = obj_hash
if 'remote.log' not in rmetObjects:
# Skip searching for URLs if no remote.log is present
@@ -170,8 +194,15 @@
gitObjects += rmetObjects['trust.log'] + '\n'
gitObjects += rmetObjects['remote.log'] + '\n'
gitObjects += '\n'.join(rmetObjects[rmetPath] for rmetPath in rmetPaths)
catFileProcess = subprocess.run(['git', 'cat-file', '--batch=:::%(objectname)', '--buffer'],
cwd=path, stdout=subprocess.PIPE, input=gitObjects, encoding='utf-8', bufsize=0, text=True)
catFileProcess = subprocess.run(
['git', 'cat-file', '--batch=:::%(objectname)', '--buffer'],
cwd=path,
stdout=subprocess.PIPE,
input=gitObjects,
encoding='utf-8',
bufsize=0,
text=True,
)
catFile = io.StringIO(catFileProcess.stdout)
# Read in trust.log and remote.log first
trustLog = {}
@@ -193,7 +224,11 @@
else:
matchedRemote = parse_remote_line(line)
# X remotes are dead
if matchedRemote and matchedRemote['uuid'] in trustLog and trustLog[matchedRemote['uuid']] == "X":
if (
matchedRemote
and matchedRemote['uuid'] in trustLog
and trustLog[matchedRemote['uuid']] == 'X'
):
continue
if matchedRemote:
remote = matchedRemote
@@ -216,19 +251,27 @@
def get_repo_files(dataset, dataset_path, tree):
"""Read all files in a repo at a given branch, tag, or commit hash."""
gitProcess = subprocess.Popen(
['git', 'ls-tree', '-l', tree], cwd=dataset_path, stdout=subprocess.PIPE, encoding='utf-8')
['git', 'ls-tree', '-l', tree],
cwd=dataset_path,
stdout=subprocess.PIPE,
encoding='utf-8',
)
files = []
symlinkFilenames = []
symlinkObjects = []
for line in gitProcess.stdout:
gitTreeLine = line.rstrip()
read_ls_tree_line(gitTreeLine, files,
symlinkFilenames, symlinkObjects)
read_ls_tree_line(gitTreeLine, files, symlinkFilenames, symlinkObjects)
# After regular files, process all symlinks with one git cat-file --batch call
# This is about 100x faster than one call per file for annexed file heavy datasets
catFileInput = '\n'.join(symlinkObjects)
catFileProcess = subprocess.run(['git', 'cat-file', '--batch', '--buffer'],
cwd=dataset_path, stdout=subprocess.PIPE, input=catFileInput, encoding='utf-8')
catFileProcess = subprocess.run(
['git', 'cat-file', '--batch', '--buffer'],
cwd=dataset_path,
stdout=subprocess.PIPE,
input=catFileInput,
encoding='utf-8',
)
# Output looks like this:
# dc9dde956f6f28e425a412a4123526e330668e7e blob 140
# ../../.git/annex/objects/Q0/VP/MD5E-s1618574--43762c4310549dcc8c5c25567f42722d.nii.gz/MD5E-s1618574--43762c4310549dcc8c5c25567f42722d.nii.gz
@@ -240,8 +283,17 @@
size = int(key.split('-', 2)[1].lstrip('s'))
filename = symlinkFilenames[(index - 1) // 2]
file_id = compute_file_hash(key, filename)
files.append({'filename': filename, 'size': int(
size), 'id': file_id, 'key': key, 'urls': [], 'annexed': True, 'directory': False})
files.append(
{
'filename': filename,
'size': int(size),
'id': file_id,
'key': key,
'urls': [],
'annexed': True,
'directory': False,
}
)
# Now find URLs for each file if available
files = get_repo_urls(dataset_path, files)
# Provide fallbacks for any URLs that did not match rmet exports
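Note (not part of the diff): the size extraction above works because a git-annex key embeds the byte size as the s<bytes> field between its first two '-' separators. A worked example using the key from the sample output above:

key = 'MD5E-s1618574--43762c4310549dcc8c5c25567f42722d.nii.gz'
size = int(key.split('-', 2)[1].lstrip('s'))
print(size)  # 1618574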
@@ -256,8 +308,9 @@

def get_tag_info(dataset_path, tag):
"""`git annex info <tag>`"""
git_process = subprocess.run(['git-annex', 'info', '--json', tag],
cwd=dataset_path, capture_output=True)
git_process = subprocess.run(
['git-annex', 'info', '--json', tag], cwd=dataset_path, capture_output=True
)
return json.loads(git_process.stdout)


@@ -276,7 +329,7 @@
return False


async def edit_annexed_file(path, expected_content, new_content, encoding="utf-8"):
async def edit_annexed_file(path, expected_content, new_content, encoding='utf-8'):
"""
Edit a git or annexed file by correctly removing the symlink and writing the new content.

Expand All @@ -286,7 +339,9 @@
# Test if the annexed target exists
if os.path.exists(real_path):
# Open the working tree or annexed path to verify contents
async with aiofiles.open(real_path, 'r', encoding='utf-8', newline='') as annexed_file:
async with aiofiles.open(
real_path, 'r', encoding='utf-8', newline=''
) as annexed_file:
annexed_file_contents = await annexed_file.read()
if expected_content != annexed_file_contents:
raise EditAnnexedFileException('unexpected {path} content')
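Note (not part of the diff): a hypothetical usage sketch of edit_annexed_file; the dataset path and JSON contents are invented. Judging from the check above, if the annexed target exists and its content does not match expected_content, EditAnnexedFileException is raised instead of overwriting.

# Hypothetical usage only; path and contents are invented for illustration.
import asyncio

from datalad_service.common.annex import edit_annexed_file


async def rename_dataset():
    await edit_annexed_file(
        '/datasets/ds000001/dataset_description.json',
        expected_content='{"Name": "Old name"}',
        new_content='{"Name": "New name"}',
    )

# asyncio.run(rename_dataset())  # run against a real dataset working tree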
14 changes: 11 additions & 3 deletions services/datalad/datalad_service/common/bids.py
@@ -8,8 +8,7 @@
def read_dataset_description(dataset_path, commit):
try:
repo = pygit2.Repository(dataset_path)
raw_description = git_show(
repo, commit, 'dataset_description.json')
raw_description = git_show(repo, commit, 'dataset_description.json')
return json.loads(raw_description)
except json.decoder.JSONDecodeError:
return None
@@ -21,4 +20,13 @@ def read_dataset_description(dataset_path, commit):
def dataset_sort(file):
"""BIDS aware sorting of dataset file listings"""
filename = file.get('filename')
return (file.get('directory'), not (filename == 'dataset_description.json' or filename == 'CHANGES' or filename == 'README' or filename == 'LICENSE'), filename)
return (
file.get('directory'),
not (
filename == 'dataset_description.json'
or filename == 'CHANGES'
or filename == 'README'
or filename == 'LICENSE'
),
filename,
)
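Note (not part of the diff): a quick worked example of the sort key; the file dicts are invented. Non-directories sort before directories, the listed BIDS metadata files float to the top, and remaining entries fall back to byte-wise filename order.

# Invented file listing to show the ordering produced by dataset_sort.
from datalad_service.common.bids import dataset_sort

files = [
    {'filename': 'sub-01', 'directory': True},
    {'filename': 'task-rest_bold.json', 'directory': False},
    {'filename': 'dataset_description.json', 'directory': False},
    {'filename': 'README', 'directory': False},
]
print([f['filename'] for f in sorted(files, key=dataset_sort)])
# ['README', 'dataset_description.json', 'task-rest_bold.json', 'sub-01']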
2 changes: 1 addition & 1 deletion services/datalad/datalad_service/common/const.py
@@ -1,2 +1,2 @@
# Default size for Linux is 65536 but other factors may require tuning this
CHUNK_SIZE_BYTES = 65536
CHUNK_SIZE_BYTES = 65536