Skip to content

Commit ff1cecb

Browse files
authored
Merge pull request #281 from effigies/proxy-files
feat: Load annexed data from public S3 special remotes
2 parents 54a4b9a + 5b5b616 commit ff1cecb

File tree

8 files changed

+660
-24
lines changed

8 files changed

+660
-24
lines changed

build.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import * as esbuild from 'https://deno.land/x/[email protected]/mod.js'
88
import { parse } from 'https://deno.land/[email protected]/flags/mod.ts'
99
import { denoPlugin } from "jsr:@deno/[email protected]";
1010
import * as path from "https://deno.land/[email protected]/path/mod.ts"
11+
import { nodeModulesPolyfillPlugin } from 'npm:esbuild-plugins-node-modules-polyfill'
1112
import { getVersion } from './src/version.ts'
1213

1314

@@ -50,6 +51,15 @@ const result = await esbuild.build({
5051
minify: flags.minify,
5152
target: ['chrome109', 'firefox109', 'safari16'],
5253
plugins: [
54+
nodeModulesPolyfillPlugin({
55+
globals: {
56+
Buffer: true,
57+
},
58+
modules: {
59+
'buffer': true,
60+
'worker_threads': false,
61+
},
62+
}),
5363
versionPlugin,
5464
denoPlugin({
5565
configPath: path.join(dir, 'deno.json'),
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
### Added
2+
3+
- Datalad/git-annex datasets with remote content in public S3 buckets can be validated
4+
with the `--preferredRemote <remote-name>` flag. If a public S3 bucket is detected without
5+
this flag, the most recent export is used.

deno.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@
4242
"@std/log": "jsr:@std/log@^0.224.14",
4343
"@std/path": "jsr:@std/path@^1.1.2",
4444
"@std/streams": "jsr:@std/streams@^1.0.12",
45-
"@std/yaml": "jsr:@std/yaml@^1.0.9"
45+
"@std/yaml": "jsr:@std/yaml@^1.0.9",
46+
"isomorphic-git": "npm:isomorphic-git@^1.27.1",
47+
"hash-wasm": "npm:hash-wasm@^4.12.0"
4648
},
4749
"tasks": {
4850
"test": "deno test -A src/"

deno.lock

Lines changed: 450 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/files/deno.ts

Lines changed: 45 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,50 +3,73 @@
33
*/
44
import { basename, join } from '@std/path'
55
import * as posix from '@std/path/posix'
6-
import { BIDSFile, FileTree } from '../types/filetree.ts'
6+
import { BIDSFile, type FileOpener, FileTree } from '../types/filetree.ts'
77
import { requestReadPermission } from '../setup/requestPermissions.ts'
88
import { FileIgnoreRules, readBidsIgnore } from './ignore.ts'
9-
import { FsFileOpener } from './openers.ts'
9+
import { FsFileOpener, HTTPOpener } from './openers.ts'
10+
import { resolveAnnexedFile } from './repo.ts'
11+
import fs from 'node:fs'
1012

1113
/**
 * BIDSFile whose opener is an FsFileOpener built from the dataset path and
 * the file's path; the remaining arguments are passed through to BIDSFile.
 */
export class BIDSFileDeno extends BIDSFile {
1214
constructor(datasetPath: string, path: string, ignore?: FileIgnoreRules, parent?: FileTree) {
1315
super(path, new FsFileOpener(datasetPath, path), ignore, parent)
1416
}
1517
}
1618

17-
async function _readFileTree(
18-
rootPath: string,
19-
relativePath: string,
20-
ignore: FileIgnoreRules,
21-
prune: FileIgnoreRules,
22-
parent?: FileTree,
23-
): Promise<FileTree> {
19+
/**
 * Named arguments for _readFileTree.
 *
 * rootPath and relativePath are joined to find the directory to list;
 * ignore and parent are handed to the FileTree/BIDSFile constructors;
 * prune is tested against each entry's path to skip it entirely;
 * preferredRemote is forwarded to resolveAnnexedFile when a file cannot be
 * stat-ed locally.
 */
type ReadFileTreeOptions = {
20+
rootPath: string
21+
relativePath: string
22+
ignore: FileIgnoreRules
23+
prune: FileIgnoreRules
24+
parent?: FileTree
25+
preferredRemote?: string
26+
}
27+
28+
async function _readFileTree({
29+
rootPath,
30+
relativePath,
31+
ignore,
32+
prune,
33+
parent,
34+
preferredRemote,
35+
}: ReadFileTreeOptions): Promise<FileTree> {
2436
await requestReadPermission()
2537
const name = basename(relativePath)
2638
const tree = new FileTree(relativePath, name, parent, ignore)
2739

40+
// Opaque cache for passing to git operations
41+
const cache = {}
42+
2843
for await (const dirEntry of Deno.readDir(join(rootPath, relativePath))) {
2944
const thisPath = posix.join(relativePath, dirEntry.name)
3045
if (prune.test(thisPath)) {
3146
continue
3247
}
3348
if (dirEntry.isFile || dirEntry.isSymlink) {
34-
const file = new BIDSFile(
35-
thisPath,
36-
new FsFileOpener(rootPath, thisPath),
37-
ignore,
38-
tree,
39-
)
40-
tree.files.push(file)
49+
let opener: FileOpener
50+
const fullPath = join(rootPath, thisPath)
51+
try {
52+
const fileInfo = await Deno.stat(fullPath)
53+
opener = new FsFileOpener(rootPath, thisPath, fileInfo)
54+
} catch (error) {
55+
try {
56+
const { url, size } = await resolveAnnexedFile(fullPath, preferredRemote, { cache, fs })
57+
opener = new HTTPOpener(url, size)
58+
} catch (_) {
59+
throw error
60+
}
61+
}
62+
tree.files.push(new BIDSFile(thisPath, opener, ignore, tree))
4163
}
4264
if (dirEntry.isDirectory) {
43-
const dirTree = await _readFileTree(
65+
const dirTree = await _readFileTree({
4466
rootPath,
45-
thisPath,
67+
relativePath: thisPath,
4668
ignore,
4769
prune,
48-
tree,
49-
)
70+
parent: tree,
71+
preferredRemote,
72+
})
5073
tree.directories.push(dirTree)
5174
}
5275
}
@@ -59,10 +82,11 @@ async function _readFileTree(
5982
export async function readFileTree(
6083
rootPath: string,
6184
prune?: FileIgnoreRules,
85+
preferredRemote?: string,
6286
): Promise<FileTree> {
6387
prune ??= new FileIgnoreRules([], false)
6488
const ignore = new FileIgnoreRules([])
65-
const tree = await _readFileTree(rootPath, '/', ignore, prune)
89+
const tree = await _readFileTree({rootPath, relativePath: '/', ignore, prune, preferredRemote})
6690
const bidsignore = tree.get('.bidsignore')
6791
if (bidsignore) {
6892
try {

src/files/repo.ts

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
/**
2+
* Utilities for reading git-annex metadata
3+
*/
4+
import { dirname, join, parse, SEPARATOR_PATTERN } from '@std/path'
5+
import { default as git } from 'isomorphic-git'
6+
import { createMD5 } from 'hash-wasm'
7+
8+
// Decoder used to turn blobs read from git into UTF-8 text
const textDecoder = new TextDecoder('utf-8')
9+
// Matches an annex key of the form <HASHNAME>-s<size>--<hex digest>[<.ext>],
// anchored at the start of the string. Named groups: hashname, size, digest
// and (optionally) ext.
export const annexKeyRegex =
10+
/^(?<hashname>[A-Z0-9]+)-s(?<size>\d+)--(?<digest>[0-9a-fA-F]+)(?<ext>\.[\w\-. ]*)?/
11+
// Matches one remote-metadata log line: <timestamp>s <uuid>:V +<version>#<path>
// Named groups: timestamp (without the trailing "s"), uuid, version, path.
export const rmetLineRegex =
12+
/^(?<timestamp>\d+(\.\d+)?)s (?<uuid>[0-9a-fA-F-]+):V \+(?<version>[^#]+)#(?<path>.+)/
13+
14+
/**
 * One entry of a `*.log.rmet` file, mirroring the named groups of
 * rmetLineRegex: the line's timestamp, the remote's uuid, and the version
 * and path recorded for the key on that remote.
 */
type Rmet = {
15+
timestamp: number
16+
uuid: string
17+
version: string
18+
path: string
19+
}
20+
21+
/**
 * Read a file from the `git-annex` branch and return it as UTF-8 text
 *
 * @param filepath - Path of the file inside the branch's tree
 * @param options - Options spread into the isomorphic-git calls; entries
 *   here take precedence over the `ref`, `oid` and `filepath` set by this
 *   function
 * @returns The decoded contents of the blob
 */
export async function readAnnexPath(
  filepath: string,
  options: any,
): Promise<string> {
  const branchTip = await git.resolveRef({ ref: 'git-annex', ...options })
  const contents = await git.readBlob({ oid: branchTip, filepath, ...options })
  return textDecoder.decode(contents.blob)
}
29+
30+
/**
 * git-annex hashDirLower implementation based on https://git-annex.branchable.com/internals/hashing/
 *
 * Hashes the annex key with MD5 and splits the leading hex characters of the
 * digest into the two directory names under which the key's files are stored.
 *
 * @param annexKey - The git-annex key (the annexed object's filename)
 * @returns Hex characters [0:3] and [3:6] of the MD5 digest, in that order
 */
export async function hashDirLower(annexKey: string): Promise<[string, string]> {
  const hasher = await createMD5()
  hasher.init()
  hasher.update(annexKey)
  const hex = hasher.digest('hex')
  const outer = hex.substring(0, 3)
  const inner = hex.substring(3, 6)
  return [outer, inner]
}
41+
42+
/**
 * Read remote metadata entries for a given annex key
 *
 * *.log.rmet
 *   Path: {md5(key)[0:3]}/{md5(key)[3:6]}/{key}.log.rmet
 *   Contents:
 *     <timestamp> <uuid>:V +<version>#<path>
 *
 * The general form is <uuid>:<key> [+-]<value> and is an append-only log.
 * Only `V +` lines are understood here; when a uuid appears on several lines,
 * the last matching line wins.
 *
 * @param key - The git-annex key whose remote metadata should be read
 * @param options - Options forwarded to the git read (see readAnnexPath)
 * @returns Parsed entries indexed by remote uuid
 */
async function readRmet(key: string, options: any): Promise<Record<string, Rmet>> {
  const [outer, inner] = await hashDirLower(key)
  // Paths inside a git tree always use '/', so build the path by hand instead
  // of using the OS-specific join (which yields '\' on Windows).
  const rmet = await readAnnexPath(`${outer}/${inner}/${key}.log.rmet`, options)
  const ret: Record<string, Rmet> = {}
  for (const line of rmet.split('\n')) {
    const groups = line.match(rmetLineRegex)?.groups
    if (!groups) {
      continue
    }
    const { timestamp, uuid, version, path } = groups
    // Regex groups are strings; convert so the value matches the Rmet type
    ret[uuid] = { timestamp: Number(timestamp), uuid, version, path }
  }
  return ret
}
65+
66+
/**
 * Read special remote configuration from remote.log
 *
 * remote.log
 *   <uuid> [<key>=<value>]...
 *   keys of interest:
 *     name
 *     type (S3)
 *     publicurl
 *     timestamp
 *
 * Blank lines and tokens that are not of the form <key>=<value> are skipped.
 * Values are split on the first `=` only, so a value may itself contain `=`.
 * When a uuid appears on several lines, the last line wins.
 *
 * @param options - Options forwarded to the git read (see readAnnexPath)
 * @returns Configuration key/value pairs indexed by remote uuid
 */
export async function readRemotes(options: any): Promise<Record<string, Record<string, string>>> {
  const remotesText = await readAnnexPath('remote.log', options)
  const byUUID: Record<string, Record<string, string>> = {}
  for (const line of remotesText.split('\n')) {
    const [uuid, ...keyvals] = line.trim().split(' ')
    if (!uuid) {
      // Blank line, e.g. after the trailing newline of the log
      continue
    }
    const config: Record<string, string> = {}
    for (const kv of keyvals) {
      const separator = kv.indexOf('=')
      if (separator <= 0) {
        continue
      }
      config[kv.slice(0, separator)] = kv.slice(separator + 1)
    }
    byUUID[uuid] = config
  }
  return byUUID
}
86+
87+
/**
 * Resolve an annexed file location to an HTTP URL, if a public S3 remote is available
 *
 * @param path - Filesystem path of the annexed file. It must be a symlink; the
 *   final component of the link target is taken as the annex key.
 * @param remote - Name of the preferred remote. When omitted, the public
 *   remote with the most recent remote-metadata entry for the key is used.
 * @param options - Options forwarded to the git reads. When `gitdir` is not
 *   given, it is derived from the `.git` component of the link target,
 *   relative to the directory containing `path`.
 * @returns The versioned URL of the content and the size taken from the
 *   annex key (NaN when the key does not match annexKeyRegex).
 * @throws Error when the git directory cannot be located, when no public
 *   remote matches, or when the chosen remote has no metadata for the key.
 */
export async function resolveAnnexedFile(
  path: string,
  remote?: string,
  options?: any,
): Promise<{ url: string; size: number }> {
  // path is known to be a symlink
  const target = await Deno.readLink(path)
  const { dir, base } = parse(target)

  // Work on a copy rather than reassigning the caller's argument
  let gitOptions = options
  if (!gitOptions?.gitdir) {
    const dirs = dir.split(SEPARATOR_PATTERN)
    const gitIndex = dirs.indexOf('.git')
    if (gitIndex === -1) {
      throw new Error(`Cannot locate a .git directory from link target ${target}`)
    }
    const gitdir = join(dirname(path), ...dirs.slice(0, gitIndex + 1))
    gitOptions = { ...gitOptions, gitdir }
  }

  const size = Number(base.match(annexKeyRegex)?.groups?.size)

  const rmet = await readRmet(base, gitOptions)
  const remotes = await readRemotes(gitOptions)

  // Only consider public S3 remotes.
  // This will need to be expanded for other types of remotes in future
  let uuid: string | undefined
  if (remote) {
    const named = Object.entries(remotes).find(
      ([, config]) => Boolean(config?.publicurl) && config.name === remote,
    )
    if (!named) {
      throw new Error(`No remote named ${remote}`)
    }
    uuid = named[0]
  } else {
    // Take the newest entry that belongs to a public remote; on equal
    // timestamps the first entry encountered is kept.
    let newest = -Infinity
    for (const [candidate, entry] of Object.entries(rmet)) {
      const timestamp = Number(entry.timestamp)
      if (remotes[candidate]?.publicurl && timestamp > newest) {
        newest = timestamp
        uuid = candidate
      }
    }
    if (uuid === undefined) {
      throw new Error(`No public remote found for ${base}`)
    }
  }

  const metadata = rmet[uuid]
  if (!metadata) {
    throw new Error(`No remote metadata found for ${base} on remote ${remote ?? uuid}`)
  }

  // Drop trailing slashes so the join below never produces '//'
  const publicurl = remotes[uuid].publicurl.replace(/\/+$/, '')
  const url = `${publicurl}/${metadata.path}?versionId=${metadata.version}`

  return { url, size }
}

src/main.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ export async function main(): Promise<ValidationResult> {
2626
const prune = options.prune
2727
? new FileIgnoreRules(['derivatives', 'sourcedata', 'code'], false)
2828
: undefined
29-
const tree = await readFileTree(absolutePath, prune)
29+
const tree = await readFileTree(absolutePath, prune, options.preferredRemote)
3030

3131
const config = options.config ? JSON.parse(Deno.readTextFileSync(options.config)) as Config : {}
3232

src/setup/options.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ export type ValidatorOptions = {
3333
blacklistModalities: string[]
3434
prune?: boolean
3535
maxRows?: number
36+
preferredRemote?: string
3637
}
3738

3839
const datasetType = new EnumType<string>(
@@ -107,6 +108,10 @@ export const validateCommand: Command<void, void, any, string[], void> = new Com
107108
'-o, --outfile <file:string>',
108109
'File to write validation results to.',
109110
)
111+
.option(
112+
'--preferredRemote <preferredRemote:string>',
113+
'Name of the preferred git-annex remote for accessing remote data (experimental)',
114+
)
110115

111116
// Disabling color output is only available in Deno
112117
if (typeof Deno !== 'undefined') {

0 commit comments

Comments
 (0)