|
| 1 | +/** |
| 2 | + * Utilities for reading git-annex metadata |
| 3 | + */ |
| 4 | +import { dirname, join, parse, SEPARATOR_PATTERN } from '@std/path' |
| 5 | +import { default as git } from 'isomorphic-git' |
| 6 | +import { createMD5 } from 'hash-wasm' |
| 7 | + |
// Shared UTF-8 decoder used to turn blob bytes read from git into text.
const textDecoder = new TextDecoder('utf-8')
/**
 * Matches the beginning of a git-annex key of the form
 * `<hashname>-s<size>--<digest>[<ext>]`.
 *
 * Named groups:
 *  - `hashname`: upper-case letters and digits before `-s`
 *  - `size`: decimal digits following `-s`
 *  - `digest`: hexadecimal digits following `--`
 *  - `ext`: optional suffix starting with `.`
 *
 * The pattern is anchored at the start only; trailing text is not rejected.
 */
export const annexKeyRegex =
  /^(?<hashname>[A-Z0-9]+)-s(?<size>\d+)--(?<digest>[0-9a-fA-F]+)(?<ext>\.[\w\-. ]*)?/
/**
 * Matches one line of a `*.log.rmet` file of the form
 * `<timestamp>s <uuid>:V +<version>#<path>`.
 *
 * Named groups: `timestamp` (integer or decimal number before the `s`),
 * `uuid` (hex digits and dashes), `version` (text between `+` and the
 * first `#`), and `path` (the rest of the line).
 */
export const rmetLineRegex =
  /^(?<timestamp>\d+(\.\d+)?)s (?<uuid>[0-9a-fA-F-]+):V \+(?<version>[^#]+)#(?<path>.+)/
| 13 | + |
/** One remote-metadata record, mirroring the named groups of `rmetLineRegex`. */
type Rmet = {
  // Numeric value that precedes the `s` suffix on the log line
  timestamp: number
  // Identifier of the remote the record belongs to
  uuid: string
  // Version identifier; sent as the `versionId` query parameter when building URLs
  version: string
  // Object path within the remote; appended to the remote's public URL
  path: string
}
| 20 | + |
/**
 * Read a file from the `git-annex` branch and return its contents as text.
 *
 * @param filepath Path of the file inside the `git-annex` branch tree
 * @param options Extra isomorphic-git options (e.g. `fs`, `gitdir`); they are
 *   spread after the defaults, so they take precedence on conflicting keys
 * @returns The blob contents decoded as UTF-8
 */
export async function readAnnexPath(
  filepath: string,
  options: any,
): Promise<string> {
  // Resolve the tip of the git-annex branch, then look the file up in its tree
  const branchOid = await git.resolveRef({ ref: 'git-annex', ...options })
  const result = await git.readBlob({ oid: branchOid, filepath, ...options })
  return textDecoder.decode(result.blob)
}
| 29 | + |
/**
 * Compute the two-level hash directory for a git-annex key.
 *
 * Follows the hashDirLower scheme described at
 * https://git-annex.branchable.com/internals/hashing/ : the key is hashed
 * with MD5 and the first six hex characters are split into two
 * three-character directory names.
 *
 * @param annexKey The git-annex key (file name) to hash
 * @returns A `[first, second]` pair of directory names
 */
export async function hashDirLower(annexKey: string): Promise<[string, string]> {
  const hasher = await createMD5()
  hasher.init()
  hasher.update(annexKey)
  const hex = hasher.digest('hex')
  const firstDir = hex.substring(0, 3)
  const secondDir = hex.substring(3, 6)
  return [firstDir, secondDir]
}
| 41 | + |
/**
 * Read remote metadata entries for a given annex key
 *
 * *.log.rmet
 *   Path: {md5(key)[0:3]}/{md5(key)[3:6]}/{key}.log.rmet
 *   Contents:
 *     <timestamp> <uuid>:V +<version>#<path>
 *
 * The general form is <uuid>:<key> [+-]<value> and is an append-only log.
 * Only `V +` lines are understood here; lines that do not match are skipped,
 * and when a UUID appears on several lines the last matching line wins.
 * We may at some point care about doing this correctly.
 *
 * @param key The git-annex key whose remote metadata should be read
 * @param options isomorphic-git options forwarded to `readAnnexPath`
 * @returns Parsed records indexed by remote UUID
 */
async function readRmet(key: string, options: any): Promise<Record<string, Rmet>> {
  const [firstDir, secondDir] = await hashDirLower(key)
  // Paths inside a git tree always use '/', regardless of the host OS
  const logPath = `${firstDir}/${secondDir}/${key}.log.rmet`
  const rmet = await readAnnexPath(logPath, options)
  const ret: Record<string, Rmet> = {}
  for (const line of rmet.split('\n')) {
    const groups = line.match(rmetLineRegex)?.groups
    if (!groups) {
      continue
    }
    const { timestamp, uuid, version, path } = groups
    // Capture groups are strings; convert so the value matches the Rmet type
    ret[uuid] = { timestamp: Number(timestamp), uuid, version, path }
  }
  return ret
}
| 65 | + |
/**
 * Read special remote configuration from remote.log
 *
 * remote.log
 *   <uuid> [<key>=<value>]...
 *   keys of interest:
 *     name
 *     type (S3)
 *     publicurl
 *     timestamp
 *
 * Blank lines are ignored. Each `<key>=<value>` token is split on its first
 * `=` only, so values that themselves contain `=` (for example a URL with a
 * query string) are preserved intact. Tokens without `=` are skipped.
 *
 * @param options isomorphic-git options forwarded to `readAnnexPath`
 * @returns Remote configuration (key/value pairs) indexed by remote UUID
 */
export async function readRemotes(options: any): Promise<Record<string, Record<string, string>>> {
  const remotesText = await readAnnexPath('remote.log', options)
  const byUUID: Record<string, Record<string, string>> = {}
  for (const line of remotesText.split('\n')) {
    const [uuid, ...keyvals] = line.trim().split(' ')
    if (!uuid) {
      // Empty line (typically the one after the final newline)
      continue
    }
    const config: Record<string, string> = {}
    for (const kv of keyvals) {
      const eq = kv.indexOf('=')
      if (eq < 0) {
        continue
      }
      config[kv.slice(0, eq)] = kv.slice(eq + 1)
    }
    byUUID[uuid] = config
  }
  return byUUID
}
| 86 | + |
/**
 * Resolve an annexed file location to an HTTP URL, if a public S3 remote is available
 *
 * @param path Path to the annexed file; it must be a symlink whose target
 *   ends in the git-annex key
 * @param remote Optional name of the remote to use. When omitted, the remote
 *   with the newest metadata entry that has a `publicurl` is used.
 * @param options isomorphic-git options. If `gitdir` is not set it is derived
 *   from the `.git` component of the symlink target, relative to `path`.
 * @returns The versioned URL and the size encoded in the annex key
 *   (`NaN` when the key does not match `annexKeyRegex`)
 * @throws Error if the git directory cannot be determined, no suitable remote
 *   is found, the remote has no `publicurl`, or the remote holds no metadata
 *   for this key
 */
export async function resolveAnnexedFile(
  path: string,
  remote?: string,
  options?: any,
): Promise<{ url: string; size: number }> {
  // path is known to be a symlink
  const target = await Deno.readLink(path)
  const { dir, base } = parse(target)

  let gitOptions = options
  if (!gitOptions?.gitdir) {
    const dirs = dir.split(SEPARATOR_PATTERN)
    const gitIndex = dirs.indexOf('.git')
    if (gitIndex < 0) {
      throw new Error(`Cannot locate .git directory from symlink target ${target}`)
    }
    const gitdir = join(dirname(path), ...dirs.slice(0, gitIndex + 1))
    gitOptions = { ...gitOptions, gitdir }
  }

  // Number(undefined) is NaN, so an unrecognized key yields a NaN size
  const size = Number(base.match(annexKeyRegex)?.groups?.size)

  const rmet = await readRmet(base, gitOptions)
  const remotes = await readRemotes(gitOptions)

  let uuid: string
  if (remote) {
    // Only consider public S3 remotes.
    // This will need to be expanded for other types of remotes in future
    const matching = Object.entries(remotes).find(
      ([, config]) => config?.publicurl && config.name === remote,
    )
    if (!matching) {
      throw new Error(`No remote named ${remote}`)
    }
    uuid = matching[0]
  } else {
    // Newest first; prefer the newest remote that can actually be resolved to a URL
    const newestFirst = Object.entries(rmet).toSorted(
      (a, b) => Number(b[1].timestamp) - Number(a[1].timestamp),
    )
    const [newest] = newestFirst
    if (!newest) {
      throw new Error(`No remote metadata found for ${base}`)
    }
    const newestPublic = newestFirst.find(([u]) => remotes[u]?.publicurl)
    uuid = (newestPublic ?? newest)[0]
  }

  const publicurl = remotes[uuid]?.publicurl
  if (!publicurl) {
    throw new Error(`No publicurl found for remote ${uuid}`)
  }

  const metadata = rmet[uuid]
  if (!metadata) {
    throw new Error(`No remote metadata for ${base} in remote ${uuid}`)
  }

  // Avoid a doubled slash when publicurl already ends with '/'
  const baseUrl = publicurl.replace(/\/+$/, '')
  const url = `${baseUrl}/${metadata.path}?versionId=${metadata.version}`

  return { url, size }
}
0 commit comments