Skip to content

Commit 219ddb2

Browse files
authored
Merge pull request #3563 from OpenNeuroOrg/3561/annex-key-checks
refactor(cli): Recheck annexed object contents added in previous upload runs
2 parents e09d2ae + 48c4bae commit 219ddb2

File tree

2 files changed

+78
-22
lines changed

2 files changed

+78
-22
lines changed

cli/src/worker/annex.ts

Lines changed: 66 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,22 @@
11
import type { GitWorkerContext } from "./types/git-context.ts"
2+
import type { Logger } from "@std/log"
23
import { basename, dirname, join, relative } from "@std/path"
3-
import { default as git } from "isomorphic-git"
4+
import { default as git, TREE } from "isomorphic-git"
45

56
/**
67
* Why are we using hash wasm over web crypto?
78
* Web crypto cannot do streaming hashes of the common git-annex functions yet.
89
*/
910
import { createMD5, createSHA256 } from "hash-wasm"
1011

12+
/**
13+
* Mapping from annex key to relative paths
14+
*/
15+
export type AnnexKeyPaths = Record<string, string>
16+
17+
// Initialize a utf-8 text decoder for reuse
18+
const textDecoder = new TextDecoder("utf-8")
19+
1120
/**
1221
* Reusable hash factories
1322
*/
@@ -65,7 +74,6 @@ export function annexRelativePath(path: string) {
6574
* @param context GitWorkerContext objects
6675
*/
6776
export async function annexAdd(
68-
annexKeys: Record<string, string>,
6977
hash: string,
7078
path: string,
7179
relativePath: string,
@@ -119,8 +127,6 @@ export async function annexAdd(
119127

120128
// Key has changed if the existing link points to another object
121129
if (forceAdd || link !== symlinkTarget) {
122-
// Upload this key after the git commit
123-
annexKeys[annexKey] = path
124130
// This object has a new annex hash, update the symlink and add it
125131
const symlinkTarget = join(
126132
annexRelativePath(relativePath),
@@ -157,5 +163,60 @@ export async function readAnnexPath(
157163
oid: annexBranchOid,
158164
filepath: logPath,
159165
})
160-
return new TextDecoder().decode(blob)
166+
return textDecoder.decode(blob)
167+
}
168+
169+
/**
170+
* Walk the git tree belonging to `ref` and return a mapping from annex keys to repo-relative paths
171+
* @param ref Git reference to scan for annex objects
172+
* @param logger Logger to use, reports what keys are found at INFO level
173+
* @param context GitWorkerContext configured for a repo
174+
*/
175+
export async function getAnnexKeys(
176+
ref: string,
177+
logger: Logger,
178+
context: GitWorkerContext,
179+
): Promise<AnnexKeyPaths> {
180+
const annexKeys = {} as AnnexKeyPaths
181+
const annexedGitObjects: { path: string; oid: string }[] = []
182+
// Walk the tree at `ref` and find all annex symlinks
183+
await git.walk({
184+
...context.config(),
185+
trees: [TREE({ ref })],
186+
map: async function (filepath, [entry]) {
187+
if (
188+
entry && await entry.type() === "blob" &&
189+
await entry.mode() === 0o120000
190+
) {
191+
annexedGitObjects.push({ path: filepath, oid: await entry.oid() })
192+
const content = await entry.content()
193+
if (content) {
194+
const symlinkTarget = textDecoder.decode(content)
195+
const annexKey = basename(symlinkTarget)
196+
// Check that annexKey conforms to the git-annex key format
197+
// Other symlinks are allowed but may be rejected on push if they point outside of the repo
198+
if (
199+
annexKey.match(/^[A-Z0-9]+-s\d+--[0-9a-fA-F]+(\.[a-zA-Z0-9.]*)?$/)
200+
) {
201+
logger.info(`Found key "${annexKey}" in HEAD.`)
202+
annexKeys[annexKey] = filepath
203+
} else {
204+
logger.warn(
205+
`Skipping invalid annex key format: "${annexKey}" for file "${filepath}"`,
206+
)
207+
}
208+
}
209+
}
210+
},
211+
})
212+
213+
if (annexedGitObjects.length > 0) {
214+
logger.info("Annexed objects in HEAD:")
215+
for (const obj of annexedGitObjects) {
216+
logger.info(`- ${obj.path} (OID: ${obj.oid})`)
217+
}
218+
} else {
219+
logger.info("No annexed objects found in HEAD.")
220+
}
221+
return annexKeys
161222
}

cli/src/worker/git.ts

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,15 @@ import { logger, setupLogging } from "../logger.ts"
77
import { PromiseQueue } from "./queue.ts"
88
import { checkKey, storeKey } from "./transferKey.ts"
99
import ProgressBar from "@deno-library/progress"
10-
import { annexAdd, hashDirLower, readAnnexPath } from "./annex.ts"
10+
import type { AnnexKeyPaths } from "./annex.ts"
11+
import { annexAdd, getAnnexKeys, hashDirLower, readAnnexPath } from "./annex.ts"
1112
import { GitWorkerContext } from "./types/git-context.ts"
1213
import { resetWorktree } from "./resetWorktree.ts"
1314
import { getDefault } from "./getDefault.ts"
1415

1516
let context: GitWorkerContext
1617
let attributesCache: GitAnnexAttributes
1718

18-
/**
19-
* Paths to upload to the remote annex
20-
*
21-
* Keys are the annex key
22-
* Values are repo relative path
23-
*/
24-
const annexKeys: Record<string, string> = {}
25-
2619
async function done() {
2720
logger.info("Git worker shutdown.")
2821
// @ts-ignore
@@ -163,7 +156,6 @@ async function add(event: GitWorkerEventAdd) {
163156
} else {
164157
if (
165158
await annexAdd(
166-
annexKeys,
167159
annexed,
168160
event.data.path,
169161
event.data.relativePath,
@@ -214,14 +206,13 @@ async function createAnnexBranch() {
214206
* Generate a commit for remote.log updates if needed
215207
*/
216208
async function remoteSetup() {
217-
const noAnnexKeys: Record<string, string> = {}
218-
await commitAnnexBranch(noAnnexKeys)
209+
await commitAnnexBranch()
219210
}
220211

221212
/**
222213
* Generate one commit for all pending git-annex branch changes
223214
*/
224-
async function commitAnnexBranch(annexKeys: Record<string, string>) {
215+
async function commitAnnexBranch() {
225216
// Find the UUID of this repository if it exists already
226217
const expectedRemote = "OpenNeuro" // TODO - This could be more flexible?
227218
let uuid
@@ -280,6 +271,7 @@ async function commitAnnexBranch(annexKeys: Record<string, string>) {
280271
}
281272
}
282273
// Add logs for each annexed file
274+
const annexKeys = await getAnnexKeys("HEAD", logger, context)
283275
for (const [key, _path] of Object.entries(annexKeys)) {
284276
const hashDir = join(...await hashDirLower(key))
285277
const annexBranchPath = join(hashDir, `${key}.log`)
@@ -379,7 +371,7 @@ async function commit() {
379371
author: context.author,
380372
message: "[OpenNeuro] Added local files",
381373
})
382-
await commitAnnexBranch(annexKeys)
374+
await commitAnnexBranch()
383375
logger.info(`Committed as "${commitHash}"`)
384376
}
385377
}
@@ -389,6 +381,7 @@ async function commit() {
389381
*/
390382
async function push() {
391383
let completed = 0
384+
const annexKeys: AnnexKeyPaths = await getAnnexKeys("HEAD", logger, context)
392385
const annexedObjects = Object.keys(annexKeys).length
393386
const progress = new ProgressBar({
394387
title: `Transferring annexed files`,
@@ -398,13 +391,15 @@ async function push() {
398391
await progress.render(completed)
399392
}
400393
// Git-annex copy --to=openneuro
401-
for (const [key, path] of Object.entries(annexKeys)) {
394+
for (const [key, relativePath] of Object.entries(annexKeys)) {
402395
const checkKeyResult = await checkKey({
403396
url: context.repoEndpoint,
404397
token: context.authorization,
405398
}, key)
406399
if (checkKeyResult) {
407400
logger.info(`Skipping key "${key}" present on remote`)
401+
completed += 1
402+
await progress.render(completed)
408403
} else {
409404
let storeKeyResult = -1
410405
let retries = 3
@@ -416,7 +411,7 @@ async function push() {
416411
token: context.authorization,
417412
},
418413
key,
419-
path,
414+
join(context.sourcePath, relativePath),
420415
)
421416
if (storeKeyResult === -1 && retries > 0) {
422417
logger.warn(`Failed to transfer annex object "${key}" - retrying`)
@@ -430,7 +425,7 @@ async function push() {
430425
completed += 1
431426
await progress.render(completed)
432427
logger.info(
433-
`Stored ${storeKeyResult} bytes for key "${key}" from path "${path}"`,
428+
`Stored ${storeKeyResult} bytes for key "${key}" from path "${relativePath}"`,
434429
)
435430
}
436431
}

0 commit comments

Comments
 (0)