-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsurgicalDiff.js
More file actions
83 lines (77 loc) · 2.85 KB
/
surgicalDiff.js
File metadata and controls
83 lines (77 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
'use strict';
/**
* Word-level diff used to make AI edits surgical at the changeset level.
*
* When the AI returns a {findText, replaceText} pair, applying it as a
* single splice means every character of replaceText becomes AI-authored
* — even words that already existed verbatim in findText (so were never
* actually rewritten by the AI). Diffing the two strings first and only
* inserting the genuinely-new runs preserves the original author's
* attribution on the unchanged spans.
*
* The diff is word-and-whitespace tokenised: tokens are runs of either
* non-space chars or whitespace. That gives clean breakpoints — a one-
* word change doesn't accidentally re-author the surrounding words just
* because a different mid-word substring happens to LCS-match.
*
* Returns a list of {type, text} ops where type is 'keep' | 'remove' |
* 'insert'. Adjacent same-type ops are coalesced so the caller can map
* each op directly to a single Changeset.Builder operation.
*/
/**
 * Split a string into alternating runs of non-whitespace and whitespace.
 * Concatenating the returned tokens reproduces the input exactly.
 *
 * @param {string} s - Text to split.
 * @returns {string[]} Tokens in source order; an empty array for ''.
 */
const tokenize = (s) => {
  const runs = s.match(/\S+|\s+/g);
  // String.prototype.match yields null (not []) when nothing matches.
  return runs === null ? [] : runs;
};
/**
 * Build the longest-common-subsequence length table for two sequences.
 *
 * Cell [i][j] holds the LCS length of the first i items of `a` and the
 * first j items of `b`; row 0 and column 0 stay at zero.
 *
 * @param {ArrayLike<*>} a - First sequence (compared with ===).
 * @param {ArrayLike<*>} b - Second sequence.
 * @returns {number[][]} Table with a.length + 1 rows and b.length + 1 columns.
 */
const lcsTable = (a, b) => {
  const rowCount = a.length + 1;
  const colCount = b.length + 1;
  const table = Array.from({length: rowCount}, () => new Array(colCount).fill(0));

  for (let r = 0; r < a.length; r++) {
    const above = table[r];
    const current = table[r + 1];
    for (let c = 0; c < b.length; c++) {
      // Match: extend the diagonal. Mismatch: carry the better neighbour.
      current[c + 1] = a[r] === b[c]
        ? above[c] + 1
        : Math.max(above[c + 1], current[c]);
    }
  }
  return table;
};
/**
 * Diff `oldText` -> `newText` at the token level (whitespace-aware) and
 * return the keep/remove/insert ops that transform one into the other.
 *
 * Both strings are split with `tokenize`, a longest-common-subsequence
 * table is built with `lcsTable`, and the table is walked backwards to
 * recover the ops. Where a removal and an insertion fall at the same
 * position, the removal comes first in the output.
 *
 * @param {string} oldText - Text being replaced.
 * @param {string} newText - Replacement text.
 * @returns {Array<{type: ('keep'|'remove'|'insert'), text: string}>}
 *   Ops in document order with adjacent same-type ops merged. Empty when
 *   both inputs are the empty string.
 */
const diffOps = (oldText, newText) => {
  // Identical inputs need no table: the whole string is a single keep.
  if (oldText === newText) {
    return oldText.length ? [{type: 'keep', text: oldText}] : [];
  }
  const a = tokenize(oldText);
  const b = tokenize(newText);
  const dp = lcsTable(a, b);

  // Walk from the bottom-right corner of the table back to the origin.
  // Ops are appended in reverse order: push is constant time, whereas
  // unshift would re-index the whole array for every token.
  const reversed = [];
  let i = a.length;
  let j = b.length;
  while (i > 0 || j > 0) {
    if (i > 0 && j > 0 && a[i - 1] === b[j - 1]) {
      reversed.push({type: 'keep', text: a[i - 1]});
      i--;
      j--;
    } else if (j > 0 && (i === 0 || dp[i][j - 1] >= dp[i - 1][j])) {
      // On ties take the insert first while walking backwards, so the
      // matching remove ends up before it in document order.
      reversed.push({type: 'insert', text: b[j - 1]});
      j--;
    } else {
      reversed.push({type: 'remove', text: a[i - 1]});
      i--;
    }
  }

  // Read the reversed list back-to-front (i.e. in document order) and
  // coalesce adjacent same-type ops so the caller can emit one Builder
  // operation per chunk instead of one per token.
  const coalesced = [];
  for (let k = reversed.length - 1; k >= 0; k--) {
    const op = reversed[k];
    const last = coalesced[coalesced.length - 1];
    if (last && last.type === op.type) last.text += op.text;
    else coalesced.push(op);
  }
  return coalesced;
};
/**
 * Count the line-feed (U+000A) characters in a string.
 *
 * @param {string} s - Text to scan.
 * @returns {number} Number of '\n' occurrences; 0 for the empty string.
 */
const countNewlines = (s) => {
  let total = 0;
  // Hop from one line feed to the next instead of testing every code unit.
  let at = s.indexOf('\n');
  while (at !== -1) {
    total += 1;
    at = s.indexOf('\n', at + 1);
  }
  return total;
};
// Public API of this module.
Object.assign(exports, {tokenize, diffOps, countNewlines});