Skip to content

Commit 90dbe88

Browse files
committed
Iterate strings in JavaScript with a custom iterator
See google#69. In this patch I'm trying to see if we can scan through strings in the library while preserving Unicode semantics if we stay close to native JavaScript strings, that is, by only accounting for things like surrogates when we have to. As is obvious from inspection, this work is incomplete, and it was this attempt that led me to pursue converting the input strings up-front into an already-split array of Unicode scalar values.
1 parent 62f2e68 commit 90dbe88

File tree

2 files changed

+248
-9
lines changed

2 files changed

+248
-9
lines changed

javascript/diff_match_patch_uncompressed.js

Lines changed: 117 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,111 @@ diff_match_patch.Diff.prototype.toString = function() {
8888
return this[0] + ',' + this[1];
8989
};
9090

91+
/**
 * Tests whether a UTF-16 code unit is a high (leading) surrogate.
 *
 * @param {string} c Single-character string; only its first code unit is
 *     examined. Non-string values, such as the `undefined` produced by
 *     indexing past the end of a string, are never surrogates.
 * @return {boolean} True if the first code unit lies in 0xD800-0xDBFF.
 */
diff_match_patch.prototype.U_is_high_surrogate = function(c) {
  // Guard against `str[i]` lookups that ran off either end of the string.
  if (typeof c !== 'string') {
    return false;
  }
  var v = c.charCodeAt(0);
  return v >= 0xD800 && v <= 0xDBFF;
};
95+
96+
/**
 * Tests whether a UTF-16 code unit is a low (trailing) surrogate.
 *
 * @param {string} c Single-character string; only its first code unit is
 *     examined. Non-string values, such as the `undefined` produced by
 *     indexing past the end of a string, are never surrogates.
 * @return {boolean} True if the first code unit lies in 0xDC00-0xDFFF.
 */
diff_match_patch.prototype.U_is_low_surrogate = function(c) {
  // Guard against `str[i]` lookups that ran off either end of the string.
  if (typeof c !== 'string') {
    return false;
  }
  var v = c.charCodeAt(0);
  return v >= 0xDC00 && v <= 0xDFFF;
};
100+
101+
/**
 * Returns the index of the Nth code point after the given index in a string.
 *
 * A high surrogate immediately followed by a low surrogate counts as one
 * code point (two code units); every other code unit, including an unpaired
 * surrogate, counts as one. Positions at or beyond the end of the string
 * advance by one unit per step, so the result may exceed `str.length`;
 * callers are expected to bounds-check the returned index.
 *
 * @param {string} str Given string
 * @param {number} index Starting index
 * @param {number=} steps Number of code points to advance; defaults to 1
 *     when omitted. A value of 0 returns `index` unchanged.
 * @return {number} Index in string of advanced code point
 */
diff_match_patch.prototype.U_advance = function(str, index, steps) {
  var next = index;
  // Missing (but not zero) `steps` means "advance by one code point".
  var remaining = (!steps && 0 !== steps) ? 1 : steps;

  for (; remaining > 0; remaining--) {
    // Only inspect code units that exist, so the surrogate helpers are never
    // handed `undefined`.
    var isPair = next >= 0 && next + 1 < str.length &&
        this.U_is_high_surrogate(str[next]) &&
        this.U_is_low_surrogate(str[next + 1]);
    next += isPair ? 2 : 1;
  }

  return next;
};
118+
119+
/**
 * Returns the index of the Nth code point before the given index in a string.
 *
 * The code units immediately before the current position decide the step
 * size: a low surrogate preceded by a high surrogate is one code point (two
 * code units); anything else, including an unpaired surrogate, is one unit.
 * Receding past the start of the string yields a negative index; callers are
 * expected to bounds-check the returned index.
 *
 * @param {string} str Given string
 * @param {number} index Starting index
 * @param {number=} steps Number of code points to go back; defaults to 1
 *     when omitted. A value of 0 returns `index` unchanged.
 * @return {number} Index in string of receded code point
 */
diff_match_patch.prototype.U_recede = function(str, index, steps) {
  var prev = index;
  // Missing (but not zero) `steps` means "go back by one code point".
  var remaining = (!steps && 0 !== steps) ? 1 : steps;

  for (; remaining > 0; remaining--) {
    // Look at the units *before* `prev`: the unit at `prev` itself belongs to
    // the code point we are leaving, not the one we are stepping onto.
    var isPair = prev >= 2 && prev <= str.length &&
        this.U_is_low_surrogate(str[prev - 1]) &&
        this.U_is_high_surrogate(str[prev - 2]);
    prev -= isPair ? 2 : 1;
  }

  return prev;
};
136+
137+
/**
 * Counts the code points in a string.
 *
 * A high surrogate immediately followed by a low surrogate counts as one
 * code point; every other code unit, including an unpaired surrogate, counts
 * as one on its own.
 *
 * @param {string} str String to measure
 * @return {number} Number of code points in `str`
 */
diff_match_patch.prototype.U_length = function(str) {
  var length = str.length;
  var uLength = 0;

  for (var i = 0; i < length; i++) {
    // Skip the trailing half only when it really is a low surrogate, so an
    // unpaired high surrogate does not swallow the following character.
    if (i + 1 < length &&
        this.U_is_high_surrogate(str[i]) &&
        this.U_is_low_surrogate(str[i + 1])) {
      i++;
    }
    uLength++;
  }

  return uLength;
};
150+
151+
/**
 * Returns the code point that starts at the given code-unit index.
 *
 * @param {string} str String to read from
 * @param {number} index Code-unit index at which the code point starts
 * @return {string} Both halves of a surrogate pair when `index` points at a
 *     high surrogate directly followed by a low surrogate; otherwise the
 *     single code unit at `index`. Returns the empty string when `index` is
 *     outside the string, like `String.prototype.charAt`.
 */
diff_match_patch.prototype.U_charAt = function(str, index) {
  if (index < 0 || index >= str.length) {
    return '';
  }

  // Require the trailing half to exist, otherwise concatenating
  // `str[index + 1]` would append the text "undefined".
  var isPair = index + 1 < str.length &&
      this.U_is_high_surrogate(str[index]) &&
      this.U_is_low_surrogate(str[index + 1]);

  return isPair ? str.substring(index, index + 2) : str.charAt(index);
};
156+
157+
/**
 * Like `String.prototype.indexOf`, but only reports matches that do not cut
 * a surrogate pair of the containing string in half.
 *
 * A candidate match is rejected when its first code unit is the low half of
 * a pair whose high half precedes the match, or when its last code unit is
 * the high half of a pair whose low half follows the match. Rejected
 * candidates are skipped and the search continues one code unit further on.
 *
 * @param {string} str "Containing" string to search inside
 * @param {string} searchValue String to search for
 * @param {number=} fromIndex Index at which to start searching; defaults to 0
 * @return {number} Index of first acceptable match if found, otherwise -1.
 */
diff_match_patch.prototype.U_indexOf = function(str, searchValue, fromIndex) {
  var match = str.indexOf(searchValue, fromIndex || 0);

  // Iterate rather than recurse so many rejected candidates cannot exhaust
  // the call stack.
  while (match !== -1) {
    var end = match + searchValue.length;

    var splitsStart = match > 0 && match < str.length &&
        this.U_is_low_surrogate(str[match]) &&
        this.U_is_high_surrogate(str[match - 1]);
    var splitsEnd = end > 0 && end < str.length &&
        this.U_is_low_surrogate(str[end]) &&
        this.U_is_high_surrogate(str[end - 1]);

    if (!splitsStart && !splitsEnd) {
      return match;
    }

    match = str.indexOf(searchValue, match + 1);
  }

  return -1;
};
178+
179+
/**
 * Returns closest Unicode boundary at or before a given index in a string.
 *
 * An index is off-boundary only when it points at a low surrogate whose
 * preceding code unit is a high surrogate, i.e. it falls between the two
 * halves of a pair; in that case the index of the high surrogate is
 * returned. Every other index, including 0, `s.length`, and the position of
 * an unpaired surrogate, is already a boundary and is returned unchanged.
 *
 * @param {string} s Containing string
 * @param {number} index Candidate index
 * @return {number} closest index on or before given index which falls on a
 *     Unicode boundary
 */
diff_match_patch.prototype.U_boundary = function(s, index) {
  // Bounds are checked first so the surrogate helpers only ever see code
  // units that exist in the string.
  var insidePair = index > 0 && index < s.length &&
      this.U_is_low_surrogate(s[index]) &&
      this.U_is_high_surrogate(s[index - 1]);

  return insidePair ? index - 1 : index;
};
91196

92197
/**
93198
* Find the differences between two texts. Simplifies the problem by stripping
@@ -187,7 +292,7 @@ diff_match_patch.prototype.diff_compute_ = function(text1, text2, checklines,
187292

188293
var longtext = text1.length > text2.length ? text1 : text2;
189294
var shorttext = text1.length > text2.length ? text2 : text1;
190-
var i = longtext.indexOf(shorttext);
295+
var i = this.U_indexOf(longtext, shorttext);
191296
if (i != -1) {
192297
// Shorter text is inside the longer text (speedup).
193298
diffs = [new diff_match_patch.Diff(DIFF_INSERT, longtext.substring(0, i)),
@@ -317,7 +422,9 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) {
317422
// Cache the text lengths to prevent multiple calls.
318423
var text1_length = text1.length;
319424
var text2_length = text2.length;
320-
var max_d = Math.ceil((text1_length + text2_length) / 2);
425+
var text1_U_length = this.U_length(text1);
426+
var text2_U_length = this.U_length(text2);
427+
var max_d = Math.ceil((text1_U_length + text2_U_length) / 2);
321428
var v_offset = max_d;
322429
var v_length = 2 * max_d;
323430
var v1 = new Array(v_length);
@@ -330,7 +437,7 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) {
330437
}
331438
v1[v_offset + 1] = 0;
332439
v2[v_offset + 1] = 0;
333-
var delta = text1_length - text2_length;
440+
var delta = text1_U_length - text2_U_length;
334441
// If the total number of characters is odd, then the front path will collide
335442
// with the reverse path.
336443
var front = (delta % 2 != 0);
@@ -353,13 +460,15 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) {
353460
if (k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1])) {
354461
x1 = v1[k1_offset + 1];
355462
} else {
356-
x1 = v1[k1_offset - 1] + 1;
463+
x1 = this.U_advance(text1, v1[k1_offset - 1]);
357464
}
358-
var y1 = x1 - k1;
465+
var y1 = k1 > 0 ? this.U_advance(text2, x1, k1) :
466+
k2 < 0 ? this.U_recede(text2, x1, k1) :
467+
x1;
359468
while (x1 < text1_length && y1 < text2_length &&
360-
text1.charAt(x1) == text2.charAt(y1)) {
361-
x1++;
362-
y1++;
469+
this.U_charAt(text1, x1) == this.U_charAt(text2, y1)) {
470+
x1 = this.U_advance(text1, x1);
471+
y1 = this.U_advance(text2, y1);
363472
}
364473
v1[k1_offset] = x1;
365474
if (x1 > text1_length) {

javascript/tests/diff_match_patch_test.js

Lines changed: 131 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,136 @@ function testDiffDelta() {
492492
// Convert delta string into a diff.
493493
assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta));
494494

495+
diffs = [[DIFF_EQUAL, '\ud83d\ude4b\ud83d'], [DIFF_INSERT, '\ude4c\ud83d'], [DIFF_EQUAL, '\ude4b']];
496+
try {
497+
delta = dmp.diff_toDelta(diffs);
498+
assertEquals('=2\t+%F0%9F%99%8C\t=2', delta);
499+
} catch ( e ) {
500+
assertEquals(false, true);
501+
}
502+
503+
(function(){
504+
const originalText = `U+1F17x 🅰️ 🅱️ 🅾️ 🅿️ safhawifhkw
505+
U+1F18x 🆎
506+
0 1 2 3 4 5 6 7 8 9 A B C D E F
507+
U+1F19x 🆑 🆒 🆓 🆔 🆕 🆖 🆗 🆘 🆙 🆚
508+
U+1F20x 🈁 🈂️ sfss.,_||saavvvbbds
509+
U+1F21x 🈚
510+
U+1F22x 🈯
511+
U+1F23x 🈲 🈳 🈴 🈵 🈶 🈷️ 🈸 🈹 🈺
512+
U+1F25x 🉐 🉑
513+
U+1F30x 🌀 🌁 🌂 🌃 🌄 🌅 🌆 🌇 🌈 🌉 🌊 🌋 🌌 🌍 🌎 🌏
514+
U+1F31x 🌐 🌑 🌒 🌓 🌔 🌕 🌖 🌗 🌘 🌙 🌚 🌛 🌜 🌝 🌞 `;
515+
516+
// applies some random edits to string and returns new, edited string
517+
function applyRandomTextEdit(text) {
518+
let textArr = [...text];
519+
let r = Math.random();
520+
if(r < 1/3) { // swap
521+
let swapCount = Math.floor(Math.random()*5);
522+
for(let i = 0; i < swapCount; i++) {
523+
let swapPos1 = Math.floor(Math.random()*textArr.length);
524+
let swapPos2 = Math.floor(Math.random()*textArr.length);
525+
let char1 = textArr[swapPos1];
526+
let char2 = textArr[swapPos2];
527+
textArr[swapPos1] = char2;
528+
textArr[swapPos2] = char1;
529+
}
530+
} else if(r < 2/3) { // remove
531+
let removeCount = Math.floor(Math.random()*5);
532+
for(let i = 0; i < removeCount; i++) {
533+
let removePos = Math.floor(Math.random()*textArr.length);
534+
textArr[removePos] = "";
535+
}
536+
} else { // add
537+
let addCount = Math.floor(Math.random()*5);
538+
for(let i = 0; i < addCount; i++) {
539+
let addPos = Math.floor(Math.random()*textArr.length);
540+
let addFromPos = Math.floor(Math.random()*textArr.length);
541+
textArr[addPos] = textArr[addPos] + textArr[addFromPos];
542+
}
543+
}
544+
return textArr.join("");
545+
}
546+
547+
for(let i = 0; i < 1000; i++) {
548+
newText = applyRandomTextEdit(originalText);
549+
dmp.diff_toDelta(dmp.diff_main(originalText, newText));
550+
}
551+
})();
552+
553+
// Unicode - splitting surrogates
554+
try {
555+
assertEquivalent(
556+
dmp.diff_toDelta([[DIFF_INSERT,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
557+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71'))
558+
);
559+
} catch ( e ) {
560+
assertEquals('Inserting similar surrogate pair at beginning', 'crashed');
561+
}
562+
563+
try {
564+
assertEquivalent(
565+
dmp.diff_toDelta([[DIFF_EQUAL,'\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd70'], [DIFF_EQUAL, '\ud83c\udd71']]),
566+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd70\ud83c\udd71'))
567+
);
568+
} catch ( e ) {
569+
assertEquals('Inserting similar surrogate pair in the middle', 'crashed');
570+
}
571+
572+
try {
573+
assertEquivalent(
574+
dmp.diff_toDelta([[DIFF_DELETE,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
575+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
576+
);
577+
} catch ( e ) {
578+
assertEquals('Deleting similar surrogate pair at the beginning', 'crashed');
579+
}
580+
581+
try {
582+
assertEquivalent(
583+
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c\udd70'], [DIFF_DELETE,'\ud83c\udd72'], [DIFF_EQUAL, '\ud83c\udd71']]),
584+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
585+
);
586+
} catch ( e ) {
587+
assertEquals('Deleting similar surrogate pair in the middle', 'crashed');
588+
}
589+
590+
try {
591+
assertEquivalent(
592+
dmp.diff_toDelta([[DIFF_DELETE, '\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd71']]),
593+
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_DELETE, '\udd70'], [DIFF_INSERT, '\udd71']]),
594+
);
595+
} catch ( e ) {
596+
assertEquals('Swap surrogate pair', 'crashed');
597+
}
598+
599+
try {
600+
assertEquivalent(
601+
dmp.diff_toDelta([[DIFF_INSERT, '\ud83c\udd70'], [DIFF_DELETE, '\ud83c\udd71']]),
602+
dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_INSERT, '\udd70'], [DIFF_DELETE, '\udd71']]),
603+
);
604+
} catch ( e ) {
605+
assertEquals('Swap surrogate pair', 'crashed');
606+
}
607+
608+
// Empty diff groups
609+
assertEquivalent(
610+
dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']]),
611+
dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_INSERT, 'ghijk']]),
612+
);
613+
614+
// Different versions of the library may have created deltas with
615+
// half of a surrogate pair encoded as if it were valid UTF-8
616+
try {
617+
assertEquivalent(
618+
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')),
619+
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1'))
620+
);
621+
} catch ( e ) {
622+
assertEquals('Decode UTF8-encoded surrogate half', 'crashed');
623+
}
624+
495625
// Verify pool of unchanged characters.
496626
diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];
497627
var text2 = dmp.diff_text2(diffs);
@@ -963,4 +1093,4 @@ function testPatchApply() {
9631093
patches = dmp.patch_make('y', 'y123');
9641094
results = dmp.patch_apply(patches, 'x');
9651095
assertEquivalent(['x123', [true]], results);
966-
}
1096+
}

0 commit comments

Comments
 (0)