Skip to content

Commit 00260dd

Browse files
committed
Prevent splitting surrogate pairs when diffing
See google#69 In this patch I'm trying to follow the approach taken by @judofyr but starting from the top of the script and going through, auditing every place that perform string operations that split, index, or otherwise operate on a character level so that we can make sure that we don't split surrogate pairs. This contrasts with [attempt one] where I created a custom iterator for strings. Surprisingly I found this more "ad-hoc" approach easier to manage since it doesn't create a split universe of string/Unicode. As of this commit I haven't audited the cleanup functions but my own tests are passing so I'm given to believe that they might be safe. I have my own doubts that this is sound work and that the middle-snake algorithm might find the wrong snake when presented with variable-width characters.
1 parent 62f2e68 commit 00260dd

File tree

2 files changed

+137
-2
lines changed

2 files changed

+137
-2
lines changed

javascript/diff_match_patch_uncompressed.js

+59-1
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,15 @@ diff_match_patch.Diff.prototype.toString = function() {
8888
return this[0] + ',' + this[1];
8989
};
9090

91+
diff_match_patch.prototype.isHighSurrogate = function(c) {
92+
var v = c.charCodeAt(0);
93+
return v >= 0xD800 && v <= 0xDBFF;
94+
}
95+
96+
diff_match_patch.prototype.isLowSurrogate = function(c) {
97+
var v = c.charCodeAt(0);
98+
return v >= 0xDC00 && v <= 0xDFFF;
99+
}
91100

92101
/**
93102
* Find the differences between two texts. Simplifies the problem by stripping
@@ -187,13 +196,23 @@ diff_match_patch.prototype.diff_compute_ = function(text1, text2, checklines,
187196

188197
var longtext = text1.length > text2.length ? text1 : text2;
189198
var shorttext = text1.length > text2.length ? text2 : text1;
199+
var shortlength = shorttext.length;
190200
var i = longtext.indexOf(shorttext);
191201
if (i != -1) {
202+
// skip leading unpaired surrogate
203+
if (this.isLowSurrogate(longtext[i])) {
204+
shortlength--;
205+
i++;
206+
}
207+
// skip trailing unpaired surrogate
208+
if (this.isHighSurrogate(longtext[i + shortlength])) {
209+
shortlength--;
210+
}
192211
// Shorter text is inside the longer text (speedup).
193212
diffs = [new diff_match_patch.Diff(DIFF_INSERT, longtext.substring(0, i)),
194213
new diff_match_patch.Diff(DIFF_EQUAL, shorttext),
195214
new diff_match_patch.Diff(DIFF_INSERT,
196-
longtext.substring(i + shorttext.length))];
215+
longtext.substring(i + shortlength))];
197216
// Swap insertions for deletions if diff is reversed.
198217
if (text1.length > text2.length) {
199218
diffs[0][0] = diffs[2][0] = DIFF_DELETE;
@@ -439,6 +458,15 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) {
439458
*/
440459
diff_match_patch.prototype.diff_bisectSplit_ = function(text1, text2, x, y,
441460
deadline) {
461+
// backup if we split a surrogate
462+
if (
463+
x > 0 && x < text1.length && this.isLowSurrogate(text1[x]) &&
464+
y > 0 && y < text2.length && this.isLowSurrogate(text2[y])
465+
) {
466+
x--;
467+
y--;
468+
}
469+
442470
var text1a = text1.substring(0, x);
443471
var text2a = text2.substring(0, y);
444472
var text1b = text1.substring(x);
@@ -569,6 +597,12 @@ diff_match_patch.prototype.diff_commonPrefix = function(text1, text2) {
569597
}
570598
pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin);
571599
}
600+
601+
// shorten the prefix if it splits a surrogate
602+
if (pointermid > 0 && this.isHighSurrogate(text1[pointermid-1])) {
603+
pointermid--;
604+
}
605+
572606
return pointermid;
573607
};
574608

@@ -601,6 +635,12 @@ diff_match_patch.prototype.diff_commonSuffix = function(text1, text2) {
601635
}
602636
pointermid = Math.floor((pointermax - pointermin) / 2 + pointermin);
603637
}
638+
639+
// shorten the suffix if it splits a surrogate
640+
if (pointermid < length - 1 && this.isLowSurrogate(text1[pointermid])) {
641+
pointermid++;
642+
}
643+
604644
return pointermid;
605645
};
606646

@@ -749,6 +789,24 @@ diff_match_patch.prototype.diff_halfMatch_ = function(text1, text2) {
749789
text1_b = hm[3];
750790
}
751791
var mid_common = hm[4];
792+
793+
// move forward to prevent splitting a surrogate pair
794+
if (mid_common.length > 0 && this.isLowSurrogate(mid_common[0])) {
795+
text1_a = text1_a + mid_common[0];
796+
text2_a = text2_a + mid_common[0];
797+
mid_common = mid_common.substring(1);
798+
}
799+
800+
// back up to prevent splitting a surrogate pair
801+
if (
802+
text1_b.length > 0 && this.isLowSurrogate(text1_b[0]) &&
803+
text2_b.length > 0 && this.isLowSurrogate(text2_b[0])
804+
) {
805+
text1_b = mid_common[mid_common.length - 1] + text1_b;
806+
text2_b = mid_common[mid_common.length - 1] + text2_b;
807+
mid_common = mid_common.substring(0, -1);
808+
}
809+
752810
return [text1_a, text1_b, text2_a, text2_b, mid_common];
753811
};
754812

javascript/tests/diff_match_patch_test.js

+78-1
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,83 @@ function testDiffDelta() {
492492
// Convert delta string into a diff.
493493
assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta));
494494

495+
(function(){
496+
const originalText = `U+1F17x 🅰️ 🅱️ 🅾️ 🅿️ safhawifhkw
497+
U+1F18x 🆎
498+
0 1 2 3 4 5 6 7 8 9 A B C D E F
499+
U+1F19x 🆑 🆒 🆓 🆔 🆕 🆖 🆗 🆘 🆙 🆚
500+
U+1F20x 🈁 🈂️ sfss.,_||saavvvbbds
501+
U+1F21x 🈚
502+
U+1F22x 🈯
503+
U+1F23x 🈲 🈳 🈴 🈵 🈶 🈷️ 🈸 🈹 🈺
504+
U+1F25x 🉐 🉑
505+
U+1F30x 🌀 🌁 🌂 🌃 🌄 🌅 🌆 🌇 🌈 🌉 🌊 🌋 🌌 🌍 🌎 🌏
506+
U+1F31x 🌐 🌑 🌒 🌓 🌔 🌕 🌖 🌗 🌘 🌙 🌚 🌛 🌜 🌝 🌞 `;
507+
508+
// applies some random edits to string and returns new, edited string
509+
function applyRandomTextEdit(text) {
510+
let textArr = [...text];
511+
let r = Math.random();
512+
if(r < 1/3) { // swap
513+
let swapCount = Math.floor(Math.random()*5);
514+
for(let i = 0; i < swapCount; i++) {
515+
let swapPos1 = Math.floor(Math.random()*textArr.length);
516+
let swapPos2 = Math.floor(Math.random()*textArr.length);
517+
let char1 = textArr[swapPos1];
518+
let char2 = textArr[swapPos2];
519+
textArr[swapPos1] = char2;
520+
textArr[swapPos2] = char1;
521+
}
522+
} else if(r < 2/3) { // remove
523+
let removeCount = Math.floor(Math.random()*5);
524+
for(let i = 0; i < removeCount; i++) {
525+
let removePos = Math.floor(Math.random()*textArr.length);
526+
textArr[removePos] = "";
527+
}
528+
} else { // add
529+
let addCount = Math.floor(Math.random()*5);
530+
for(let i = 0; i < addCount; i++) {
531+
let addPos = Math.floor(Math.random()*textArr.length);
532+
let addFromPos = Math.floor(Math.random()*textArr.length);
533+
textArr[addPos] = textArr[addPos] + textArr[addFromPos];
534+
}
535+
}
536+
return textArr.join("");
537+
}
538+
539+
for(let i = 0; i < 1000; i++) {
540+
newText = applyRandomTextEdit(originalText);
541+
dmp.diff_toDelta(dmp.diff_main(originalText, newText));
542+
}
543+
})();
544+
545+
// Unicode - splitting surrogates
546+
assertEquivalent(
547+
'+%F0%9F%85%B1\t=4',
548+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71'))
549+
);
550+
551+
assertEquivalent(
552+
'-2\t=4',
553+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
554+
);
555+
556+
assertEquivalent(
557+
'=2\t-2\t=2',
558+
dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
559+
);
560+
561+
// Different versions of the library may have created deltas with
562+
// half of a surrogate pair encoded as if it were valid UTF-8
563+
try {
564+
assertEquivalent(
565+
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')),
566+
dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1'))
567+
);
568+
} catch ( e ) {
569+
assertEquals('Decode UTF8-encoded surrogate half', 'crashed');
570+
}
571+
495572
// Verify pool of unchanged characters.
496573
diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];
497574
var text2 = dmp.diff_text2(diffs);
@@ -963,4 +1040,4 @@ function testPatchApply() {
9631040
patches = dmp.patch_make('y', 'y123');
9641041
results = dmp.patch_apply(patches, 'x');
9651042
assertEquivalent(['x123', [true]], results);
966-
}
1043+
}

0 commit comments

Comments
 (0)