Skip to content

Commit db1cbba

Browse files
committed
Python2: Stop breaking surrogate pairs in toDelta()
Resolves google#69 for Python2. Sometimes we can find a common prefix that runs into the middle of a surrogate pair, and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that and the library crashes. In this patch we're post-processing the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when applying the diffs. The diff string itself will be different but should not change that much - only by a single character at surrogate boundaries.
1 parent dfadc9c commit db1cbba

File tree

2 files changed

+110
-4
lines changed

2 files changed

+110
-4
lines changed

python2/diff_match_patch.py

+30-4
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
__author__ = '[email protected] (Neil Fraser)'
2929

3030
import re
31+
import struct
3132
import sys
3233
import time
3334
import urllib
@@ -1135,6 +1136,14 @@ def diff_levenshtein(self, diffs):
11351136
levenshtein += max(insertions, deletions)
11361137
return levenshtein
11371138

1139+
@classmethod
def is_high_surrogate(cls, c):
  """Report whether a UTF-16BE code unit is a high (leading) surrogate.

  Args:
    c: Byte string holding exactly one big-endian 16-bit code unit
        (two bytes); any other length makes struct.unpack raise.

  Returns:
    True if the code unit lies in 0xD800..0xDBFF inclusive, else False.
  """
  # '>H' decodes the two bytes as a single big-endian unsigned short.
  (code_unit,) = struct.unpack('>H', c)
  return 0xd800 <= code_unit <= 0xdbff
1142+
1143+
@classmethod
def is_low_surrogate(cls, c):
  """Report whether a UTF-16BE code unit is a low (trailing) surrogate.

  Args:
    c: Byte string holding exactly one big-endian 16-bit code unit
        (two bytes); any other length makes struct.unpack raise.

  Returns:
    True if the code unit lies in 0xDC00..0xDFFF inclusive, else False.
  """
  # '>H' decodes the two bytes as a single big-endian unsigned short.
  (code_unit,) = struct.unpack('>H', c)
  return 0xdc00 <= code_unit <= 0xdfff
1146+
11381147
def diff_toDelta(self, diffs):
11391148
"""Crush the diff into an encoded string which describes the operations
11401149
required to transform text1 into text2.
@@ -1148,15 +1157,32 @@ def diff_toDelta(self, diffs):
11481157
Delta text.
11491158
"""
11501159
text = []
1160+
last_end = None
11511161
for (op, data) in diffs:
1162+
if 0 == len(data):
1163+
continue
1164+
1165+
encoded = data.encode('utf-16be')
1166+
this_top = encoded[0:2]
1167+
this_end = encoded[-2:]
1168+
1169+
if self.is_high_surrogate(this_end):
1170+
last_end = this_end
1171+
encoded = encoded[0:-2]
1172+
1173+
if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top):
1174+
encoded = last_end + encoded
1175+
1176+
if 0 == len(encoded):
1177+
continue
1178+
11521179
if op == self.DIFF_INSERT:
11531180
# High ascii will raise UnicodeDecodeError. Use Unicode instead.
1154-
data = data.encode("utf-8")
1155-
text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# "))
1181+
text.append("+" + urllib.quote(encoded.decode('utf-16be').encode('utf-8'), "!~*'();/?:@&=+$,# "))
11561182
elif op == self.DIFF_DELETE:
1157-
text.append("-%d" % len(data))
1183+
text.append("-%d" % (len(encoded) // 2))
11581184
elif op == self.DIFF_EQUAL:
1159-
text.append("=%d" % len(data))
1185+
text.append("=%d" % (len(encoded) // 2))
11601186
return "\t".join(text)
11611187

11621188
def diff_fromDelta(self, text1, delta):

python2/tests/diff_match_patch_test.py

+80
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,86 @@ def testDiffDelta(self):
441441
# Convert delta string into a diff.
442442
self.assertEquals(diffs, self.dmp.diff_fromDelta(text1, delta))
443443

444+
diffs = [(self.dmp.DIFF_EQUAL, u"\ud83d\ude4b\ud83d"), (self.dmp.DIFF_INSERT, u"\ude4c\ud83d"), (self.dmp.DIFF_EQUAL, u"\ude4b")]
445+
delta = self.dmp.diff_toDelta(diffs)
446+
self.assertEquals("=2\t+%F0%9F%99%8C\t=2", delta)
447+
448+
# Unicode: split surrogates
449+
# Inserting similar surrogate pair at beginning
450+
self.assertEquals(
451+
self.dmp.diff_toDelta([
452+
(self.dmp.DIFF_INSERT, u'\U0001F171'),
453+
(self.dmp.DIFF_EQUAL, u'\U0001F170\U0001F171')
454+
]),
455+
self.dmp.diff_toDelta(self.dmp.diff_main(
456+
u'\U0001F170\U0001F171',
457+
u'\U0001F171\U0001F170\U0001F171'
458+
))
459+
)
460+
461+
# Inserting similar surrogate pair in the middle
462+
self.assertEquals(
463+
self.dmp.diff_toDelta([
464+
(self.dmp.DIFF_EQUAL, u'\U0001F170'),
465+
(self.dmp.DIFF_INSERT, u'\U0001F172'),
466+
(self.dmp.DIFF_EQUAL, u'\U0001F171')
467+
]),
468+
self.dmp.diff_toDelta(self.dmp.diff_main(
469+
u'\U0001F170\U0001F171',
470+
u'\U0001F170\U0001F172\U0001F171'
471+
))
472+
)
473+
474+
# Deleting similar surrogate pair at the beginning
475+
self.assertEquals(
476+
self.dmp.diff_toDelta([
477+
(self.dmp.DIFF_DELETE, u'\U0001F171'),
478+
(self.dmp.DIFF_EQUAL, u'\U0001F170\U0001F171')
479+
]),
480+
self.dmp.diff_toDelta(self.dmp.diff_main(
481+
u'\U0001F171\U0001F170\U0001F171',
482+
u'\U0001F170\U0001F171'
483+
))
484+
)
485+
486+
# Deleting similar surrogate pair in the middle
487+
self.assertEquals(
488+
self.dmp.diff_toDelta([
489+
(self.dmp.DIFF_EQUAL, u'\U0001F170'),
490+
(self.dmp.DIFF_DELETE, u'\U0001F172'),
491+
(self.dmp.DIFF_EQUAL, u'\U0001F171')
492+
]),
493+
self.dmp.diff_toDelta(self.dmp.diff_main(
494+
u'\U0001F170\U0001F172\U0001F171',
495+
u'\U0001F170\U0001F171'
496+
))
497+
)
498+
499+
# Swap surrogate pair
500+
self.assertEquals(
501+
self.dmp.diff_toDelta([
502+
(self.dmp.DIFF_DELETE, u'\U0001F170'),
503+
(self.dmp.DIFF_INSERT, u'\U0001F171')
504+
]),
505+
self.dmp.diff_toDelta(self.dmp.diff_main(
506+
u'\U0001F170',
507+
u'\U0001F171'
508+
))
509+
)
510+
511+
# Swap surrogate pair, force the invalid diff groups
512+
self.assertEquals(
513+
self.dmp.diff_toDelta([
514+
(self.dmp.DIFF_INSERT, u'\U0001F170'),
515+
(self.dmp.DIFF_DELETE, u'\U0001F171')
516+
]),
517+
self.dmp.diff_toDelta([
518+
(self.dmp.DIFF_EQUAL, u'\ud83c'),
519+
(self.dmp.DIFF_INSERT, u'\udd70'),
520+
(self.dmp.DIFF_DELETE, u'\udd71')
521+
])
522+
)
523+
444524
# Verify pool of unchanged characters.
445525
diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")]
446526
text2 = self.dmp.diff_text2(diffs)

0 commit comments

Comments
 (0)