Python2: Stop breaking surrogate pairs in toDelta()

dmsnell · dmsnell · commit db1cbba2800f · 2024-01-30T16:53:42.000-07:00
Resolves google#69 for Python2. Sometimes we can find a common prefix that runs into the middle of a surrogate pair, and we split that pair when building our diff groups. This is fine as long as we are operating on UTF-16 code units. It becomes problematic when we start trying to treat those substrings as valid Unicode (or UTF-8) sequences. When we pass these split groups into `toDelta()` we do just that, and the library crashes. In this patch we post-process the diff groups before encoding them to make sure that we un-split the surrogate pairs. The post-processed diffs should produce the same output when the diffs are applied. The diff string itself will be different but should not change that much - only by a single character at surrogate boundaries.
diff --git a/python2/diff_match_patch.py b/python2/diff_match_patch.py
@@ -28,6 +28,7 @@
 __author__ = 'fraser@google.com (Neil Fraser)'
 
 import re
+import struct
 import sys
 import time
 import urllib
@@ -1135,6 +1136,14 @@ def diff_levenshtein(self, diffs):
     levenshtein += max(insertions, deletions)
     return levenshtein
 
+  @classmethod
+  def is_high_surrogate(cls, c):  # c: one UTF-16 code unit as 2 big-endian bytes
+    return 0xd800 <= struct.unpack('>H', c)[0] <= 0xdbff  # True when the unit lies in 0xD800-0xDBFF
+
+  @classmethod
+  def is_low_surrogate(cls, c):  # c: one UTF-16 code unit as 2 big-endian bytes
+    return 0xdc00 <= struct.unpack('>H', c)[0] <= 0xdfff  # True when the unit lies in 0xDC00-0xDFFF
+
   def diff_toDelta(self, diffs):
     """Crush the diff into an encoded string which describes the operations
     required to transform text1 into text2.
@@ -1148,15 +1157,32 @@ def diff_toDelta(self, diffs):
       Delta text.
     """
     text = []
+    last_end = None
     for (op, data) in diffs:
+      if 0 == len(data):
+        continue
+
+      encoded = data.encode('utf-16be')
+      this_top = encoded[0:2]
+      this_end = encoded[-2:]
+
+      if self.is_high_surrogate(this_end):
+        last_end = this_end
+        encoded = encoded[0:-2]
+
+      if last_end and self.is_high_surrogate(last_end) and self.is_low_surrogate(this_top):
+        encoded = last_end + encoded
+
+      if 0 == len(encoded):
+        continue
+
       if op == self.DIFF_INSERT:
         # High ascii will raise UnicodeDecodeError.  Use Unicode instead.
-        data = data.encode("utf-8")
-        text.append("+" + urllib.quote(data, "!~*'();/?:@&=+$,# "))
+        text.append("+" + urllib.quote(encoded.decode('utf-16be').encode('utf-8'), "!~*'();/?:@&=+$,# "))
       elif op == self.DIFF_DELETE:
-        text.append("-%d" % len(data))
+        text.append("-%d" % (len(encoded) // 2))
       elif op == self.DIFF_EQUAL:
-        text.append("=%d" % len(data))
+        text.append("=%d" % (len(encoded) // 2))
     return "\t".join(text)
 
   def diff_fromDelta(self, text1, delta):
diff --git a/python2/tests/diff_match_patch_test.py b/python2/tests/diff_match_patch_test.py
@@ -441,6 +441,86 @@ def testDiffDelta(self):
     # Convert delta string into a diff.
     self.assertEquals(diffs, self.dmp.diff_fromDelta(text1, delta))
 
+    diffs = [(self.dmp.DIFF_EQUAL, u"\ud83d\ude4b\ud83d"), (self.dmp.DIFF_INSERT, u"\ude4c\ud83d"), (self.dmp.DIFF_EQUAL, u"\ude4b")]
+    delta = self.dmp.diff_toDelta(diffs)
+    self.assertEquals("=2\t+%F0%9F%99%8C\t=2", delta)
+
+    # Unicode: split surrogates
+    # Inserting similar surrogate pair at beginning
+    self.assertEquals(
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_INSERT, u'\U0001F171'),
+        (self.dmp.DIFF_EQUAL, u'\U0001F170\U0001F171')
+      ]),
+      self.dmp.diff_toDelta(self.dmp.diff_main(
+        u'\U0001F170\U0001F171',
+        u'\U0001F171\U0001F170\U0001F171'
+      ))
+    )
+
+    # Inserting similar surrogate pair in the middle
+    self.assertEquals(
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_EQUAL, u'\U0001F170'),
+        (self.dmp.DIFF_INSERT, u'\U0001F172'),
+        (self.dmp.DIFF_EQUAL, u'\U0001F171')
+      ]),
+      self.dmp.diff_toDelta(self.dmp.diff_main(
+        u'\U0001F170\U0001F171',
+        u'\U0001F170\U0001F172\U0001F171'
+      ))
+    )
+
+    # Deleting similar surrogate pair at the beginning
+    self.assertEquals(
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_DELETE, u'\U0001F171'),
+        (self.dmp.DIFF_EQUAL, u'\U0001F170\U0001F171')
+      ]),
+      self.dmp.diff_toDelta(self.dmp.diff_main(
+        u'\U0001F171\U0001F170\U0001F171',
+        u'\U0001F170\U0001F171'
+      ))
+    )
+
+    # Deleting similar surrogate pair in the middle
+    self.assertEquals(
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_EQUAL, u'\U0001F170'),
+        (self.dmp.DIFF_DELETE, u'\U0001F172'),
+        (self.dmp.DIFF_EQUAL, u'\U0001F171')
+      ]),
+      self.dmp.diff_toDelta(self.dmp.diff_main(
+        u'\U0001F170\U0001F172\U0001F171',
+        u'\U0001F170\U0001F171'
+      ))
+    )
+
+    # Swap surrogate pair
+    self.assertEquals(
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_DELETE, u'\U0001F170'),
+        (self.dmp.DIFF_INSERT, u'\U0001F171')
+      ]),
+      self.dmp.diff_toDelta(self.dmp.diff_main(
+        u'\U0001F170',
+        u'\U0001F171'
+      ))
+    )
+
+    # Swap surrogate pair, force the invalid diff groups
+    self.assertEquals(
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_INSERT, u'\U0001F170'),
+        (self.dmp.DIFF_DELETE, u'\U0001F171')
+      ]),
+      self.dmp.diff_toDelta([
+        (self.dmp.DIFF_EQUAL, u'\ud83c'),
+        (self.dmp.DIFF_INSERT, u'\udd70'),
+        (self.dmp.DIFF_DELETE, u'\udd71')
+      ])
+    )
+
     # Verify pool of unchanged characters.
     diffs = [(self.dmp.DIFF_INSERT, "A-Z a-z 0-9 - _ . ! ~ * ' ( ) ; / ? : @ & = + $ , # ")]
     text2 = self.dmp.diff_text2(diffs)