Iterate strings in JavaScript with a custom iterator

dmsnell · dmsnell · commit 90dbe8828761 · 2020-03-30T22:29:51.000-07:00
See google#69. In this patch I'm trying to see if we can scan through strings in the library while preserving Unicode semantics if we stay close to native JavaScript strings. That is, by only accounting for things like surrogates when we have to. As is obvious from inspection, this work is incomplete, and it was this attempt that led me to pursue converting the input strings up-front into an already-split array of Unicode scalar values.
diff --git a/javascript/diff_match_patch_uncompressed.js b/javascript/diff_match_patch_uncompressed.js
@@ -88,6 +88,111 @@ diff_match_patch.Diff.prototype.toString = function() {
   return this[0] + ',' + this[1];
 };
 
+/** Tests whether `c` starts with a high (lead) surrogate, U+D800..U+DBFF; false for '' or undefined. */
+diff_match_patch.prototype.U_is_high_surrogate = function(c) {
+  return !!c && c.charCodeAt(0) >= 0xD800 && c.charCodeAt(0) <= 0xDBFF;
+};
+
+/** Tests whether `c` starts with a low (trail) surrogate, U+DC00..U+DFFF; false for '' or undefined. */
+diff_match_patch.prototype.U_is_low_surrogate = function(c) {
+  return !!c && c.charCodeAt(0) >= 0xDC00 && c.charCodeAt(0) <= 0xDFFF;
+};
+
+/**
+ * Returns the index of the Nth code point after the given index in a string.
+ * Only a high surrogate followed by a low surrogate is stepped over as a pair.
+ * @param {string} str Given string
+ * @param {number} index Starting index
+ * @param {number=} steps Number of code points to advance (1 when omitted)
+ * @returns {number} Index in string of advanced code point; may pass the end
+ */
+diff_match_patch.prototype.U_advance = function(str, index, steps) {
+  var next = index;
+  for (var i = (!steps && 0 !== steps) ? 1 : steps; i > 0; i--) {
+    var paired = this.U_is_high_surrogate(str.charAt(next)) &&
+        this.U_is_low_surrogate(str.charAt(next + 1));  // charAt gives '' out of range
+    next += paired ? 2 : 1;
+  }
+  return next;
+};
+
+/**
+ * Returns the index of the Nth code point before the given index in a string.
+ * Inspects the units preceding the index so a surrogate pair is crossed whole.
+ * @param {string} str Given string
+ * @param {number} index Starting index
+ * @param {number=} steps Number of code points to go back (1 when omitted)
+ * @returns {number} Index in string of receded code point; may drop below 0
+ */
+diff_match_patch.prototype.U_recede = function(str, index, steps) {
+  var prev = index;
+  for (var i = (!steps && 0 !== steps) ? 1 : steps; i > 0; i--) {
+    var paired = this.U_is_low_surrogate(str.charAt(prev - 1)) &&
+        this.U_is_high_surrogate(str.charAt(prev - 2));  // charAt gives '' out of range
+    prev -= paired ? 2 : 1;
+  }
+  return prev;
+};
+
+/**
+ * @param {string} str String to measure
+ * @returns {number} Code points in `str`: a surrogate pair is one, a lone surrogate is one
+ */
+diff_match_patch.prototype.U_length = function(str) {
+  var uLength = str.length;
+  for (var i = 1; i < str.length; i++) {
+    if (this.U_is_low_surrogate(str[i]) && this.U_is_high_surrogate(str[i - 1])) {
+      uLength--;  // A well-formed pair spans two UTF-16 units but is one code point.
+    }
+  }
+  return uLength;
+};
+
+/** Returns the code point starting at `index` (a whole surrogate pair or one unit); '' if out of range. */
+diff_match_patch.prototype.U_charAt = function(str, index) {
+  var next = this.U_is_high_surrogate(str.charAt(index)) ? str.charAt(index + 1) : '';
+  return str.charAt(index) + (this.U_is_low_surrogate(next) ? next : '');
+};
+
+/**
+ * Like `String.prototype.indexOf`, but skips any match that starts or ends
+ * between the two halves of a surrogate pair in `str`.
+ *
+ * @param {string} str "Containing" string to search inside
+ * @param {string} searchValue String to search for
+ * @param {number=} fromIndex Index at which to start searching (defaults to 0)
+ * @return {number} Index of first match on code point boundaries, otherwise -1.
+ */
+diff_match_patch.prototype.U_indexOf = function(str, searchValue, fromIndex) {
+  var self = this;
+  var splitsPair = function(i) {  // Is index i between a high and a low surrogate?
+    return self.U_is_high_surrogate(str.charAt(i - 1)) &&
+        self.U_is_low_surrogate(str.charAt(i));
+  };
+  var match = str.indexOf(searchValue, fromIndex || 0);
+  while (match !== -1 && (splitsPair(match) || splitsPair(match + searchValue.length))) {
+    match = str.indexOf(searchValue, match + 1);
+  }
+  return match;
+};
+
+/**
+ * Returns closest Unicode boundary at or before a given index in a string.
+ * An index is off-boundary only when it sits between a high surrogate and the
+ * low surrogate that completes it; lone surrogates and indices at either end
+ * of the string are already boundaries.
+ *
+ * @param {string} s Containing string
+ * @param {number} index Candidate index
+ * @returns {number} closest index on or before given index which falls on a Unicode boundary
+ */
+diff_match_patch.prototype.U_boundary = function(s, index) {
+  // charAt returns '' out of range, so the ends of the string need no special case.
+  var insidePair = this.U_is_high_surrogate(s.charAt(index - 1)) &&
+      this.U_is_low_surrogate(s.charAt(index));
+
+  return insidePair ? index - 1 : index;
+};
 
 /**
  * Find the differences between two texts.  Simplifies the problem by stripping
@@ -187,7 +292,7 @@ diff_match_patch.prototype.diff_compute_ = function(text1, text2, checklines,
 
   var longtext = text1.length > text2.length ? text1 : text2;
   var shorttext = text1.length > text2.length ? text2 : text1;
-  var i = longtext.indexOf(shorttext);
+  var i = this.U_indexOf(longtext, shorttext);
   if (i != -1) {
     // Shorter text is inside the longer text (speedup).
     diffs = [new diff_match_patch.Diff(DIFF_INSERT, longtext.substring(0, i)),
@@ -317,7 +422,9 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) {
   // Cache the text lengths to prevent multiple calls.
   var text1_length = text1.length;
   var text2_length = text2.length;
-  var max_d = Math.ceil((text1_length + text2_length) / 2);
+  var text1_U_length = this.U_length(text1);
+  var text2_U_length = this.U_length(text2);
+  var max_d = Math.ceil((text1_U_length + text2_U_length) / 2);
   var v_offset = max_d;
   var v_length = 2 * max_d;
   var v1 = new Array(v_length);
@@ -330,7 +437,7 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) {
   }
   v1[v_offset + 1] = 0;
   v2[v_offset + 1] = 0;
-  var delta = text1_length - text2_length;
+  var delta = text1_U_length - text2_U_length;
   // If the total number of characters is odd, then the front path will collide
   // with the reverse path.
   var front = (delta % 2 != 0);
@@ -353,13 +460,15 @@ diff_match_patch.prototype.diff_bisect_ = function(text1, text2, deadline) {
       if (k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1])) {
         x1 = v1[k1_offset + 1];
       } else {
-        x1 = v1[k1_offset - 1] + 1;
+        x1 = this.U_advance(text1, v1[k1_offset - 1]);
       }
-      var y1 = x1 - k1;
+      var y1 = k1 > 0 ? this.U_advance(text2, x1, k1) :
+               k2 < 0 ? this.U_recede(text2, x1, k1) :
+                        x1;
       while (x1 < text1_length && y1 < text2_length &&
-             text1.charAt(x1) == text2.charAt(y1)) {
-        x1++;
-        y1++;
+             this.U_charAt(text1, x1) == this.U_charAt(text2, y1)) {
+        x1 = this.U_advance(text1, x1);
+        y1 = this.U_advance(text2, y1);
       }
       v1[k1_offset] = x1;
       if (x1 > text1_length) {
diff --git a/javascript/tests/diff_match_patch_test.js b/javascript/tests/diff_match_patch_test.js
@@ -492,6 +492,136 @@ function testDiffDelta() {
   // Convert delta string into a diff.
   assertEquivalent(diffs, dmp.diff_fromDelta(text1, delta));
 
+  diffs = [[DIFF_EQUAL, '\ud83d\ude4b\ud83d'], [DIFF_INSERT, '\ude4c\ud83d'], [DIFF_EQUAL, '\ude4b']];
+  try {
+    delta = dmp.diff_toDelta(diffs);
+    assertEquals('=2\t+%F0%9F%99%8C\t=2', delta);
+  } catch ( e ) {
+    assertEquals(false, true);
+  }
+
+  (function(){
+    const originalText = `U+1F17x	🅰️	🅱️		🅾️	🅿️ safhawifhkw
+    U+1F18x															🆎	
+    0	1	2	3	4	5	6	7	8	9	A	B	C	D	E	F
+    U+1F19x		🆑	🆒	🆓	🆔	🆕	🆖	🆗	🆘	🆙	🆚					
+    U+1F20x		🈁	🈂️							sfss.,_||saavvvbbds						
+    U+1F21x	🈚					
+    U+1F22x			🈯
+    U+1F23x			🈲	🈳	🈴	🈵	🈶	🈷️	🈸	🈹	🈺					
+    U+1F25x	🉐	🉑		
+    U+1F30x	🌀	🌁	🌂	🌃	🌄	🌅	🌆	🌇	🌈	🌉	🌊	🌋	🌌	🌍	🌎	🌏
+    U+1F31x	🌐	🌑	🌒	🌓	🌔	🌕	🌖	🌗	🌘	🌙	🌚	🌛	🌜	🌝	🌞	`;
+
+    // Applies 0-4 random edits of one randomly chosen kind (swap, remove or add) to a copy of `text` and returns the edited string.
+    function applyRandomTextEdit(text) {
+      let textArr = [...text]; // spread iterates by code point, so each surrogate pair is one entry
+      let r = Math.random(); // selects the kind of edit: one third each
+      if(r < 1/3) { // swap: exchange the entries at two random positions
+      let swapCount = Math.floor(Math.random()*5);
+        for(let i = 0; i < swapCount; i++) {
+        let swapPos1 = Math.floor(Math.random()*textArr.length);
+          let swapPos2 = Math.floor(Math.random()*textArr.length);
+          let char1 = textArr[swapPos1];
+          let char2 = textArr[swapPos2];
+          textArr[swapPos1] = char2;
+          textArr[swapPos2] = char1;
+        }
+      } else if(r < 2/3) { // remove: blank out the entries at random positions
+        let removeCount = Math.floor(Math.random()*5);
+        for(let i = 0; i < removeCount; i++) {
+          let removePos = Math.floor(Math.random()*textArr.length);
+          textArr[removePos] = ""; // keeps the array length stable; the entry vanishes on join
+        }
+      } else { // add: append a copy of one random entry to another random entry
+        let addCount = Math.floor(Math.random()*5);
+        for(let i = 0; i < addCount; i++) {
+          let addPos = Math.floor(Math.random()*textArr.length);
+          let addFromPos = Math.floor(Math.random()*textArr.length);
+          textArr[addPos] = textArr[addPos] + textArr[addFromPos];
+        }
+      }
+      return textArr.join("");
+    }
+
+    for(let i = 0; i < 1000; i++) {
+      const newText = applyRandomTextEdit(originalText);  // declared: was an implicit global assignment
+      dmp.diff_toDelta(dmp.diff_main(originalText, newText));  // result discarded; only a throw is observable
+    }
+  })();
+
+  // Unicode - splitting surrogates
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_INSERT,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
+      dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd71\ud83c\udd70\ud83c\udd71'))
+    );
+  } catch ( e ) {
+    assertEquals('Inserting similar surrogate pair at beginning', 'crashed');
+  }
+
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_EQUAL,'\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd70'], [DIFF_EQUAL, '\ud83c\udd71']]),
+      dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd70\ud83c\udd71'))
+    );
+  } catch ( e ) {
+    assertEquals('Inserting similar surrogate pair in the middle', 'crashed');
+  }
+
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_DELETE,'\ud83c\udd71'], [DIFF_EQUAL, '\ud83c\udd70\ud83c\udd71']]),
+      dmp.diff_toDelta(dmp.diff_main('\ud83c\udd71\ud83c\udd70\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
+    );
+  } catch ( e ) {
+    assertEquals('Deleting similar surrogate pair at the beginning', 'crashed');
+  }
+
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c\udd70'], [DIFF_DELETE,'\ud83c\udd72'], [DIFF_EQUAL, '\ud83c\udd71']]),
+      dmp.diff_toDelta(dmp.diff_main('\ud83c\udd70\ud83c\udd72\ud83c\udd71', '\ud83c\udd70\ud83c\udd71'))
+    );
+  } catch ( e ) {
+    assertEquals('Deleting similar surrogate pair in the middle', 'crashed');
+  }
+
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_DELETE, '\ud83c\udd70'], [DIFF_INSERT, '\ud83c\udd71']]),
+      dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_DELETE, '\udd70'], [DIFF_INSERT, '\udd71']]),
+    );
+  } catch ( e ) {
+    assertEquals('Swap surrogate pair', 'crashed');
+  }
+
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta([[DIFF_INSERT, '\ud83c\udd70'], [DIFF_DELETE, '\ud83c\udd71']]),
+      dmp.diff_toDelta([[DIFF_EQUAL, '\ud83c'], [DIFF_INSERT, '\udd70'], [DIFF_DELETE, '\udd71']]),
+    );
+  } catch ( e ) {
+    assertEquals('Swap surrogate pair', 'crashed');
+  }
+
+  // Empty diff groups
+  assertEquivalent(
+    dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_DELETE, ''], [DIFF_INSERT, 'ghijk']]),
+    dmp.diff_toDelta([[DIFF_EQUAL, 'abcdef'], [DIFF_INSERT, 'ghijk']]),
+  );
+
+  // Different versions of the library may have created deltas with
+  // half of a surrogate pair encoded as if it were valid UTF-8
+  try {
+    assertEquivalent(
+      dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '-2\t+%F0%9F%85%B1')),
+      dmp.diff_toDelta(dmp.diff_fromDelta('\ud83c\udd70', '=1\t-1\t+%ED%B5%B1'))
+    );
+  } catch ( e ) {
+    assertEquals('Decode UTF8-encoded surrogate half', 'crashed');
+  }
+
   // Verify pool of unchanged characters.
   diffs = [[DIFF_INSERT, 'A-Z a-z 0-9 - _ . ! ~ * \' ( ) ; / ? : @ & = + $ , # ']];
   var text2 = dmp.diff_text2(diffs);
@@ -963,4 +1093,4 @@ function testPatchApply() {
   patches = dmp.patch_make('y', 'y123');
   results = dmp.patch_apply(patches, 'x');
   assertEquivalent(['x123', [true]], results);
-}
+}