Bug 1940795: Don't tear aligned subwords in AtomicMemcpy. r=spidermonkey-reviewers,jandem

anba · anba · commit ecf9a0d48100 · 2025-01-14T09:01:35.000Z
Reads/writes to integer typed arrays mustn't tear, so they can't be split into single byte operations. Replace single byte copy loops with `AtomicCopy{Down,Up}NoTearIfAlignedUnsynchronized`, which performs additional checks when aligned word or dword copying is needed. Test262 tests: <tc39/test262#4369> Differential Revision: https://phabricator.services.mozilla.com/D233710
diff --git a/js/src/jit/GenerateAtomicOperations.py b/js/src/jit/GenerateAtomicOperations.py
@@ -664,6 +664,9 @@ def gen_copy(fun_name, cpp_type, size, unroll, direction):
             if size == 1:
                 insns += fmt_insn("movb OFFSET(%[src]), %[scratch]")
                 insns += fmt_insn("movb %[scratch], OFFSET(%[dst])")
+            elif size == 2:
+                insns += fmt_insn("movw OFFSET(%[src]), %[scratch]")
+                insns += fmt_insn("movw %[scratch], OFFSET(%[dst])")
             elif size == 4:
                 insns += fmt_insn("movl OFFSET(%[src]), %[scratch]")
                 insns += fmt_insn("movl %[scratch], OFFSET(%[dst])")
@@ -675,6 +678,12 @@ def gen_copy(fun_name, cpp_type, size, unroll, direction):
             if size == 1:
                 insns += fmt_insn("ldrb %w[scratch], [%x[src], OFFSET]")
                 insns += fmt_insn("strb %w[scratch], [%x[dst], OFFSET]")
+            elif size == 2:
+                insns += fmt_insn("ldrh %w[scratch], [%x[src], OFFSET]")
+                insns += fmt_insn("strh %w[scratch], [%x[dst], OFFSET]")
+            elif size == 4:
+                insns += fmt_insn("ldr %w[scratch], [%x[src], OFFSET]")
+                insns += fmt_insn("str %w[scratch], [%x[dst], OFFSET]")
             else:
                 assert size == 8
                 insns += fmt_insn("ldr %x[scratch], [%x[src], OFFSET]")
@@ -683,6 +692,9 @@ def gen_copy(fun_name, cpp_type, size, unroll, direction):
             if size == 1:
                 insns += fmt_insn("ldrb %[scratch], [%[src], #OFFSET]")
                 insns += fmt_insn("strb %[scratch], [%[dst], #OFFSET]")
+            elif size == 2:
+                insns += fmt_insn("ldrh %[scratch], [%[src], #OFFSET]")
+                insns += fmt_insn("strh %[scratch], [%[dst], #OFFSET]")
             else:
                 assert size == 4
                 insns += fmt_insn("ldr %[scratch], [%[src], #OFFSET]")
@@ -864,7 +876,9 @@ def generate_atomics_header(c_out):
         contents += gen_copy(
             "AtomicCopyWordUnsynchronized", "uintptr_t", wordsize, 1, "down"
         )
-        contents += gen_copy("AtomicCopyByteUnsynchronized", "uint8_t", 1, 1, "down")
+        contents += gen_copy("AtomicCopy32Unsynchronized", "uint32_t", 4, 1, "down")
+        contents += gen_copy("AtomicCopy16Unsynchronized", "uint16_t", 2, 1, "down")
+        contents += gen_copy("AtomicCopy8Unsynchronized", "uint8_t", 1, 1, "down")
 
         contents += "\n"
         contents += (
diff --git a/js/src/jit/shared/AtomicOperations-shared-jit.cpp b/js/src/jit/shared/AtomicOperations-shared-jit.cpp
@@ -4,18 +4,26 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
 
+#include "mozilla/Assertions.h"
+#include "mozilla/Attributes.h"
+#include "mozilla/MathAlgorithms.h"
+
+#include <atomic>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <tuple>
+#include <utility>
+
 #include "jit/AtomicOperations.h"
+#include "js/GCAPI.h"
 
 #if defined(__arm__)
 #  include "jit/arm/Architecture-arm.h"
 #endif
 
 #ifdef JS_HAVE_GENERATED_ATOMIC_OPS
 
-#  include <atomic>
-
-#  include "js/GCAPI.h"
-
 using namespace js;
 using namespace js::jit;
 
@@ -70,6 +78,64 @@ void AtomicCompilerFence() {
 }
 #  endif
 
+/**
+ * Return `true` if `dest`, `src`, and `lim` are all `Alignment`-byte aligned.
+ */
+template <size_t Alignment>
+static inline bool CanCopyAligned(const uint8_t* dest, const uint8_t* src,
+                                  const uint8_t* lim) {
+  static_assert(mozilla::IsPowerOfTwo(Alignment));
+  return ((uintptr_t(dest) | uintptr_t(src) | uintptr_t(lim)) &
+          (Alignment - 1)) == 0;
+}
+
+/**
+ * Return `true` if `dest` and `src` are congruent modulo `Alignment`, i.e. the
+ * same byte adjustment brings both pointers onto an `Alignment` boundary.
+ */
+template <size_t Alignment>
+static inline bool CanAlignTo(const uint8_t* dest, const uint8_t* src) {
+  static_assert(mozilla::IsPowerOfTwo(Alignment));
+  return ((uintptr_t(dest) ^ uintptr_t(src)) & (Alignment - 1)) == 0;
+}
+
+/**
+ * Copy [src, srcEnd) (fewer than `WORDSIZE` bytes) forward to `dest`; returns
+ * the advanced `{dest, src}`. Uses one 32-bit copy (when `WORDSIZE > 4`) or
+ * 16-bit copies if `dest`, `src` and `srcEnd` are all so aligned, else bytes.
+ *
+ * No tearing is a requirement for integer TypedArrays:
+ * https://tc39.es/ecma262/#sec-isnotearconfiguration
+ * https://tc39.es/ecma262/#sec-tear-free-aligned-reads
+ * https://tc39.es/ecma262/#sec-valid-executions
+ */
+static MOZ_ALWAYS_INLINE auto AtomicCopyDownNoTearIfAlignedUnsynchronized(
+    uint8_t* dest, const uint8_t* src, const uint8_t* srcEnd) {
+  MOZ_ASSERT(src <= srcEnd);
+  MOZ_ASSERT(size_t(srcEnd - src) < WORDSIZE);
+
+  if (WORDSIZE > 4 && CanCopyAligned<4>(dest, src, srcEnd)) {
+    static_assert(WORDSIZE <= 8, "copies 32-bits at most once");
+
+    if (src < srcEnd) {
+      AtomicCopy32Unsynchronized(dest, src);
+      dest += 4;
+      src += 4;
+    }
+  } else if (CanCopyAligned<2>(dest, src, srcEnd)) {
+    while (src < srcEnd) {
+      AtomicCopy16Unsynchronized(dest, src);
+      dest += 2;
+      src += 2;
+    }
+  } else {
+    while (src < srcEnd) {
+      AtomicCopy8Unsynchronized(dest++, src++);
+    }
+  }
+  return std::pair{dest, src};
+}
+
 void AtomicMemcpyDownUnsynchronized(uint8_t* dest, const uint8_t* src,
                                     size_t nbytes) {
   JS::AutoSuppressGCAnalysis nogc;
@@ -85,12 +151,14 @@ void AtomicMemcpyDownUnsynchronized(uint8_t* dest, const uint8_t* src,
     void (*copyBlock)(uint8_t* dest, const uint8_t* src);
     void (*copyWord)(uint8_t* dest, const uint8_t* src);
 
-    if (((uintptr_t(dest) ^ uintptr_t(src)) & WORDMASK) == 0) {
+    if (CanAlignTo<WORDSIZE>(dest, src)) {
       const uint8_t* cutoff = (const uint8_t*)RoundUp(uintptr_t(src), WORDSIZE);
       MOZ_ASSERT(cutoff <= lim);  // because nbytes >= WORDSIZE
-      while (src < cutoff) {
-        AtomicCopyByteUnsynchronized(dest++, src++);
-      }
+
+      // Copy initial bytes to align to word size.
+      std::tie(dest, src) =
+          AtomicCopyDownNoTearIfAlignedUnsynchronized(dest, src, cutoff);
+
       copyBlock = AtomicCopyBlockDownUnsynchronized;
       copyWord = AtomicCopyWordUnsynchronized;
     } else if (UnalignedAccessesAreOK()) {
@@ -118,11 +186,46 @@ void AtomicMemcpyDownUnsynchronized(uint8_t* dest, const uint8_t* src,
     }
   }
 
-  // Byte copy any remaining tail.
+  // Copy any remaining tail.
 
-  while (src < lim) {
-    AtomicCopyByteUnsynchronized(dest++, src++);
+  AtomicCopyDownNoTearIfAlignedUnsynchronized(dest, src, lim);
+}
+
+/**
+ * Copy [srcBegin, src) (fewer than `WORDSIZE` bytes) backward so it ends at
+ * `dest`; returns the lowered `{dest, src}`. Uses one 32-bit copy (when
+ * `WORDSIZE > 4`) or 16-bit copies if all pointers are so aligned, else bytes.
+ *
+ * No tearing is a requirement for integer TypedArrays:
+ * https://tc39.es/ecma262/#sec-isnotearconfiguration
+ * https://tc39.es/ecma262/#sec-tear-free-aligned-reads
+ * https://tc39.es/ecma262/#sec-valid-executions
+ */
+static MOZ_ALWAYS_INLINE auto AtomicCopyUpNoTearIfAlignedUnsynchronized(
+    uint8_t* dest, const uint8_t* src, const uint8_t* srcBegin) {
+  MOZ_ASSERT(src >= srcBegin);
+  MOZ_ASSERT(size_t(src - srcBegin) < WORDSIZE);
+
+  if (WORDSIZE > 4 && CanCopyAligned<4>(dest, src, srcBegin)) {
+    static_assert(WORDSIZE <= 8, "copies 32-bits at most once");
+
+    if (src > srcBegin) {
+      dest -= 4;
+      src -= 4;
+      AtomicCopy32Unsynchronized(dest, src);
+    }
+  } else if (CanCopyAligned<2>(dest, src, srcBegin)) {
+    while (src > srcBegin) {
+      dest -= 2;
+      src -= 2;
+      AtomicCopy16Unsynchronized(dest, src);
+    }
+  } else {
+    while (src > srcBegin) {
+      AtomicCopy8Unsynchronized(--dest, --src);
+    }
   }
+  return std::pair{dest, src};
 }
 
 void AtomicMemcpyUpUnsynchronized(uint8_t* dest, const uint8_t* src,
@@ -134,16 +237,23 @@ void AtomicMemcpyUpUnsynchronized(uint8_t* dest, const uint8_t* src,
   src += nbytes;
   dest += nbytes;
 
+  // Set up bulk copying.  The cases are ordered the way they are on the
+  // assumption that if we can achieve aligned copies even with a little
+  // preprocessing then that is better than unaligned copying on a platform
+  // that supports it.
+
   if (nbytes >= WORDSIZE) {
     void (*copyBlock)(uint8_t* dest, const uint8_t* src);
     void (*copyWord)(uint8_t* dest, const uint8_t* src);
 
-    if (((uintptr_t(dest) ^ uintptr_t(src)) & WORDMASK) == 0) {
+    if (CanAlignTo<WORDSIZE>(dest, src)) {
       const uint8_t* cutoff = (const uint8_t*)(uintptr_t(src) & ~WORDMASK);
       MOZ_ASSERT(cutoff >= lim);  // Because nbytes >= WORDSIZE
-      while (src > cutoff) {
-        AtomicCopyByteUnsynchronized(--dest, --src);
-      }
+
+      // Copy initial bytes to align to word size.
+      std::tie(dest, src) =
+          AtomicCopyUpNoTearIfAlignedUnsynchronized(dest, src, cutoff);
+
       copyBlock = AtomicCopyBlockUpUnsynchronized;
       copyWord = AtomicCopyWordUnsynchronized;
     } else if (UnalignedAccessesAreOK()) {
@@ -154,6 +264,8 @@ void AtomicMemcpyUpUnsynchronized(uint8_t* dest, const uint8_t* src,
       copyWord = AtomicCopyUnalignedWordUpUnsynchronized;
     }
 
+    // Bulk copy, first larger blocks and then individual words.
+
     const uint8_t* blocklim = src - ((src - lim) & ~BLOCKMASK);
     while (src > blocklim) {
       dest -= BLOCKSIZE;
@@ -169,9 +281,9 @@ void AtomicMemcpyUpUnsynchronized(uint8_t* dest, const uint8_t* src,
     }
   }
 
-  while (src > lim) {
-    AtomicCopyByteUnsynchronized(--dest, --src);
-  }
+  // Copy any remaining tail.
+
+  AtomicCopyUpNoTearIfAlignedUnsynchronized(dest, src, lim);
 }
 
 }  // namespace jit