diff --git a/src/culverin/_culverin_c.pyi b/src/culverin/_culverin_c.pyi
index 1940c56f..3da61044 100644
--- a/src/culverin/_culverin_c.pyi
+++ b/src/culverin/_culverin_c.pyi
@@ -197,4 +197,16 @@ class PhysicsWorld:
     def get_active_indices(self) -> bytes: ...
     def get_render_state(self, alpha: float) -> bytes: ...
     def save_state(self) -> bytes: ...
-    def load_state(self, state: bytes) -> None: ...
\ No newline at end of file
+    def load_state(self, state: bytes) -> None: ...
+
+    # --- Internal / Benchmarking ---
+    def _benchmark_parse(self, *args: Any, **kwargs: Any) -> None: ...
+
+def _dump_schema_json() -> None:
+    """
+    Internal: dump the schema to culverin_schema.json.
+
+    This is a module-level function of the C extension. It performs file I/O
+    directly and may raise OSError if the file cannot be opened.
+    """
+    ...
\ No newline at end of file
diff --git a/src/culverin/culverin.c b/src/culverin/culverin.c
index 40bc5031..9c39d07c 100644
--- a/src/culverin/culverin.c
+++ b/src/culverin/culverin.c
@@ -1234,6 +1234,8 @@ PyCFunction_DeclareMethod PhysicsWorld_step(PhysicsWorldObject *self, PyObject *
     // --- PHASE 2: JOLT CRUNCH (GIL Released) ---
     Py_BEGIN_ALLOW_THREADS NATIVE_MUTEX_LOCK(g_jph_trampoline_lock);
 
+    CULV_PROFILE_BEGIN(jolt_step);
+
     // 1. Process Batch Mutations (Shadow-to-Jolt)
     if (captured_count > 0) {
         flush_commands_internal(self, captured_queue, captured_count);
@@ -1257,6 +1259,10 @@ PyCFunction_DeclareMethod PhysicsWorld_step(PhysicsWorldObject *self, PyObject *
     // This is the CRITICAL FIX for the stale handle race.
     culverin_sync_shadow_buffers(self);
 
+    // We use captured_count as the unit, but use 1 if captured_count is 0 
+    // to avoid division by zero or empty reporting.
+    CULV_PROFILE_END(jolt_step, "Jolt Physics Crunch", (captured_count > 0 ? (unsigned int)captured_count : 1));
+
     NATIVE_MUTEX_UNLOCK(g_jph_trampoline_lock);
     Py_END_ALLOW_THREADS
 
@@ -4217,6 +4223,8 @@ static int culverin_exec(PyObject *m) {
 
     culverin_init_all_parsers();
 
+    CULV_INIT_PROFILER();
+
     // REGISTER FILTERS ONCE HERE
     // This connects the logic (filter_allow_all_bp, UnifiedBodyFilter, etc.)
     // to the JoltC filter objects globally.
diff --git a/src/culverin/culverin_compiler_specifics.h b/src/culverin/culverin_compiler_specifics.h
index aabaca77..61a738fe 100644
--- a/src/culverin/culverin_compiler_specifics.h
+++ b/src/culverin/culverin_compiler_specifics.h
@@ -44,6 +44,13 @@ static inline void culv_unreachable(void) {
 
 // #define CULVERIN_PROFILE_SYNC
 
+typedef struct { // NOTE(review): uint64_t is used above the <stdint.h> include below — confirm
+    uint64_t total_cycles; // Sum of the elapsed values of all accumulated samples
+    uint64_t min_cycles;   // Smallest single sample; users seed it with UINT64_MAX
+    uint64_t max_cycles;   // Largest single sample
+    uint32_t count;        // Number of samples accumulated so far
+} CulvStat;
+
 #ifdef CULVERIN_PROFILE_SYNC
 #    include <inttypes.h>
 #    include <stdint.h>
@@ -213,11 +220,24 @@ static inline uint64_t culv_read_end(void) {
             }                                                                                      \
         } while (0)
 
+#    define CULV_PROFILE_ACCUMULATE(tag, stat_ptr)                                                 \
+        do { /* Adds one sample to *stat_ptr; stat_ptr is evaluated more than once */              \
+            uint64_t _end     = culv_read_end();                                                   \
+            uint64_t _elapsed = _end - _culv_start_##tag; /* start: presumably BEGIN(tag) */       \
+            (stat_ptr)->total_cycles += _elapsed;                                                  \
+            if (_elapsed < (stat_ptr)->min_cycles) /* track the smallest sample */                 \
+                (stat_ptr)->min_cycles = _elapsed;                                                 \
+            if (_elapsed > (stat_ptr)->max_cycles) /* track the largest sample */                  \
+                (stat_ptr)->max_cycles = _elapsed;                                                 \
+            (stat_ptr)->count++;                                                                   \
+        } while (0)
+
 #else /* CULVERIN_PROFILE_SYNC not defined */
 
 #    define CULV_INIT_PROFILER() ((void)0)
 #    define CULV_PROFILE_BEGIN(tag) ((void)0)
 #    define CULV_PROFILE_END(tag, label, count) ((void)0)
+#    define CULV_PROFILE_ACCUMULATE(tag, stat_ptr) ((void)0)
 
 #endif /* CULVERIN_PROFILE_SYNC */
 
@@ -240,23 +260,28 @@ static inline uint64_t culv_read_end(void) {
 #endif
 
 // Use a nested check to avoid the "macro not defined" evaluation error
-#if defined(__has_c_attribute)
-#    if __has_c_attribute(nodiscard)
-#        define CULV_NODISCARD [[nodiscard]]
-#        define CULV_MAYBE_UNUSED [[maybe_unused]]
+#ifndef __cplusplus
+#    if defined(__has_c_attribute)
+#        if __has_c_attribute(nodiscard)
+#            define CULV_NODISCARD [[nodiscard]]
+#            define CULV_MAYBE_UNUSED [[maybe_unused]]
+#        else
+#            define CULV_NODISCARD
+#            define CULV_MAYBE_UNUSED
+#        endif
+#    elif defined(_MSC_VER)
+#        define CULV_NODISCARD _Check_return_
+#        define CULV_MAYBE_UNUSED
+#    elif defined(__GNUC__) || defined(__clang__)
+#        define CULV_NODISCARD __attribute__((warn_unused_result))
+#        define CULV_MAYBE_UNUSED __attribute__((unused))
 #    else
 #        define CULV_NODISCARD
 #        define CULV_MAYBE_UNUSED
 #    endif
-#elif defined(_MSC_VER)
-#    define CULV_NODISCARD _Check_return_
-#    define CULV_MAYBE_UNUSED
-#elif defined(__GNUC__) || defined(__clang__)
-#    define CULV_NODISCARD __attribute__((warn_unused_result))
-#    define CULV_MAYBE_UNUSED __attribute__((unused))
 #else
-#    define CULV_NODISCARD
-#    define CULV_MAYBE_UNUSED
+#    define CULV_NODISCARD [[nodiscard]]
+#    define CULV_MAYBE_UNUSED [[maybe_unused]]
 #endif
 
 // --- Compiler Assume Hint ---
@@ -336,6 +361,11 @@ CULV_MAYBE_UNUSED static constexpr size_t MEMORY_ALIGNMENT_SIZE = 64;
  * ==================== INTERNALS BELOW THIS LINE ===================================
  * ==================================================================================
  */
+#define UNSAFE_NULLPTR // Selects the non-volatile variants of the null identity helpers below
+                       // (culv_internal_impl_null / culv_take_return_null). NOTE(review): this is
+                       // defined unconditionally here, so the `#if !defined(UNSAFE_NULLPTR)`
+                       // branches are never compiled. Comment this line out to get the
+                       // volatile-qualified parameter variants back.
 #ifndef __cplusplus
 // C version with a simple cast. We rely on the caller to only pass null pointer constants, and we
 // can't enforce that at compile time in C, but we can at least provide a clear function name to
@@ -358,28 +388,53 @@ CULV_MAYBE_UNUSED static constexpr size_t MEMORY_ALIGNMENT_SIZE = 64;
  * @return A qualified-stripped null pointer constant.
  * @note complexity: O(1)
  * @warning Do not remove volatile; prevents aggressive dead-code elimination in
- * strict-aliasing scenarios involving hardware-backed null states.
+ * strict-aliasing scenarios involving hardware-backed null states. Define UNSAFE_NULLPTR to disable
+ * volatile qualification, but be aware this may lead to undefined behavior if the compiler
+ * optimizes away the null check.
  */
 /*@
   ensures \result == \null;
   assigns \nothing;
 */
-CULV_MAYBE_UNUSED CULV_NODISCARD static CULV_FORCE_INLINE nullptr_t
-culv_internal_impl_null(CULV_MAYBE_UNUSED const volatile typeof_unqual(nullptr) ptr) {
+#        if !defined(UNSAFE_NULLPTR)
+CULV_MAYBE_UNUSED CULV_NODISCARD static CULV_FORCE_INLINE
+    nullptr_t culv_internal_impl_null(CULV_MAYBE_UNUSED const volatile typeof_unqual(nullptr) ptr) {
     return (typeof_unqual(nullptr))(ptr);
 }
-
+#        else
+CULV_MAYBE_UNUSED CULV_NODISCARD static CULV_FORCE_INLINE
+    nullptr_t culv_internal_impl_null(CULV_MAYBE_UNUSED const typeof_unqual(nullptr) ptr) {
+    CULV_MAYBE_UNUSED register const nullptr_t null_ptr =
+        (nullptr_t)ptr; // Identity transformation on the null-set. This is the UNSAFE_NULLPTR
+                        // variant: neither the parameter nor this local is volatile-qualified.
+                        // NOTE(review): `register` is only a storage-class hint; it does not
+                        // prevent the compiler from optimizing this local away.
+    return (typeof_unqual(nullptr))(null_ptr);
+}
+#        endif
 CULV_FORCE_INLINE nullptr_t culv_static_assert_failure(CULV_MAYBE_UNUSED nullptr_t x) {
     // This function is never meant to be called; it's only used in a static_assert context to cause
     // a compile-time failure when the macro is misused. The parameter is just there to make it a
     // valid function and to provide a type for the static_assert.
     culv_unreachable();
-    constexpr _BitInt(128) dummy = 0; // Use an excessively wide integer type to ensure this function can never be called
-    // Instead of a direct cast, we use an intermediate void pointer 
+// Pick the _BitInt width for the dummy constant below: 1021 bits by default, 127 bits when the
+// compiler reports __BITINT_MAXWIDTH__ < 1021 (both widths happen to be prime). The constant is
+// zero either way, so the chosen width does not change the value that ends up in the pointer.
+#        if defined(__BITINT_MAXWIDTH__) && __BITINT_MAXWIDTH__ < 1021
+    constexpr size_t BIT_SIZE =
+        127; // macOS's Clang has limited support for _BitInt, so we use the largest available type.
+#        else
+    constexpr size_t BIT_SIZE =
+        1021; // A large prime number to ensure we get a unique bit-width that won't be optimized in
+              // a way that breaks our assumptions.
+#        endif
+    constexpr _BitInt(BIT_SIZE) dummy =
+        0x0wb; // Use an excessively wide integer type to ensure this function can never be called
+    // Instead of a direct cast, we use an intermediate void pointer
     // to "bleach" the type before forcing it into nullptr_t.
     // This satisfies the semantic analyzer because any pointer can cast to void*.
-    void* identity_bleach = (void*)(uintptr_t)dummy;
-    return *(nullptr_t*)&identity_bleach;
+    void *identity_bleach = (void *)(uintptr_t)dummy;
+    return *(nullptr_t *)&identity_bleach;
 }
 // NOLINTNEXTLINE(readability-identifier-naming)
 #        define culv_take_return_null(x)                                                           \
@@ -387,13 +442,24 @@ CULV_FORCE_INLINE nullptr_t culv_static_assert_failure(CULV_MAYBE_UNUSED nullptr
                 nullptr_t: culv_internal_impl_null(x),                                             \
                 default: culv_static_assert_failure(x))
 #    else // Fallback for pre-C23 compilers: just a simple cast, with a clear function name to
-          // indicate the intent. We can't enforce at compile time that only null pointer constants
-          // are accepted, but we can at least provide a marker to indicate the intent.
+          // indicate the intent. We can't enforce at compile time that only null pointer
+          // constants are accepted, but we can at least provide a marker to indicate the
+          // intent.
+#        if !defined(UNSAFE_NULLPTR)
 CULV_MAYBE_UNUSED CULV_NODISCARD static CULV_FORCE_INLINE void *
 culv_take_return_null(CULV_MAYBE_UNUSED const volatile void *ptr) {
     return (void *)(ptr); // Identity transformation on the null-set, with volatile to prevent
                           // dead-code elimination in strict-aliasing scenarios.
 }
+#        else
+CULV_MAYBE_UNUSED CULV_NODISCARD static CULV_FORCE_INLINE void *
+culv_take_return_null(CULV_MAYBE_UNUSED const void *ptr) {
+    return (void *)(ptr); // Identity transformation on the null-set: the cast only drops the
+                          // const qualifier. This variant is compiled when UNSAFE_NULLPTR is
+                          // defined and takes the parameter without volatile qualification.
+                          // NOTE(review): nothing here checks that ptr is actually null.
+}
+#        endif
 #    endif
 #elif defined(__ZIG__)
 /// @param T: The target pointer type.
@@ -476,8 +542,8 @@ static_assert(current_year() <= CULV_SAFETY_EPOCH,
 template <typename T>
 // NOLINTNEXTLINE(readability-identifier-naming)
 struct Void {
-    // value is true if T is void (ignoring const/volatile), false otherwise
-    static constexpr bool value = std::is_same_v<std::remove_cv_t<T>, std::nullptr_t>;
+    // Strip references and cv-qualifiers, or lvalues such as test_array[i] fail the enable_if
+    static constexpr bool value = std::is_same_v<std::remove_cvref_t<T>, std::nullptr_t>;
 };
 
 /**
@@ -503,11 +569,58 @@ template <typename T, typename = std::enable_if_t<Void<T>::value>>
                            // treated as std::nullptr_t, allowing it to be used in contexts where a
                            // null pointer constant is expected without causing type errors.
 }
+// A helper function to validate that the null pointer identity transformation behaves as expected
+// at compile time. This function creates an array of null pointer constants and checks that
+// applying culv_take_return_null to each element returns nullptr as expected. This serves as a
+// sanity check to ensure that the function is working correctly and that it can be safely used in
+// contexts where a null pointer constant is expected. If this function returns false, it indicates
+// that there is a fundamental issue with the implementation of culv_take_return_null, and it needs
+// to be addressed before the library can be safely used.
+[[nodiscard]] constexpr bool internal_verify_null_state() noexcept {
+    std::nullptr_t probes[4] = {nullptr, nullptr, nullptr, nullptr};
+    bool all_null            = true; // stays true only while every probe comes back as nullptr
+    for (auto &probe : probes)
+        all_null = all_null && (culv_take_return_null(probe) == nullptr);
+    return all_null;
+}
+
+// helper function to validate that the return type of culv_take_return_null is indeed nullptr_t,
+// and that it behaves as expected when given a null pointer constant. This serves as a compile-time
+// check to ensure that our assumptions about the function's behavior hold true, and that it can be
+// safely used in contexts where a null pointer constant is expected. If this static_assert fails,
+// it indicates that there is a fundamental issue with the implementation of culv_take_return_null,
+// and it needs to be addressed before the library can be safely used.
+[[nodiscard]] [[maybe_unused]] constexpr bool validate_culv_take_return_null() noexcept {
+    // Number of null lvalues pushed through culv_take_return_null. The count is arbitrary: every
+    // element holds the same value, so a larger array does not add coverage.
+    constexpr size_t test_size = 16;
+    // Only the first element is spelled out; the remaining ones are value-initialized, which for
+    // std::nullptr_t also yields nullptr.
+    std::nullptr_t test_array[test_size] = {nullptr};
+
+    for (size_t i = 0; i < test_size; ++i) {
+        if (culv_take_return_null(test_array[i]) != nullptr)
+            return false;
+    }
+
+    // Also run the range-for variant of the same check. internal_verify_null_state() takes no
+    // arguments and keeps no state, so one constant evaluation is sufficient; repeating the call
+    // cannot change the result.
+    constexpr bool range_for_ok = internal_verify_null_state();
+
+    // True only when both the indexed loop above and the range-for helper saw nothing but
+    // nullptr.
+    return range_for_ok;
+}
 
 // Verify that the function behaves as expected at compile time. If this fails, the logic is broken
 // and we need to fix it before proceeding.
 static_assert(culv_take_return_null(nullptr) ==
                   (static_cast<void>(0), nullptr), // Identity transformation on the null-set
               "culv_take_return_null does not return nullptr as expected!");
+static_assert(
+    validate_culv_take_return_null(),
+    "culv_take_return_null failed validation! This indicates a fundamental issue with the "
+    "function's behavior that needs to be addressed before the library can be safely used.");
 } // namespace
 #endif                     // __cplusplus
diff --git a/src/culverin/culverin_shadow_sync.cpp b/src/culverin/culverin_shadow_sync.cpp
index 99363322..4fea0baf 100644
--- a/src/culverin/culverin_shadow_sync.cpp
+++ b/src/culverin/culverin_shadow_sync.cpp
@@ -10,7 +10,7 @@
 static_assert(sizeof(PosStride) == sizeof(JPH_Real) * 4, "PosStride size mismatch");
 static_assert(sizeof(AuxStride) == sizeof(float) * 4, "AuxStride size mismatch");
 
-static constexpr int BATCH_SIZE = 32;
+static constexpr int BATCH_SIZE = 128;
 
 // Safe C++ wrapper for our worklist so we don't use opaque C pointers internally
 namespace {
@@ -39,16 +39,6 @@ CULV_FORCE_INLINE void process_full_batch(PhysicsWorldObject *self,
 #    pragma GCC unroll 4
 #endif
     for (uint32_t j = 0; j < BATCH_SIZE; j++) {
-        // [OPTIMIZATION]: Lookahead prefetch destination addresses for scatter-writes.
-        // Mitigates L1/L2 cache misses when dense_idx is highly randomized.
-        if (j + 4 < BATCH_SIZE) {
-            uint32_t future_D = worklist[j + 4].dense_idx;
-            CULV_PREFETCH_WRITE(&s_pos[future_D]);
-            CULV_PREFETCH_WRITE(&s_ppos[future_D]);
-            CULV_PREFETCH_WRITE(&s_rot[future_D]);
-            CULV_PREFETCH_WRITE(&s_prot[future_D]);
-        }
-
         uint32_t D = worklist[j].dense_idx;
 
         // Native C++ Pointer - GUARANTEED SAFE
@@ -65,12 +55,22 @@ CULV_FORCE_INLINE void process_full_batch(PhysicsWorldObject *self,
         JPH::Vec4(b->GetCenterOfMassPosition(), 0.0f)
             .StoreFloat4(reinterpret_cast<JPH::Float4 *>(&s_pos[D]));
 #else
-        // Keep scalar path for double precision
-        JPH::RVec3 p = b->GetCenterOfMassPosition();
-        s_pos[D].x   = p.GetX();
-        s_pos[D].y   = p.GetY();
-        s_pos[D].z   = p.GetZ();
-        s_pos[D].w   = 0.0;
+    JPH::RVec3 p = b->GetCenterOfMassPosition();
+    #if defined(JPH_USE_AVX)
+        __m256d v = _mm256_set_pd(0.0, p.GetZ(), p.GetY(), p.GetX());
+        _mm256_store_pd(reinterpret_cast<double*>(&s_pos[D]), v);
+    #elif defined(JPH_USE_NEON)
+        // NEON is 128-bit only, so two 64-bit stores
+        float64x2_t lo = vsetq_lane_f64(p.GetY(), vdupq_n_f64(p.GetX()), 1);
+        float64x2_t hi = vsetq_lane_f64(0.0,      vdupq_n_f64(p.GetZ()), 1);
+        vst1q_f64(reinterpret_cast<double*>(&s_pos[D]),     lo);
+        vst1q_f64(reinterpret_cast<double*>(&s_pos[D]) + 2, hi);
+    #else
+        s_pos[D].x = p.GetX();
+        s_pos[D].y = p.GetY();
+        s_pos[D].z = p.GetZ();
+        s_pos[D].w = 0.0;
+    #endif
 #endif
         // [OPTIMIZATION]: 128-bit SIMD Store Rotations (X, Y, Z, W)
         b->GetRotation().GetXYZW().StoreFloat4(reinterpret_cast<JPH::Float4 *>(&s_rot[D]));
@@ -119,12 +119,21 @@ CULV_FORCE_INLINE void process_partial_batch(PhysicsWorldObject *self,
         JPH::Vec4(b->GetCenterOfMassPosition(), 0.0f)
             .StoreFloat4(reinterpret_cast<JPH::Float4 *>(&s_pos[D]));
 #else
-        // Keep scalar path for double precision
-        JPH::RVec3 p = b->GetCenterOfMassPosition();
-        s_pos[D].x   = p.GetX();
-        s_pos[D].y   = p.GetY();
-        s_pos[D].z   = p.GetZ();
-        s_pos[D].w   = 0.0;
+    JPH::RVec3 p = b->GetCenterOfMassPosition();
+    #if defined(JPH_USE_AVX)
+        __m256d v = _mm256_set_pd(0.0, p.GetZ(), p.GetY(), p.GetX());
+        _mm256_store_pd(reinterpret_cast<double*>(&s_pos[D]), v);
+    #elif defined(JPH_USE_NEON)
+        float64x2_t lo = vsetq_lane_f64(p.GetY(), vdupq_n_f64(p.GetX()), 1);
+        float64x2_t hi = vsetq_lane_f64(0.0,      vdupq_n_f64(p.GetZ()), 1);
+        vst1q_f64(reinterpret_cast<double*>(&s_pos[D]),     lo);
+        vst1q_f64(reinterpret_cast<double*>(&s_pos[D]) + 2, hi);
+    #else
+        s_pos[D].x = p.GetX();
+        s_pos[D].y = p.GetY();
+        s_pos[D].z = p.GetZ();
+        s_pos[D].w = 0.0;
+    #endif
 #endif
 
         b->GetRotation().GetXYZW().StoreFloat4(reinterpret_cast<JPH::Float4 *>(&s_rot[D]));
@@ -183,11 +192,15 @@ extern "C" void culverin_sync_shadow_buffers(PhysicsWorldObject *self) {
         return;
     }
 
+    // Static variables persist in memory across function calls
+    CULV_MAYBE_UNUSED static CulvStat sync_stats = { .total_cycles = 0, .min_cycles = 0xFFFFFFFFFFFFFFFFULL, .max_cycles = 0, .count = 0 };
+
     CULV_PROFILE_BEGIN(sync);
 
     const uint32_t *CULV_RESTRICT s2d = self->slot_to_dense;
+    auto *CULV_RESTRICT s_pos = (PosStride *)self->positions;
+    auto *CULV_RESTRICT s_rot = (AuxStride *)self->rotations;
 
-    // Stack allocated worklist (fits in L1 cache comfortably)
     alignas(MEMORY_ALIGNMENT_SIZE) CppSyncWorkItem worklist[BATCH_SIZE];
     uint32_t work_ptr = 0;
 
@@ -196,12 +209,7 @@ extern "C" void culverin_sync_shadow_buffers(PhysicsWorldObject *self) {
             JPH_PhysicsSystem_GetBodyLockInterfaceNoLock(sys_c));
 
     for (uint32_t i = 0; i < active_count; i++) {
-        if (i + 4 < active_count) {
-            const void *next_id_ptr = &active_ids[i + 4];
-            CULV_PREFETCH(next_id_ptr);
-        }
-        // Post-step, no Jolt jobs running — NoLock interface is safe and eliminates
-        // per-body call overhead. Lock interface hoisted once above the loop.
+        // Post-step, no Jolt jobs running — NoLock interface is safe
         const JPH::Body *b = lock_iface->TryGetBody(JPH::BodyID(active_ids[i]));
         if (UNLIKELY(b == nullptr)) {
             continue;
@@ -211,23 +219,31 @@ extern "C" void culverin_sync_shadow_buffers(PhysicsWorldObject *self) {
         auto slot       = (uint32_t)(handle & HANDLE_INDEX_MASK);
         auto gen        = (uint32_t)(handle >> HANDLE_INDEX_BITS);
 
-        // 1. Calculate the bounds check (0 if in bounds, non-zero if out)
-        auto out_of_bounds = (uint32_t)(slot >= self->slot_capacity);
-
-        if (UNLIKELY(out_of_bounds)) {
-            continue;
-        }
-        // 2. Bitwise OR the conditions
-        if (UNLIKELY((self->generations[slot] ^ gen) | (self->slot_states[slot] ^ SLOT_ALIVE))) {
-            continue;
-        }
-
-        // Now the "Hot Path" is flat and easy to read
+        // --- BRANCHLESS VALIDATION ---
+        // Force the slot into a safe range to prevent segfaults on read
+        uint32_t safe_slot = (slot < self->slot_capacity) ? slot : 0;
+        
+        // Bitwise OR all failure conditions. If 'bad' is > 0, the body is invalid.
+        uint32_t bad = static_cast<uint32_t>(slot >= self->slot_capacity) | 
+                       (self->generations[safe_slot] ^ gen) | 
+                       (self->slot_states[safe_slot] ^ SLOT_ALIVE);
+
+        // Fetch dense index safely
+        uint32_t d_idx = s2d[safe_slot];
+
+        // --- DEEP PREFETCHING ---
+        // Ask the CPU to fetch the destination memory NOW. 
+        // By the time 'process_full_batch' is called, this memory will be waiting in L1 cache.
+        CULV_PREFETCH_WRITE(&s_pos[d_idx]);
+        CULV_PREFETCH_WRITE(&s_rot[d_idx]);
+
+        // Always write to the worklist (safe because it's local stack memory)
         CULV_ASSUME(work_ptr < BATCH_SIZE);
-        worklist[work_ptr].body      = b;
-        worklist[work_ptr].dense_idx = s2d[slot];
-        work_ptr++;
-        synced_count++;
+        uint32_t is_valid = static_cast<uint32_t>(bad == 0);
+        worklist[work_ptr].body      = (is_valid != 0u) ? b : worklist[work_ptr].body;
+        worklist[work_ptr].dense_idx = (is_valid != 0u) ? d_idx : worklist[work_ptr].dense_idx;
+        work_ptr += is_valid;
+        synced_count += is_valid;
 
         if (work_ptr == BATCH_SIZE) {
             process_full_batch(self, worklist);
@@ -239,5 +255,15 @@ extern "C" void culverin_sync_shadow_buffers(PhysicsWorldObject *self) {
     if (work_ptr > 0) {
         process_partial_batch(self, worklist, work_ptr);
     }
-    CULV_PROFILE_END(sync, "Sync", synced_count);
+    CULV_PROFILE_ACCUMULATE(sync, &sync_stats);
+    #ifdef CULVERIN_PROFILE_SYNC
+    // Report the average and the worst case once 50 samples have been accumulated.
+    if (sync_stats.count >= 50) {
+        fprintf(stderr, "[culverin] Sync Stat Avg: %" PRIu64 " | Max: %" PRIu64 "\n",
+                sync_stats.total_cycles / sync_stats.count, sync_stats.max_cycles);
+        // Reset for the next window. Brace-initialization is standard C++; the C compound
+        // literal form `(CulvStat){...}` is only a compiler extension in a .cpp file.
+        sync_stats = CulvStat{0, 0xFFFFFFFFFFFFFFFFULL, 0, 0};
+    }
+    #endif
 }
\ No newline at end of file
diff --git a/tests/benchmark.py b/tests/benchmark.py
index e67d7196..54972a71 100644
--- a/tests/benchmark.py
+++ b/tests/benchmark.py
@@ -42,81 +42,159 @@ def run_leak_test(iterations=50000):
         print("✅ SUCCESS: Memory is stable")
 
 
-def run_threading_benchmark(duration=5.0, num_bodies=5000):
-    print(f"\n=== CULVERIN MEGA-BATCH THREAD STRESS TEST ===")
-    print(f"Simulating {num_bodies} bodies across multiple cores for {duration}s...")
+def run_threading_benchmark(duration=5.0, num_bodies=500):
+    print(f"\n=== CULVERIN REALISTIC SIMULATION BENCHMARK ===")
+    print(f"Simulating {num_bodies} active dynamic bodies across multiple cores for {duration}s...")
     
-    world = culverin.PhysicsWorld(settings={"max_bodies": num_bodies + 5000, "max_pairs": num_bodies * 2})
+    # 1. SETUP: World with plenty of headroom to avoid RuntimeErrors
+    world = culverin.PhysicsWorld(settings={
+        "max_bodies": num_bodies + 2000, 
+        "max_pairs": num_bodies * 8
+    })
     
-    # Pre-populate
-    rand_pos = np.random.uniform(-500, 500, (num_bodies, 3)).astype(np.float32)
-    rand_pos[:, 1] += 500
+    # Create static floor (Giant Box)
+    world.create_body(
+        pos=(0, -5, 0), size=(500, 1, 500), 
+        shape=culverin.SHAPE_BOX, motion=culverin.MOTION_STATIC
+    )
     
+    # Create Dynamic Grid (Spawned in the air so they crash down)
+    pos_list = []
+    grid_size = int(np.cbrt(num_bodies)) + 1
+    spacing = 1.5
+    for x in range(grid_size):
+        for y in range(grid_size):
+            for z in range(grid_size):
+                if len(pos_list) < num_bodies:
+                    pos_list.append((x * spacing - 10, y * spacing + 10, z * spacing - 10))
+            
     handles_raw = world.create_bodies_batch(
-        positions=rand_pos.tolist(),
-        sizes=[[0.5, 0.5, 0.5]] * num_bodies,
+        positions=pos_list,
+        sizes=[[0.5, 0.5, 0.5]] * len(pos_list),
         shape_type=culverin.SHAPE_BOX,
         motion_type=culverin.MOTION_DYNAMIC
     )
+    # Store handles in a mutable numpy array for thread-safe-ish updating
     handles = np.array(handles_raw, dtype=np.uint64)
-    world.step(0)
+    world.step(0) # Initial push to BroadPhase
     
     # Thread States
     running = True
-    stats = {"steps": 0, "queries": 0, "mutations": 0}
+    stats = {"steps": 0, "rays": 0, "contacts": 0, "resets": 0, "mutations": 0}
 
+    # --- THREAD 1: THE CORE STEPPER ---
     def worker_stepper():
         while running:
-            world.step(1 / 60.0)
-            stats["steps"] += 1
+            try:
+                # Run as fast as the CPU allows
+                world.step(1.0 / 60.0)
+                stats["steps"] += 1
+            except RuntimeError:
+                # Concurrent step/lock failure - just skip this loop
+                pass 
 
-    def worker_querier():
-        batch_size = 1000
+    # --- THREAD 2: SENSORS (RAYCASTS) ---
+    def worker_sensors():
+        batch_size = 500
         starts = array.array('f', [0.0] * (batch_size * 3))
-        dirs = array.array('f', [0.0, -100.0, 0.0] * batch_size)
+        dirs = array.array('f', [0.0, -1.0, 0.0] * batch_size)
         while running:
-            starts[1] = random.uniform(200, 500)
-            world.raycast_batch(starts=starts, directions=dirs, max_dist=1000.0)
-            stats["queries"] += batch_size
+            starts[1] = random.uniform(20, 50) # Randomize height
+            world.raycast_batch(starts=starts, directions=dirs, max_dist=100.0)
+            stats["rays"] += batch_size
+            time.sleep(0.01) # ~100Hz
 
-    def worker_hammer():
+    # --- THREAD 3: GAMEPLAY LOGIC (ANTI-SLEEP & MEMORYVIEW) ---
+    def worker_housekeeper():
+        # Wrap the raw memoryview in a NumPy array (rows of 4; float64 assumes a double-precision build)
+        pos_data = np.frombuffer(world.positions, dtype=np.float64).reshape(-1, 4)
+        
         while running:
-            # Recreate 10 bodies per loop
-            v_idx = [random.randint(0, num_bodies - 1) for _ in range(10)]
-            victims = [int(handles[i]) for i in v_idx]
+            # Find fallen bodies. NOTE(review): confirm row i of pos_data belongs to handles[i]
+            fallen_indices = np.where(pos_data[:num_bodies, 1] < 0.0)[0]
             
-            world.destroy_bodies_batch(handles=victims)
-            new_h = world.create_bodies_batch(
-                positions=np.random.uniform(-50, 50, (10, 3)).tolist(),
-                sizes=[[1,1,1]]*10,
-                shape_type=culverin.SHAPE_SPHERE
-            )
-            for i, idx in enumerate(v_idx): handles[idx] = new_h[i]
-            stats["mutations"] += 10
-            time.sleep(0.01)
+            for idx in fallen_indices:
+                h = int(handles[idx])
+                
+                # ADDED: Check if the handle is still valid before calling C
+                if world.is_alive(h):
+                    try:
+                        world.set_position(h, random.uniform(-10, 10), 20.0, random.uniform(-10, 10))
+                        world.set_linear_velocity(h, 0, 0, 0)
+                        stats["resets"] += 1
+                    except ValueError:
+                        # Fallback: if it was destroyed between the is_alive check 
+                        # and the set_position call, just ignore it.
+                        pass
+                
+            time.sleep(0.5)
 
+    # --- THREAD 4: THE MUTATOR (CONTROLLED HAMMER) ---
+    def worker_mutator():
+        """Destroy and re-create a small batch of bodies, at most ~20 times per second."""
+        # Distinct indices: a repeated index would destroy one handle twice and overwrite the
+        # handle of one replacement body, which would then never be destroyed again.
+        swap_count = min(5, num_bodies)
+        while running:
+            try:
+                idx_to_replace = random.sample(range(num_bodies), swap_count)
+                victims = [int(handles[i]) for i in idx_to_replace]
+                world.destroy_bodies_batch(handles=victims)
+                new_h = world.create_bodies_batch(
+                    positions=[(0, 40, 0)] * swap_count,
+                    sizes=[[1, 1, 1]] * swap_count,
+                    shape_type=culverin.SHAPE_SPHERE,
+                    motion_type=culverin.MOTION_DYNAMIC
+                )
+                for idx, h in zip(idx_to_replace, new_h):
+                    handles[idx] = h
+                    world.activate(int(h))  # Force awake
+                stats["mutations"] += swap_count
+            except RuntimeError:
+                # Lock contention or pool limit hit: skip this round and retry on the next
+                # tick. NOTE(review): handles may keep destroyed ids if creation fails.
+                pass
+
+            time.sleep(0.05)  # Run 20 times a second
+
+    # Start Threads
     threads = [
         threading.Thread(target=worker_stepper, name="Stepper"),
-        threading.Thread(target=worker_querier, name="Querier1"),
-        threading.Thread(target=worker_querier, name="Querier2"),
-        threading.Thread(target=worker_hammer, name="Hammer")
+        threading.Thread(target=worker_sensors, name="Sensors"),
+        threading.Thread(target=worker_housekeeper, name="Housekeeper"),
+        threading.Thread(target=worker_mutator, name="Mutator")
     ]
     
     for t in threads: t.start()
     
+    # Monitoring Loop
     start_t = time.time()
     try:
         while time.time() - start_t < duration:
             time.sleep(1.0)
-            print(f"[@ {time.time()-start_t:.1f}s] Steps: {stats['steps']} | Rays: {stats['queries']} | Mutations: {stats['mutations']}")
+            # Fetch raw events to clear the buffer (simulate event handling)
+            contacts = world.get_contact_events_raw()
+            if contacts:
+                stats["contacts"] += 1
+                
+            print(f"[@ {time.time()-start_t:.1f}s] "
+                  f"Steps: {stats['steps']} | "
+                  f"Rays: {stats['rays']} | "
+                  f"Mutations: {stats['mutations']} | "
+                  f"Resets: {stats['resets']}")
+            
             if stats['steps'] == 0:
-                print("❌ CRITICAL: Physics Thread is deadlocked")
+                print("❌ CRITICAL: Physics Thread is completely deadlocked")
                 break
     finally:
         running = False
-        for t in threads: t.join(timeout=2.0)
+        for t in threads: t.join(timeout=1.0)
         
-    print(f"✅ STRESS TEST COMPLETE: {stats['steps']} steps, {stats['queries']} rays.")
+    fps = stats['steps'] / duration
+    print(f"\n✅ TEST COMPLETE")
+    print(f"Final Performance: {fps:.2f} FPS")
+    print(f"Total Steps: {stats['steps']}")
+    print(f"Total Raycasts: {stats['rays']}")
 
 def run_churn_test(duration=10.0):
     # There is a known memory issue. will investigate...