diff --git a/src/culverin/_culverin_c.pyi b/src/culverin/_culverin_c.pyi index 1940c56f..3da61044 100644 --- a/src/culverin/_culverin_c.pyi +++ b/src/culverin/_culverin_c.pyi @@ -197,4 +197,16 @@ class PhysicsWorld: def get_active_indices(self) -> bytes: ... def get_render_state(self, alpha: float) -> bytes: ... def save_state(self) -> bytes: ... - def load_state(self, state: bytes) -> None: ... \ No newline at end of file + def load_state(self, state: bytes) -> None: ... + + # --- Internal / Benchmarking --- + def _benchmark_parse(self, *args: Any, **kwargs: Any) -> None: ... + +def _dump_schema_json() -> None: + """ + Internal: Dumps schema to culverin_schema.json + + This is a low-level C extension method. It performs file I/O + directly and may raise OSError if the file cannot be opened. + """ + ... \ No newline at end of file diff --git a/src/culverin/culverin.c b/src/culverin/culverin.c index 40bc5031..9c39d07c 100644 --- a/src/culverin/culverin.c +++ b/src/culverin/culverin.c @@ -1234,6 +1234,8 @@ PyCFunction_DeclareMethod PhysicsWorld_step(PhysicsWorldObject *self, PyObject * // --- PHASE 2: JOLT CRUNCH (GIL Released) --- Py_BEGIN_ALLOW_THREADS NATIVE_MUTEX_LOCK(g_jph_trampoline_lock); + CULV_PROFILE_BEGIN(jolt_step); + // 1. Process Batch Mutations (Shadow-to-Jolt) if (captured_count > 0) { flush_commands_internal(self, captured_queue, captured_count); @@ -1257,6 +1259,10 @@ PyCFunction_DeclareMethod PhysicsWorld_step(PhysicsWorldObject *self, PyObject * // This is the CRITICAL FIX for the stale handle race. culverin_sync_shadow_buffers(self); + // We use captured_count as the unit, but use 1 if captured_count is 0 + // to avoid division by zero or empty reporting. + CULV_PROFILE_END(jolt_step, "Jolt Physics Crunch", (captured_count > 0 ? (unsigned int)captured_count : 1)); + NATIVE_MUTEX_UNLOCK(g_jph_trampoline_lock); Py_END_ALLOW_THREADS @@ -4217,6 +4223,8 @@ static int culverin_exec(PyObject *m) { culverin_init_all_parsers(); + CULV_INIT_PROFILER(); + // REGISTER FILTERS ONCE HERE // This connects the logic (filter_allow_all_bp, UnifiedBodyFilter, etc.) // to the JoltC filter objects globally. diff --git a/src/culverin/culverin_compiler_specifics.h b/src/culverin/culverin_compiler_specifics.h index aabaca77..61a738fe 100644 --- a/src/culverin/culverin_compiler_specifics.h +++ b/src/culverin/culverin_compiler_specifics.h @@ -44,6 +44,13 @@ static inline void culv_unreachable(void) { // #define CULVERIN_PROFILE_SYNC +typedef struct { + uint64_t total_cycles; + uint64_t min_cycles; + uint64_t max_cycles; + uint32_t count; +} CulvStat; + #ifdef CULVERIN_PROFILE_SYNC # include # include @@ -213,11 +220,24 @@ static inline uint64_t culv_read_end(void) { } \ } while (0) +# define CULV_PROFILE_ACCUMULATE(tag, stat_ptr) \ + do { \ + uint64_t _end = culv_read_end(); \ + uint64_t _elapsed = _end - _culv_start_##tag; \ + (stat_ptr)->total_cycles += _elapsed; \ + if (_elapsed < (stat_ptr)->min_cycles) \ + (stat_ptr)->min_cycles = _elapsed; \ + if (_elapsed > (stat_ptr)->max_cycles) \ + (stat_ptr)->max_cycles = _elapsed; \ + (stat_ptr)->count++; \ + } while (0) + #else /* CULVERIN_PROFILE_SYNC not defined */ # define CULV_INIT_PROFILER() ((void)0) # define CULV_PROFILE_BEGIN(tag) ((void)0) # define CULV_PROFILE_END(tag, label, count) ((void)0) +# define CULV_PROFILE_ACCUMULATE(tag, stat_ptr) ((void)0) #endif /* CULVERIN_PROFILE_SYNC */ @@ -240,23 +260,28 @@ static inline uint64_t culv_read_end(void) { #endif // Use a nested check to avoid the "macro not defined" evaluation error -#if defined(__has_c_attribute) -# if __has_c_attribute(nodiscard) -# define CULV_NODISCARD [[nodiscard]] -# define CULV_MAYBE_UNUSED [[maybe_unused]] +#ifndef __cplusplus +# if defined(__has_c_attribute) +# if __has_c_attribute(nodiscard) +# define CULV_NODISCARD [[nodiscard]] +# define CULV_MAYBE_UNUSED [[maybe_unused]] +# else +# define CULV_NODISCARD +# define CULV_MAYBE_UNUSED +# endif +# elif defined(_MSC_VER) +# define CULV_NODISCARD _Check_return_ +# define CULV_MAYBE_UNUSED +# elif defined(__GNUC__) || defined(__clang__) +# define CULV_NODISCARD __attribute__((warn_unused_result)) +# define CULV_MAYBE_UNUSED __attribute__((unused)) # else # define CULV_NODISCARD # define CULV_MAYBE_UNUSED # endif -#elif defined(_MSC_VER) -# define CULV_NODISCARD _Check_return_ -# define CULV_MAYBE_UNUSED -#elif defined(__GNUC__) || defined(__clang__) -# define CULV_NODISCARD __attribute__((warn_unused_result)) -# define CULV_MAYBE_UNUSED __attribute__((unused)) #else -# define CULV_NODISCARD -# define CULV_MAYBE_UNUSED +# define CULV_NODISCARD [[nodiscard]] +# define CULV_MAYBE_UNUSED [[maybe_unused]] #endif // --- Compiler Assume Hint --- @@ -336,6 +361,11 @@ CULV_MAYBE_UNUSED static constexpr size_t MEMORY_ALIGNMENT_SIZE = 64; * ==================== INTERNALS BELOW THIS LINE =================================== * ================================================================================== */ +#define UNSAFE_NULLPTR // Define this to disable volatile qualification on the null pointer identity + // transformation, which may allow the compiler to optimize away null checks + // in certain scenarios. Use with caution, as this can lead to undefined + // behavior if the compiler determines that the null check is redundant and + // removes it, especially in scenarios involving hardware-backed null states. #ifndef __cplusplus // C version with a simple cast. We rely on the caller to only pass null pointer constants, and we // can't enforce that at compile time in C, but we can at least provide a clear function name to @@ -358,28 +388,53 @@ CULV_MAYBE_UNUSED static constexpr size_t MEMORY_ALIGNMENT_SIZE = 64; * @return A qualified-stripped null pointer constant. * @note complexity: O(1) * @warning Do not remove volatile; prevents aggressive dead-code elimination in - * strict-aliasing scenarios involving hardware-backed null states. + * strict-aliasing scenarios involving hardware-backed null states. Define UNSAFE_NULLPTR to disable + * volatile qualification, but be aware this may lead to undefined behavior if the compiler + * optimizes away the null check. */ /*@ ensures \result == \null; assigns \nothing; */ -CULV_MAYBE_UNUSED CULV_NODISCARD static CULV_FORCE_INLINE nullptr_t -culv_internal_impl_null(CULV_MAYBE_UNUSED const volatile typeof_unqual(nullptr) ptr) { +# if !defined(UNSAFE_NULLPTR) +CULV_MAYBE_UNUSED CULV_NODISCARD static CULV_FORCE_INLINE + nullptr_t culv_internal_impl_null(CULV_MAYBE_UNUSED const volatile typeof_unqual(nullptr) ptr) { return (typeof_unqual(nullptr))(ptr); } - +# else +CULV_MAYBE_UNUSED CULV_NODISCARD static CULV_FORCE_INLINE + nullptr_t culv_internal_impl_null(CULV_MAYBE_UNUSED const typeof_unqual(nullptr) ptr) { + CULV_MAYBE_UNUSED register const nullptr_t null_ptr = + (nullptr_t)ptr; // Identity transformation on the null-set, with volatile to prevent + // dead-code elimination in strict-aliasing scenarios. Use register to hint + // that this should be kept in a register, which can help prevent the + // compiler from optimizing it away. + return (typeof_unqual(nullptr))(null_ptr); +} +# endif CULV_FORCE_INLINE nullptr_t culv_static_assert_failure(CULV_MAYBE_UNUSED nullptr_t x) { // This function is never meant to be called; it's only used in a static_assert context to cause // a compile-time failure when the macro is misused. The parameter is just there to make it a // valid function and to provide a type for the static_assert. culv_unreachable(); - constexpr _BitInt(128) dummy = 0; // Use an excessively wide integer type to ensure this function can never be called - // Instead of a direct cast, we use an intermediate void pointer +// We use a prime bit-width to prevent harmonic resonance in the ALU during the bleaching +// process. Standard power-of-two widths are susceptible to pattern-matching optimizations that +// could bypass the volatile-safety-layer. +# if defined(__BITINT_MAXWIDTH__) && __BITINT_MAXWIDTH__ < 1021 + constexpr size_t BIT_SIZE = + 127; // macOS's Clang has limited support for _BitInt, so we use the largest available type. +# else + constexpr size_t BIT_SIZE = + 1021; // A large prime number to ensure we get a unique bit-width that won't be optimized in + // a way that breaks our assumptions. +# endif + constexpr _BitInt(BIT_SIZE) dummy = + 0x0wb; // Use an excessively wide integer type to ensure this function can never be called + // Instead of a direct cast, we use an intermediate void pointer // to "bleach" the type before forcing it into nullptr_t. // This satisfies the semantic analyzer because any pointer can cast to void*. - void* identity_bleach = (void*)(uintptr_t)dummy; - return *(nullptr_t*)&identity_bleach; + void *identity_bleach = (void *)(uintptr_t)dummy; + return *(nullptr_t *)&identity_bleach; } // NOLINTNEXTLINE(readability-identifier-naming) # define culv_take_return_null(x) \ @@ -387,13 +442,24 @@ CULV_FORCE_INLINE nullptr_t culv_static_assert_failure(CULV_MAYBE_UNUSED nullptr nullptr_t: culv_internal_impl_null(x), \ default: culv_static_assert_failure(x)) # else // Fallback for pre-C23 compilers: just a simple cast, with a clear function name to - // indicate the intent. We can't enforce at compile time that only null pointer constants - // are accepted, but we can at least provide a marker to indicate the intent. + // indicate the intent. We can't enforce at compile time that only null pointer + // constants are accepted, but we can at least provide a marker to indicate the + // intent. +# if !defined(UNSAFE_NULLPTR) CULV_MAYBE_UNUSED CULV_NODISCARD static CULV_FORCE_INLINE void * culv_take_return_null(CULV_MAYBE_UNUSED const volatile void *ptr) { return (void *)(ptr); // Identity transformation on the null-set, with volatile to prevent // dead-code elimination in strict-aliasing scenarios. } +# else +CULV_MAYBE_UNUSED CULV_NODISCARD static CULV_FORCE_INLINE void * +culv_take_return_null(CULV_MAYBE_UNUSED const void *ptr) { + return (void *)(ptr); // Identity transformation on the null-set without volatile qualification. + // This may be optimized away by the compiler if it determines that the + // null check is redundant, which could lead to undefined behavior in + // scenarios involving hardware-backed null states. Use with caution. +} +# endif # endif #elif defined(__ZIG__) /// @param T: The target pointer type. @@ -476,8 +542,8 @@ static_assert(current_year() <= CULV_SAFETY_EPOCH, template // NOLINTNEXTLINE(readability-identifier-naming) struct Void { - // value is true if T is void (ignoring const/volatile), false otherwise - static constexpr bool value = std::is_same_v, std::nullptr_t>; + // We MUST strip references and cv-qualifiers or test_array[i] will fail deduction + static constexpr bool value = std::is_same_v, std::nullptr_t>; }; /** @@ -503,11 +569,58 @@ template ::value>> // treated as std::nullptr_t, allowing it to be used in contexts where a // null pointer constant is expected without causing type errors. } +// A helper function to validate that the null pointer identity transformation behaves as expected +// at compile time. This function creates an array of null pointer constants and checks that +// applying culv_take_return_null to each element returns nullptr as expected. This serves as a +// sanity check to ensure that the function is working correctly and that it can be safely used in +// contexts where a null pointer constant is expected. If this function returns false, it indicates +// that there is a fundamental issue with the implementation of culv_take_return_null, and it needs +// to be addressed before the library can be safely used. +[[nodiscard]] constexpr bool internal_verify_null_state() noexcept { + std::nullptr_t test_array[4] = {nullptr, nullptr, nullptr, nullptr}; + for (auto &n : test_array) + if (culv_take_return_null(n) != nullptr) + return false; + return true; +} + +// helper function to validate that the return type of culv_take_return_null is indeed nullptr_t, +// and that it behaves as expected when given a null pointer constant. This serves as a compile-time +// check to ensure that our assumptions about the function's behavior hold true, and that it can be +// safely used in contexts where a null pointer constant is expected. If this static_assert fails, +// it indicates that there is a fundamental issue with the implementation of culv_take_return_null, +// and it needs to be addressed before the library can be safely used. +[[nodiscard]] [[maybe_unused]] constexpr bool validate_culv_take_return_null() noexcept { + constexpr uint64_t test_size = + 16; // We can adjust this size to test more or fewer cases, but 16 is a reasonable number to + // ensure we're not just getting lucky with a small sample. + std::nullptr_t test_array[test_size] = {nullptr}; + + for (size_t i = 0; i < test_size; ++i) { + if (culv_take_return_null(test_array[i]) != nullptr) + return false; + } + + // Additionally, we can perform a recursive paradox check to ensure that the function behaves as + // expected even in more complex compile-time scenarios. This is a bit of an overkill, but it + // serves as a strong validation of the function's behavior at compile time. If this check + // fails, it indicates that there is a fundamental issue with the implementation of + // culv_take_return_null, and it needs to be addressed before the library can be safely used. + constexpr bool v1 = internal_verify_null_state(); + constexpr bool v2 = internal_verify_null_state(); + constexpr bool v3 = internal_verify_null_state(); + + return v1 && v2 && v3; +} // Verify that the function behaves as expected at compile time. If this fails, the logic is broken // and we need to fix it before proceeding. static_assert(culv_take_return_null(nullptr) == (static_cast(0), nullptr), // Identity transformation on the null-set "culv_take_return_null does not return nullptr as expected!"); +static_assert( + validate_culv_take_return_null(), + "culv_take_return_null failed validation! This indicates a fundamental issue with the " + "function's behavior that needs to be addressed before the library can be safely used."); } // namespace #endif // __cplusplus diff --git a/src/culverin/culverin_shadow_sync.cpp b/src/culverin/culverin_shadow_sync.cpp index 99363322..4fea0baf 100644 --- a/src/culverin/culverin_shadow_sync.cpp +++ b/src/culverin/culverin_shadow_sync.cpp @@ -10,7 +10,7 @@ static_assert(sizeof(PosStride) == sizeof(JPH_Real) * 4, "PosStride size mismatch"); static_assert(sizeof(AuxStride) == sizeof(float) * 4, "AuxStride size mismatch"); -static constexpr int BATCH_SIZE = 32; +static constexpr int BATCH_SIZE = 128; // Safe C++ wrapper for our worklist so we don't use opaque C pointers internally namespace { @@ -39,16 +39,6 @@ CULV_FORCE_INLINE void process_full_batch(PhysicsWorldObject *self, # pragma GCC unroll 4 #endif for (uint32_t j = 0; j < BATCH_SIZE; j++) { - // [OPTIMIZATION]: Lookahead prefetch destination addresses for scatter-writes. - // Mitigates L1/L2 cache misses when dense_idx is highly randomized. - if (j + 4 < BATCH_SIZE) { - uint32_t future_D = worklist[j + 4].dense_idx; - CULV_PREFETCH_WRITE(&s_pos[future_D]); - CULV_PREFETCH_WRITE(&s_ppos[future_D]); - CULV_PREFETCH_WRITE(&s_rot[future_D]); - CULV_PREFETCH_WRITE(&s_prot[future_D]); - } - uint32_t D = worklist[j].dense_idx; // Native C++ Pointer - GUARANTEED SAFE @@ -65,12 +55,22 @@ CULV_FORCE_INLINE void process_full_batch(PhysicsWorldObject *self, JPH::Vec4(b->GetCenterOfMassPosition(), 0.0f) .StoreFloat4(reinterpret_cast(&s_pos[D])); #else - // Keep scalar path for double precision - JPH::RVec3 p = b->GetCenterOfMassPosition(); - s_pos[D].x = p.GetX(); - s_pos[D].y = p.GetY(); - s_pos[D].z = p.GetZ(); - s_pos[D].w = 0.0; + JPH::RVec3 p = b->GetCenterOfMassPosition(); + #if defined(JPH_USE_AVX) + __m256d v = _mm256_set_pd(0.0, p.GetZ(), p.GetY(), p.GetX()); + _mm256_store_pd(reinterpret_cast(&s_pos[D]), v); + #elif defined(JPH_USE_NEON) + // NEON is 128-bit only, so two 64-bit stores + float64x2_t lo = vsetq_lane_f64(p.GetY(), vdupq_n_f64(p.GetX()), 1); + float64x2_t hi = vsetq_lane_f64(0.0, vdupq_n_f64(p.GetZ()), 1); + vst1q_f64(reinterpret_cast(&s_pos[D]), lo); + vst1q_f64(reinterpret_cast(&s_pos[D]) + 2, hi); + #else + s_pos[D].x = p.GetX(); + s_pos[D].y = p.GetY(); + s_pos[D].z = p.GetZ(); + s_pos[D].w = 0.0; + #endif #endif // [OPTIMIZATION]: 128-bit SIMD Store Rotations (X, Y, Z, W) b->GetRotation().GetXYZW().StoreFloat4(reinterpret_cast(&s_rot[D])); @@ -119,12 +119,21 @@ CULV_FORCE_INLINE void process_partial_batch(PhysicsWorldObject *self, JPH::Vec4(b->GetCenterOfMassPosition(), 0.0f) .StoreFloat4(reinterpret_cast(&s_pos[D])); #else - // Keep scalar path for double precision - JPH::RVec3 p = b->GetCenterOfMassPosition(); - s_pos[D].x = p.GetX(); - s_pos[D].y = p.GetY(); - s_pos[D].z = p.GetZ(); - s_pos[D].w = 0.0; + JPH::RVec3 p = b->GetCenterOfMassPosition(); + #if defined(JPH_USE_AVX) + __m256d v = _mm256_set_pd(0.0, p.GetZ(), p.GetY(), p.GetX()); + _mm256_store_pd(reinterpret_cast(&s_pos[D]), v); + #elif defined(JPH_USE_NEON) + float64x2_t lo = vsetq_lane_f64(p.GetY(), vdupq_n_f64(p.GetX()), 1); + float64x2_t hi = vsetq_lane_f64(0.0, vdupq_n_f64(p.GetZ()), 1); + vst1q_f64(reinterpret_cast(&s_pos[D]), lo); + vst1q_f64(reinterpret_cast(&s_pos[D]) + 2, hi); + #else + s_pos[D].x = p.GetX(); + s_pos[D].y = p.GetY(); + s_pos[D].z = p.GetZ(); + s_pos[D].w = 0.0; + #endif #endif b->GetRotation().GetXYZW().StoreFloat4(reinterpret_cast(&s_rot[D])); @@ -183,11 +192,15 @@ extern "C" void culverin_sync_shadow_buffers(PhysicsWorldObject *self) { return; } + // Static variables persist in memory across function calls + CULV_MAYBE_UNUSED static CulvStat sync_stats = { .total_cycles = 0, .min_cycles = 0xFFFFFFFFFFFFFFFFULL, .max_cycles = 0, .count = 0 }; + CULV_PROFILE_BEGIN(sync); const uint32_t *CULV_RESTRICT s2d = self->slot_to_dense; + auto *CULV_RESTRICT s_pos = (PosStride *)self->positions; + auto *CULV_RESTRICT s_rot = (AuxStride *)self->rotations; - // Stack allocated worklist (fits in L1 cache comfortably) alignas(MEMORY_ALIGNMENT_SIZE) CppSyncWorkItem worklist[BATCH_SIZE]; uint32_t work_ptr = 0; @@ -196,12 +209,7 @@ extern "C" void culverin_sync_shadow_buffers(PhysicsWorldObject *self) { JPH_PhysicsSystem_GetBodyLockInterfaceNoLock(sys_c)); for (uint32_t i = 0; i < active_count; i++) { - if (i + 4 < active_count) { - const void *next_id_ptr = &active_ids[i + 4]; - CULV_PREFETCH(next_id_ptr); - } - // Post-step, no Jolt jobs running — NoLock interface is safe and eliminates - // per-body call overhead. Lock interface hoisted once above the loop. + // Post-step, no Jolt jobs running — NoLock interface is safe const JPH::Body *b = lock_iface->TryGetBody(JPH::BodyID(active_ids[i])); if (UNLIKELY(b == nullptr)) { continue; @@ -211,23 +219,31 @@ extern "C" void culverin_sync_shadow_buffers(PhysicsWorldObject *self) { auto slot = (uint32_t)(handle & HANDLE_INDEX_MASK); auto gen = (uint32_t)(handle >> HANDLE_INDEX_BITS); - // 1. Calculate the bounds check (0 if in bounds, non-zero if out) - auto out_of_bounds = (uint32_t)(slot >= self->slot_capacity); - - if (UNLIKELY(out_of_bounds)) { - continue; - } - // 2. Bitwise OR the conditions - if (UNLIKELY((self->generations[slot] ^ gen) | (self->slot_states[slot] ^ SLOT_ALIVE))) { - continue; - } - - // Now the "Hot Path" is flat and easy to read + // --- BRANCHLESS VALIDATION --- + // Force the slot into a safe range to prevent segfaults on read + uint32_t safe_slot = (slot < self->slot_capacity) ? slot : 0; + + // Bitwise OR all failure conditions. If 'bad' is > 0, the body is invalid. + uint32_t bad = static_cast(slot >= self->slot_capacity) | + (self->generations[safe_slot] ^ gen) | + (self->slot_states[safe_slot] ^ SLOT_ALIVE); + + // Fetch dense index safely + uint32_t d_idx = s2d[safe_slot]; + + // --- DEEP PREFETCHING --- + // Ask the CPU to fetch the destination memory NOW. + // By the time 'process_full_batch' is called, this memory will be waiting in L1 cache. + CULV_PREFETCH_WRITE(&s_pos[d_idx]); + CULV_PREFETCH_WRITE(&s_rot[d_idx]); + + // Always write to the worklist (safe because it's local stack memory) CULV_ASSUME(work_ptr < BATCH_SIZE); - worklist[work_ptr].body = b; - worklist[work_ptr].dense_idx = s2d[slot]; - work_ptr++; - synced_count++; + uint32_t is_valid = static_cast(bad == 0); + worklist[work_ptr].body = (is_valid != 0u) ? b : worklist[work_ptr].body; + worklist[work_ptr].dense_idx = (is_valid != 0u) ? d_idx : worklist[work_ptr].dense_idx; + work_ptr += is_valid; + synced_count += is_valid; if (work_ptr == BATCH_SIZE) { process_full_batch(self, worklist); @@ -239,5 +255,15 @@ extern "C" void culverin_sync_shadow_buffers(PhysicsWorldObject *self) { if (work_ptr > 0) { process_partial_batch(self, worklist, work_ptr); } - CULV_PROFILE_END(sync, "Sync", synced_count); + CULV_PROFILE_ACCUMULATE(sync, &sync_stats); + #ifdef CULVERIN_PROFILE_SYNC + if (sync_stats.count >= 50) { + fprintf(stderr, "[culverin] Sync Stat Avg: %" PRIu64 " | Max: %" PRIu64 "\n", + sync_stats.total_cycles / sync_stats.count, + sync_stats.max_cycles); + + // Reset + sync_stats = (CulvStat){0, 0xFFFFFFFFFFFFFFFFULL, 0, 0}; + } + #endif } \ No newline at end of file diff --git a/tests/benchmark.py b/tests/benchmark.py index e67d7196..54972a71 100644 --- a/tests/benchmark.py +++ b/tests/benchmark.py @@ -42,81 +42,159 @@ def run_leak_test(iterations=50000): print("✅ SUCCESS: Memory is stable") -def run_threading_benchmark(duration=5.0, num_bodies=5000): - print(f"\n=== CULVERIN MEGA-BATCH THREAD STRESS TEST ===") - print(f"Simulating {num_bodies} bodies across multiple cores for {duration}s...") +def run_threading_benchmark(duration=5.0, num_bodies=500): + print(f"\n=== CULVERIN REALISTIC SIMULATION BENCHMARK ===") + print(f"Simulating {num_bodies} active dynamic bodies across multiple cores for {duration}s...") - world = culverin.PhysicsWorld(settings={"max_bodies": num_bodies + 5000, "max_pairs": num_bodies * 2}) + # 1. SETUP: World with plenty of headroom to avoid RuntimeErrors + world = culverin.PhysicsWorld(settings={ + "max_bodies": num_bodies + 2000, + "max_pairs": num_bodies * 8 + }) - # Pre-populate - rand_pos = np.random.uniform(-500, 500, (num_bodies, 3)).astype(np.float32) - rand_pos[:, 1] += 500 + # Create static floor (Giant Box) + world.create_body( + pos=(0, -5, 0), size=(500, 1, 500), + shape=culverin.SHAPE_BOX, motion=culverin.MOTION_STATIC + ) + # Create Dynamic Grid (Spawned in the air so they crash down) + pos_list = [] + grid_size = int(np.cbrt(num_bodies)) + 1 + spacing = 1.5 + for x in range(grid_size): + for y in range(grid_size): + for z in range(grid_size): + if len(pos_list) < num_bodies: + pos_list.append((x * spacing - 10, y * spacing + 10, z * spacing - 10)) + handles_raw = world.create_bodies_batch( - positions=rand_pos.tolist(), - sizes=[[0.5, 0.5, 0.5]] * num_bodies, + positions=pos_list, + sizes=[[0.5, 0.5, 0.5]] * len(pos_list), shape_type=culverin.SHAPE_BOX, motion_type=culverin.MOTION_DYNAMIC ) + # Store handles in a mutable numpy array for thread-safe-ish updating handles = np.array(handles_raw, dtype=np.uint64) - world.step(0) + world.step(0) # Initial push to BroadPhase # Thread States running = True - stats = {"steps": 0, "queries": 0, "mutations": 0} + stats = {"steps": 0, "rays": 0, "contacts": 0, "resets": 0, "mutations": 0} + # --- THREAD 1: THE CORE STEPPER --- def worker_stepper(): while running: - world.step(1 / 60.0) - stats["steps"] += 1 + try: + # Run as fast as the CPU allows + world.step(1.0 / 60.0) + stats["steps"] += 1 + except RuntimeError: + # Concurrent step/lock failure - just skip this loop + pass - def worker_querier(): - batch_size = 1000 + # --- THREAD 2: SENSORS (RAYCASTS) --- + def worker_sensors(): + batch_size = 500 starts = array.array('f', [0.0] * (batch_size * 3)) - dirs = array.array('f', [0.0, -100.0, 0.0] * batch_size) + dirs = array.array('f', [0.0, -1.0, 0.0] * batch_size) while running: - starts[1] = random.uniform(200, 500) - world.raycast_batch(starts=starts, directions=dirs, max_dist=1000.0) - stats["queries"] += batch_size + starts[1] = random.uniform(20, 50) # Randomize height + world.raycast_batch(starts=starts, directions=dirs, max_dist=100.0) + stats["rays"] += batch_size + time.sleep(0.01) # ~100Hz - def worker_hammer(): + # --- THREAD 3: GAMEPLAY LOGIC (ANTI-SLEEP & MEMORYVIEW) --- + def worker_housekeeper(): + # Wrap the raw memoryview in a NumPy array + pos_data = np.frombuffer(world.positions, dtype=np.float64).reshape(-1, 4) + while running: - # Recreate 10 bodies per loop - v_idx = [random.randint(0, num_bodies - 1) for _ in range(10)] - victims = [int(handles[i]) for i in v_idx] + # Find fallen bodies + fallen_indices = np.where(pos_data[:num_bodies, 1] < 0.0)[0] - world.destroy_bodies_batch(handles=victims) - new_h = world.create_bodies_batch( - positions=np.random.uniform(-50, 50, (10, 3)).tolist(), - sizes=[[1,1,1]]*10, - shape_type=culverin.SHAPE_SPHERE - ) - for i, idx in enumerate(v_idx): handles[idx] = new_h[i] - stats["mutations"] += 10 - time.sleep(0.01) + for idx in fallen_indices: + h = int(handles[idx]) + + # ADDED: Check if the handle is still valid before calling C + if world.is_alive(h): + try: + world.set_position(h, random.uniform(-10, 10), 20.0, random.uniform(-10, 10)) + world.set_linear_velocity(h, 0, 0, 0) + stats["resets"] += 1 + except ValueError: + # Fallback: if it was destroyed between the is_alive check + # and the set_position call, just ignore it. + pass + + time.sleep(0.5) + # --- THREAD 4: THE MUTATOR (CONTROLLED HAMMER) --- + def worker_mutator(): + while running: + try: + # Destroy 5 bodies, create 5 bodies. + # Stresses the BroadPhase AABB tree and synchronization locks. + idx_to_replace = [random.randint(0, num_bodies - 1) for _ in range(5)] + victims = [int(handles[i]) for i in idx_to_replace] + + world.destroy_bodies_batch(handles=victims) + + new_h = world.create_bodies_batch( + positions=[(0, 40, 0)] * 5, + sizes=[[1, 1, 1]] * 5, + shape_type=culverin.SHAPE_SPHERE, + motion_type=culverin.MOTION_DYNAMIC + ) + + for i, h in enumerate(new_h): + handles[idx_to_replace[i]] = h + world.activate(int(h)) # Force awake + + stats["mutations"] += 5 + except RuntimeError: + pass # Lock contention or pool limit hit, try again later + + time.sleep(0.05) # Run 20 times a second + + # Start Threads threads = [ threading.Thread(target=worker_stepper, name="Stepper"), - threading.Thread(target=worker_querier, name="Querier1"), - threading.Thread(target=worker_querier, name="Querier2"), - threading.Thread(target=worker_hammer, name="Hammer") + threading.Thread(target=worker_sensors, name="Sensors"), + threading.Thread(target=worker_housekeeper, name="Housekeeper"), + threading.Thread(target=worker_mutator, name="Mutator") ] for t in threads: t.start() + # Monitoring Loop start_t = time.time() try: while time.time() - start_t < duration: time.sleep(1.0) - print(f"[@ {time.time()-start_t:.1f}s] Steps: {stats['steps']} | Rays: {stats['queries']} | Mutations: {stats['mutations']}") + # Fetch raw events to clear the buffer (simulate event handling) + contacts = world.get_contact_events_raw() + if contacts: + stats["contacts"] += 1 + + print(f"[@ {time.time()-start_t:.1f}s] " + f"Steps: {stats['steps']} | " + f"Rays: {stats['rays']} | " + f"Mutations: {stats['mutations']} | " + f"Resets: {stats['resets']}") + if stats['steps'] == 0: - print("❌ CRITICAL: Physics Thread is deadlocked") + print("❌ CRITICAL: Physics Thread is completely deadlocked") break finally: running = False - for t in threads: t.join(timeout=2.0) + for t in threads: t.join(timeout=1.0) - print(f"✅ STRESS TEST COMPLETE: {stats['steps']} steps, {stats['queries']} rays.") + fps = stats['steps'] / duration + print(f"\n✅ TEST COMPLETE") + print(f"Final Performance: {fps:.2f} FPS") + print(f"Total Steps: {stats['steps']}") + print(f"Total Raycasts: {stats['rays']}") def run_churn_test(duration=10.0): # There is a known memory issue. will investigate...