src/cuda/: scaffold native CUDA backend (skeleton + work plan)

slarson · slarson · commit 9f92972a7875 · 2026-05-03T16:24:51.000+01:00
Lays out the structure for a native CUDA backend that mirrors PR #222's native Metal port. The actual kernel ports (translating sphFluid.cl's 1515 lines to sphFluid.cu CUDA C++) are deferred — they're ~2 weeks of focused work and depend on PR #222's owSolver abstract base landing first to avoid a refactor. What this commit provides: src/cuda/README.md Implementation plan (5 phases, ~2 weeks estimated, with file-by-file mapping to PR #222's Metal port for structural cribbing) src/cuda/sphFluid.cu Skeleton with __global__ kernel signatures (TODO bodies). One per OpenCL kernel in sphFluid.cl. Includes an explicit reminder that pcisphComputeElasticForces must keep elastic forces in WORLD coordinates (this is where the Taichi pancake bug originates; CUDA port should match OpenCL exactly). inc/owCudaSolver.h Public C++ interface mirroring owOpenCLSolver.h. Method signatures commented out so this header doesn't try to declare functions whose definitions don't exist yet. What this commit does NOT do: - Touch makefile, makefile.OSX — they don't reference these files, existing builds are unaffected. - Add backend=cuda to owConfigProperty — would require a real implementation to dispatch to. - Attempt actual CUDA kernel implementation — that's Phase 1 of the work plan in src/cuda/README.md. Reasoning: this gives the next developer (or a future LLM session) a clear target architecture without committing to a half-baked port. When PR #222 lands, the CUDA work can proceed against the same owSolver abstract base + src/kernels/ descriptor pattern that PR #222 introduces, rather than being structurally divergent from the Metal half.
diff --git a/inc/owCudaSolver.h b/inc/owCudaSolver.h
@@ -0,0 +1,63 @@
+// owCudaSolver — native CUDA backend for Sibernetic's PCISPH solver.
+//
+// STATUS: skeleton only. The public interface is sketched, but every
+// method signature is commented out until its definition exists. See
+// ../src/cuda/README.md for the implementation plan.
+//
+// Mirrors owOpenCLSolver.h's interface exactly so owPhysicsFluidSimulator
+// can dispatch between OpenCL, Metal (PR #222), and CUDA via the same
+// base class. Once PR #222's owSolver.h lands, this should inherit from
+// owSolver and override the virtual methods rather than duplicating the
+// signatures.
+
+#ifndef OWCUDASOLVER_H_
+#define OWCUDASOLVER_H_
+
+#include "owConfigProperty.h"
+
+class owCudaSolver {
+ public:
+  // TODO: constructors mirroring owOpenCLSolver (commented out until defined)
+  // owCudaSolver(float *positions, float *velocities, owConfigProperty *config,
+  //              float *connections = nullptr, float *membranes = nullptr,
+  //              int *particleMembranes = nullptr);
+  // ~owCudaSolver();
+
+  // ── Neighbor search ──
+  // unsigned int _runClearBuffers(owConfigProperty *config);
+  // unsigned int _runHashParticles(owConfigProperty *config);
+  // void _runSort(owConfigProperty *config);
+  // unsigned int _runSortPostPass(owConfigProperty *config);
+  // unsigned int _runIndexx(owConfigProperty *config);
+  // void _runIndexPostPass(owConfigProperty *config);
+  // unsigned int _runFindNeighbors(owConfigProperty *config);
+
+  // ── PCISPH physics ──
+  // unsigned int _run_pcisph_computeDensity(owConfigProperty *config);
+  // unsigned int _run_pcisph_computeForcesAndInitPressure(owConfigProperty *config);
+  // unsigned int _run_pcisph_computeElasticForces(owConfigProperty *config);
+  // unsigned int _run_pcisph_predictPositions(owConfigProperty *config);
+  // unsigned int _run_pcisph_predictDensity(owConfigProperty *config);
+  // unsigned int _run_pcisph_correctPressure(owConfigProperty *config);
+  // unsigned int _run_pcisph_computePressureForceAcceleration(owConfigProperty *config);
+  // unsigned int _run_pcisph_integrate(int iterationCount,
+  //                                     int pcisph_integrate_mode,
+  //                                     owConfigProperty *config);
+
+  // ── Membrane interaction ──
+  // unsigned int _run_clearMembraneBuffers(owConfigProperty *config);
+  // unsigned int _run_computeInteractionWithMembranes(owConfigProperty *config);
+
+  // ── Buffer access (mirrors owOpenCLSolver pattern) ──
+  // void read_position_buffer(float *positions, owConfigProperty *config);
+  // void read_velocity_buffer(float *velocities, owConfigProperty *config);
+  // void read_density_buffer(float *density, owConfigProperty *config);
+
+ private:
+  // TODO: CUDA stream, device pointers for each buffer (positions,
+  // velocities, accelerations, density, pressure, neighbor map, grid
+  // index, etc.). See PR #222's owMetalSolver.cpp for the equivalent
+  // Metal implementation as a structural template.
+};
+
+#endif  // OWCUDASOLVER_H_
diff --git a/src/cuda/README.md b/src/cuda/README.md
@@ -0,0 +1,113 @@
+# Sibernetic CUDA backend (skeleton)
+
+> **Status: scaffolding only.** Files in this directory and `inc/owCudaSolver.h` lay out the structure for a native CUDA backend that mirrors PR #222's native Metal port. The actual CUDA kernel ports (translating `src/sphFluid.cl` to `src/cuda/sphFluid.cu`) are deferred — the full backend is roughly two weeks of focused work (see the implementation plan below) and depends on PR #222's `owSolver` abstract base landing first.
+
+## Why this exists
+
+The `ow-native-gpu-0.1.0` line is built on the strategic decision (see `DEVELOPMENT_LOG.md`) to use **two vendor-backed GPU backends** instead of relying on cross-platform Python compilers like Taichi (whose maintenance has slowed) or PyTorch (whose per-kernel-launch overhead makes it 21× slower than OpenCL on the same hardware):
+
+| Platform | Backend | Status |
+|---|---|---|
+| Apple Silicon | native Metal (PR #222 by Wei Weng) | pending merge (branch `weiweng/modernize-makefile-osx`) |
+| NVIDIA | **native CUDA (this directory)** | **scaffolded; not implemented** |
+| Linux server | OpenCL via NVIDIA runtime (existing) | parity baseline; do not invest |
+| Cross-platform fallback | Taichi-CUDA / Taichi-Metal (existing) | bug-blocked; needs `taichi_solver.py` fix |
+
+## Structure (mirrors PR #222 / Metal)
+
+```
+src/cuda/
+├── README.md          ← you are here
+├── sphFluid.cu        ← all CUDA __global__ kernels (port of src/sphFluid.cl)
+├── CudaContext.cpp/h  ← CUDA device init, stream, memory pools (TODO)
+└── kernels/           ← one .cuh per kernel descriptor (mirrors PR #222's src/kernels/) (TODO)
+
+inc/
+└── owCudaSolver.h     ← public C++ interface, mirrors owOpenCLSolver
+
+src/
+├── owCudaSolver.cpp   ← bridge from owSolver virtual interface to .cu kernels (TODO)
+└── backend/
+    └── CudaBackend.cpp/h  ← (TODO) CUDA runtime API wrapper, equivalent to PR #222's MetalBackend
+```
+
+## Implementation plan
+
+Sequenced so each step produces a working artifact:
+
+### Phase 0: Wait for PR #222 to land
+Reason: PR #222 introduces `inc/owSolver.h` (the abstract base both backends implement) and the `src/kernels/` descriptor pattern. Building the CUDA backend on the pre-PR-#222 OpenCL-only structure means refactoring once #222 lands. Wait until #222 merges.
+
+### Phase 1: Port `sphFluid.cl` → `sphFluid.cu` (literal translation)
+Translate every OpenCL kernel in `src/sphFluid.cl` (1515 lines) to CUDA. Mostly mechanical:
+- `__kernel void` → `__global__ void`
+- `__global float4 *buf` → `float4 *buf` with explicit pointer args
+- `get_global_id(0)` → `blockIdx.x * blockDim.x + threadIdx.x`
+- `barrier(CLK_LOCAL_MEM_FENCE)` → `__syncthreads()`
+- `__local` → `__shared__`
+- OpenCL math intrinsics → CUDA intrinsics (`fabs` → `fabsf`, etc.)
+
+Estimated: 2-3 days for a careful translation, with a parity test against the OpenCL output at each kernel.
+
+### Phase 2: Implement `CudaBackend.cpp` (host-side dispatch)
+Wraps CUDA Runtime API calls:
+- `cudaMalloc` / `cudaFree` for buffers
+- `cudaMemcpy` for host-device transfers
+- `<<<grid, block>>>` kernel launches
+- `cudaStreamSynchronize` for ordering
+
+Estimated: 2-3 days.
+
+### Phase 3: Implement `owCudaSolver.cpp`
+Bridge between `owSolver` virtual interface (PR #222's abstraction) and `CudaBackend.cpp`'s kernel launches. Mirrors `owMetalSolver.cpp` from PR #222 line-by-line.
+
+Estimated: 1-2 days.
+
+### Phase 4: Wire into the build
+- Add `nvcc` to the Linux makefile path
+- Add CUDA backend selection to `owConfigProperty.cpp` (`backend=cuda`)
+- Update `Dockerfile` for sibernetic-runner to install CUDA toolkit (already has CUDA runtime via nvidia/cuda image)
+- Add `backend=cuda` to the cross-backend regression script
+
+Estimated: 1 day.
+
+### Phase 5: Cross-backend parity validation
+Run `scripts/cross_backend_regression.py --backend cuda --backend opencl --local-binary <PR222-Metal>`. All three should produce demo1 cube-stability metrics within the existing tolerance bands (extent retention ≥ 80%, mean_y fell ≥ 50%).
+
+Estimated: 1 day of measurement + tuning.
+
+### Total estimated effort: ~2 weeks of focused work for a competent CUDA developer.
+
+## Reference files in PR #222 to model from
+
+When PR #222 lands, the matching CUDA files would mirror these structurally:
+
+| Metal file | CUDA equivalent |
+|---|---|
+| `inc/owMetalSolver.h` | `inc/owCudaSolver.h` |
+| `src/owMetalSolver.cpp` | `src/owCudaSolver.cpp` |
+| `src/owMetalPrivateImpl.cpp` | `src/owCudaPrivateImpl.cpp` (if needed) |
+| `src/backend/MetalBackend.{cpp,h}` | `src/backend/CudaBackend.{cpp,h}` |
+| `src/metal/sphFluid.metal` | `src/cuda/sphFluid.cu` |
+| `src/kernels/*.h` | `src/kernels/*.h` (already shared with Metal — same descriptors) |
+
+The Metal/CUDA divergence is **only** in the kernel language (MSL vs CUDA C++) and the host-side runtime API (Metal C++ vs CUDA Runtime). The algorithm specification, kernel descriptors, and abstract solver interface are shared.
+
+## Why not just use Taichi-CUDA?
+
+We tested it on 2026-05-03:
+- 1-sec demo1 sim: cube didn't move at all (mean_y unchanged 44.42 → 44.42)
+- 5-sec demo1 sim on Apple Silicon Taichi-Metal: cube pancaked (extent retention 7.6%)
+- Both manifest the same algorithmic bug in `taichi_solver.py` (the documented coordinate-scale issue)
+
+Fixing `taichi_solver.py` would unblock Taichi as a backend for both Metal and CUDA. Until then, native CUDA is the path forward for NVIDIA hardware.
+
+## Why not just use OpenCL on NVIDIA?
+
+It actually works fine — Cloud Run + L4 + NVIDIA's OpenCL runtime measures at 86 sec for a 1-sec demo1 sim, with cube physics intact. However:
+- Apple killed OpenCL on Apple Silicon, so it's not a path forward for cross-platform dev
+- The 2015 AMD APP SDK we historically link against is abandoned
+- NVIDIA's OpenCL is still maintained but not actively invested in
+- For long-term maintainability we want vendor-backed APIs (CUDA on NVIDIA, Metal on Apple)
+
+OpenCL on NVIDIA stays as the **parity baseline** in the cross-backend regression: when we add the native CUDA backend, its outputs must match OpenCL within tolerance.
diff --git a/src/cuda/sphFluid.cu b/src/cuda/sphFluid.cu
@@ -0,0 +1,111 @@
+// CUDA kernel scaffolding for Sibernetic's PCISPH solver.
+//
+// STATUS: skeleton only. Function signatures and a high-level work plan
+// are present; the actual kernel bodies are TODO. See ./README.md for
+// the implementation phases.
+//
+// Each kernel here should be a literal port of its OpenCL counterpart in
+// src/sphFluid.cl. Algorithm-level changes are out of scope — the goal
+// is for `backend=cuda` to produce position/velocity output matching
+// `backend=opencl` within fp32 noise on the cross-backend regression.
+
+#ifndef SIBERNETIC_CUDA_SPHFLUID_CU
+#define SIBERNETIC_CUDA_SPHFLUID_CU
+
+#include <cuda_runtime.h>
+
+// -----------------------------------------------------------------------
+// Neighbor search
+// -----------------------------------------------------------------------
+
+__global__ void clearBuffers(/* TODO: signature mirroring sphFluid.cl */) {
+  // TODO: port from src/sphFluid.cl ::clearBuffers
+}
+
+__global__ void hashParticles(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::hashParticles
+  // OpenCL: global_id → particle index, compute cell, write to gridCellIndex
+}
+
+// Sort: in CUDA we'd use thrust::sort_by_key on (cellId, particleIndex).
+// No __global__ kernel needed; called from the host in CudaBackend.cpp.
+
+__global__ void sortPostPass(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::sortPostPass
+}
+
+__global__ void indexx(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::indexx
+}
+
+__global__ void indexPostPass(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::indexPostPass
+}
+
+__global__ void findNeighbors(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::findNeighbors
+  // 27-cell neighborhood walk; build per-particle neighbor list.
+}
+
+// -----------------------------------------------------------------------
+// PCISPH physics
+// -----------------------------------------------------------------------
+
+__global__ void pcisphComputeDensity(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::pcisph_computeDensity
+  // Wpoly6 kernel, density estimation per particle.
+}
+
+__global__ void pcisphComputeForcesAndInitPressure(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::pcisph_computeForcesAndInitPressure
+  // Viscosity, surface tension, gravity, body forces.
+}
+
+__global__ void pcisphComputeElasticForces(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::pcisph_computeElasticForces
+  // Spring forces between elastic-bonded particles.
+  // CRITICAL: this is the kernel where Taichi's coordinate-scale bug
+  // manifests. CUDA port should match OpenCL's coordinate handling
+  // exactly — keep elastic forces in WORLD coordinates, not scaled.
+}
+
+__global__ void pcisphPredictPositions(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::pcisph_predictPositions
+  // PCISPH iteration step 1: predict positions under current acceleration.
+}
+
+__global__ void pcisphPredictDensity(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::pcisph_predictDensity
+  // PCISPH iteration step 2: re-evaluate density at predicted positions.
+}
+
+__global__ void pcisphCorrectPressure(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::pcisph_correctPressure
+  // PCISPH iteration step 3: update pressure field to enforce
+  // incompressibility (density deviation < 1%).
+}
+
+__global__ void pcisphComputePressureForceAcceleration(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::pcisph_computePressureForceAcceleration
+  // gradWspiky; symmetric pressure force.
+}
+
+__global__ void pcisphIntegrate(/* TODO */) {
+  // TODO: port from src/sphFluid.cl ::pcisph_integrate
+  // Leapfrog time integration; mode 0 = position update,
+  // mode 1 = velocity update.
+}
+
+// -----------------------------------------------------------------------
+// Membrane interaction
+// -----------------------------------------------------------------------
+
+__global__ void clearMembraneBuffers(/* TODO */) {
+  // TODO: port the matching kernel from src/sphFluid.cl (confirm its name there).
+}
+
+__global__ void computeInteractionWithMembranes(/* TODO */) {
+  // TODO: port the matching kernel from src/sphFluid.cl (confirm its name there).
+}
+
+#endif // SIBERNETIC_CUDA_SPHFLUID_CU