# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Benchmarks for atomic operations under high thread contention.

All threads write to a single output location (index 0) to maximize contention
and measure worst-case atomic operation performance.
"""
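
# These benchmark classes follow airspeed velocity (asv) conventions:
# ``params`` and ``param_names`` define the dtype grid, ``setup_cache`` builds
# the input data once per class, and the ``time_*`` methods are the timed
# bodies. Assuming an asv project setup, they are typically run via ``asv run``.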

from typing import Any

import numpy as np

import warp as wp

# Map string parameter names to Warp dtypes
DTYPE_MAP = {
    "float32": wp.float32,
    "int32": wp.int32,
}

# Default input size: 32M elements
NUM_ELEMENTS = 32 * 1024 * 1024


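# Both kernels are generic over the array dtype via ``typing.Any``; Warp
# infers the concrete overload from the arrays passed at launch time.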
@wp.kernel
def max_kernel(
    vals: wp.array(dtype=Any),
    out: wp.array(dtype=Any),
):
    tid = wp.tid()
    val = vals[tid]
    wp.atomic_max(out, 0, val)  # All threads contend on out[0]


@wp.kernel
def min_kernel(
    vals: wp.array(dtype=Any),
    out: wp.array(dtype=Any),
):
    tid = wp.tid()
    val = vals[tid]
    wp.atomic_min(out, 0, val)  # All threads contend on out[0]

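
# Minimal usage sketch (not part of the benchmark suite): launch ``max_kernel``
# directly and check the result against NumPy. The helper name, size, and
# device string below are illustrative; assumes a CUDA-capable device. The
# leading underscore keeps it out of asv's benchmark discovery.
def _example_max_launch(device="cuda:0", n=1024):
    wp.init()
    vals_np = np.random.default_rng(0).random(n).astype(np.float32)
    vals = wp.array(vals_np, dtype=wp.float32, device=device)
    out = wp.zeros(shape=(1,), dtype=wp.float32, device=device)
    wp.launch(max_kernel, (n,), inputs=[vals], outputs=[out], device=device)
    wp.synchronize_device(device)
    assert out.numpy()[0] == vals_np.max()
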

class AtomicMax:
    """Benchmark wp.atomic_max() with high thread contention.

    Uses 4x larger arrays (128M elements) to reduce measurement variation,
    as atomic_max showed ~10% variation with the default 32M elements.
    """

    params = ["float32", "int32"]
    param_names = ["dtype"]

    repeat = 50
    number = 15

    # Use 4x more elements to reduce measurement variation
    num_elements = 4 * NUM_ELEMENTS

    def setup_cache(self):
        rng = np.random.default_rng(42)
        # Generate vals_np for each dtype in DTYPE_MAP
        vals_np_dict = {}
        for dtype_str_key, dtype in DTYPE_MAP.items():
            if dtype == wp.float32:
                vals_np = rng.random(self.num_elements).astype(np.float32)
            elif dtype == wp.int32:
                vals_np = rng.integers(0, 2**31 - 1, size=self.num_elements, dtype=np.int32)
            else:
                vals_np = None
            vals_np_dict[dtype_str_key] = vals_np

        return vals_np_dict

    def setup(self, vals_np_dict, dtype_str):
        wp.init()
        self.device = wp.get_device("cuda:0")

        dtype = DTYPE_MAP[dtype_str]

        self.vals = wp.array(vals_np_dict[dtype_str], dtype=dtype, device=self.device)
        self.out = wp.zeros(shape=(1,), dtype=dtype, device=self.device)

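        # record_cmd=True returns a reusable Launch object instead of launching
        # immediately; re-launching it skips per-call argument processing, so
        # the timed loop measures mostly the kernel itself.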
        self.cmd = wp.launch(
            max_kernel,
            (self.num_elements,),
            inputs=[self.vals],
            outputs=[self.out],
            device=self.device,
            record_cmd=True,
        )

        # Launch once to compile
        self.cmd.launch()
        wp.synchronize_device(self.device)

    def time_cuda(self, vals_np_dict, dtype_str):
        self.out.zero_()
        self.cmd.launch()
        wp.synchronize_device(self.device)


class AtomicMin:
    """Benchmark wp.atomic_min() with high thread contention.

    Uses the standard array size (32M elements), as measurements are already stable.
    """

    params = ["float32", "int32"]
    param_names = ["dtype"]

    repeat = 100
    number = 25

    def setup_cache(self):
        rng = np.random.default_rng(42)
        # Generate vals_np for each dtype in DTYPE_MAP
        vals_np_dict = {}
        for dtype_str_key, dtype in DTYPE_MAP.items():
            if dtype == wp.float32:
                vals_np = rng.random(NUM_ELEMENTS).astype(np.float32)
            elif dtype == wp.int32:
                vals_np = rng.integers(0, 2**31 - 1, size=NUM_ELEMENTS, dtype=np.int32)
            else:
                vals_np = None
            vals_np_dict[dtype_str_key] = vals_np

        return vals_np_dict

    def setup(self, vals_np_dict, dtype_str):
        wp.init()
        self.device = wp.get_device("cuda:0")

        dtype = DTYPE_MAP[dtype_str]

        self.vals = wp.array(vals_np_dict[dtype_str], dtype=dtype, device=self.device)
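        # Note: out starts at zero and all inputs are non-negative, so the
        # stored minimum never actually changes over the run.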
        self.out = wp.zeros(shape=(1,), dtype=dtype, device=self.device)

        self.cmd = wp.launch(
            min_kernel,
            (NUM_ELEMENTS,),
            inputs=[self.vals],
            outputs=[self.out],
            device=self.device,
            record_cmd=True,
        )

        # Launch once to compile
        self.cmd.launch()
        wp.synchronize_device(self.device)

    def time_cuda(self, vals_np_dict, dtype_str):
        self.out.zero_()
        self.cmd.launch()
        wp.synchronize_device(self.device)