Update 26.01.07

seanprime7 · Angelogeb · seanprime7 · commit 452bf8dffd8f · 2026-01-07T21:50:01.000Z
- 37d4a50ff89b6bce498dd86e57f02f68849ccd9d by Anxhelo Xhebraj <axhebraj@nvidia.com>

Co-authored-by: Anxhelo Xhebraj <axhebraj@nvidia.com>
Signed-off-by: Sean Lee <selee@nvidia.com>
GitOrigin-RevId: 37d4a50ff89b6bce498dd86e57f02f68849ccd9d
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
@@ -22,4 +22,5 @@ pytest tests/test_mpmd_array.py
 if [ $(nvidia-smi -L | wc -l) -ge 8 ]; then
     N_PROCS=2 N_GPUS=4 COMMAND="python -u tests/test_reshard_utils.py" ./scripts/local_mc.sh
     N_PROCS=2 N_GPUS=4 COMMAND="python -u examples/mpmd_reshard.py" ./scripts/local_mc.sh
+    N_PROCS=2 N_GPUS=4 COMMAND="python -u tests/test_dime2.py" ./scripts/local_mc.sh
 fi
diff --git a/src/jaxpp/core.py b/src/jaxpp/core.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/src/jaxpp/dlpack.py b/src/jaxpp/dlpack.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -65,26 +65,51 @@ class DLDevice(ctypes.Structure):
         ("device_id", ctypes.c_int),
     ]
 
-
+# https://github.com/dmlc/dlpack/blob/6ea9b3eb64c881f614cd4537f95f0e125a35555c/include/dlpack/dlpack.h#L141-L182
 class DLDataTypeCode(ctypes.c_uint8):
     """An integer that encodes the category of DLTensor elements' data type."""
     kDLInt = 0
     kDLUInt = 1
     kDLFloat = 2
-    kDLOpaquePointer = 3
+    kDLOpaqueHandle = 3
     kDLBfloat = 4
     kDLComplex = 5
     kDLBool = 6
+    # FP8 data types
+    kDLFloat8_e3m4 = 7
+    kDLFloat8_e4m3 = 8
+    kDLFloat8_e4m3b11fnuz = 9
+    kDLFloat8_e4m3fn = 10
+    kDLFloat8_e4m3fnuz = 11
+    kDLFloat8_e5m2 = 12
+    kDLFloat8_e5m2fnuz = 13
+    kDLFloat8_e8m0fnu = 14
+    # FP6 data types
+    kDLFloat6_e2m3fn = 15
+    kDLFloat6_e3m2fn = 16
+    # FP4 data types
+    kDLFloat4_e2m1fn = 17
 
     def __str__(self):
         return {
-            self.kDLBool: "bool",
             self.kDLInt: "int",
             self.kDLUInt: "uint",
             self.kDLFloat: "float",
+            self.kDLOpaqueHandle: "void_p",
             self.kDLBfloat: "bfloat",
             self.kDLComplex: "complex",
-            self.kDLOpaquePointer: "void_p"
+            self.kDLBool: "bool",
+            self.kDLFloat8_e3m4: "float8_e3m4",
+            self.kDLFloat8_e4m3: "float8_e4m3",
+            self.kDLFloat8_e4m3b11fnuz: "float8_e4m3b11fnuz",
+            self.kDLFloat8_e4m3fn: "float8_e4m3fn",
+            self.kDLFloat8_e4m3fnuz: "float8_e4m3fnuz",
+            self.kDLFloat8_e5m2: "float8_e5m2",
+            self.kDLFloat8_e5m2fnuz: "float8_e5m2fnuz",
+            self.kDLFloat8_e8m0fnu: "float8_e8m0fnu",
+            self.kDLFloat6_e2m3fn: "float6_e2m3fn",
+            self.kDLFloat6_e3m2fn: "float6_e3m2fn",
+            self.kDLFloat4_e2m1fn: "float4_e2m1fn",
         }[self.value]
 
 
@@ -112,11 +137,22 @@ class DLDataType(ctypes.Structure):
         "uint32": (DLDataTypeCode.kDLUInt, 32, 1),
         "uint64": (DLDataTypeCode.kDLUInt, 64, 1),
         "float16": (DLDataTypeCode.kDLFloat, 16, 1),
-        "bfloat16": (DLDataTypeCode.kDLBfloat, 16, 1),  # Added
+        "bfloat16": (DLDataTypeCode.kDLBfloat, 16, 1),
         "float32": (DLDataTypeCode.kDLFloat, 32, 1),
         "float64": (DLDataTypeCode.kDLFloat, 64, 1),
         "complex64": (DLDataTypeCode.kDLComplex, 64, 1),
-        "complex128": (DLDataTypeCode.kDLComplex, 128, 1)
+        "complex128": (DLDataTypeCode.kDLComplex, 128, 1),
+        # FP4 types
+        "float4_e2m1fn": (DLDataTypeCode.kDLFloat4_e2m1fn, 4, 1),
+        # FP8 types
+        "float8_e3m4": (DLDataTypeCode.kDLFloat8_e3m4, 8, 1),
+        "float8_e4m3": (DLDataTypeCode.kDLFloat8_e4m3, 8, 1),
+        "float8_e4m3b11fnuz": (DLDataTypeCode.kDLFloat8_e4m3b11fnuz, 8, 1),
+        "float8_e4m3fn": (DLDataTypeCode.kDLFloat8_e4m3fn, 8, 1),
+        "float8_e4m3fnuz": (DLDataTypeCode.kDLFloat8_e4m3fnuz, 8, 1),
+        "float8_e5m2": (DLDataTypeCode.kDLFloat8_e5m2, 8, 1),
+        "float8_e5m2fnuz": (DLDataTypeCode.kDLFloat8_e5m2fnuz, 8, 1),
+        "float8_e8m0fnu": (DLDataTypeCode.kDLFloat8_e8m0fnu, 8, 1),
     }
 
     REV_MAP = {v: k for k, v in TYPE_MAP.items()}
@@ -165,7 +201,7 @@ class DLManagedTensor(ctypes.Structure):
 
 
 class NcclDataType(ctypes.c_uint8):
-    # https://github.com/NVIDIA/nccl/blob/559b70f86c190a0d8f67f0d7a0f2c9810dd1e8c7/src/nccl.h.in#L190-L205C3
+    # https://github.com/NVIDIA/nccl/blob/1e0c869c39bb33f1034cb9920bd2a8a8406f04a3/src/nccl.h.in#L328-L341
     ncclInt8 = 0
     ncclUint8 = 1
     ncclInt32 = 2
@@ -176,6 +212,8 @@ class NcclDataType(ctypes.c_uint8):
     ncclFloat32 = 7
     ncclFloat64 = 8
     ncclBfloat16 = 9
+    ncclFloat8e4m3 = 10
+    ncclFloat8e5m2 = 11
 
     TYPE_MAP = {
         "bool": ncclUint8,
@@ -189,6 +227,8 @@ class NcclDataType(ctypes.c_uint8):
         "float32": ncclFloat32,
         "float64": ncclFloat64,
         "bfloat16": ncclBfloat16,
+        "float8_e4m3fn": ncclFloat8e4m3,
+        "float8_e5m2": ncclFloat8e5m2,
     }
 
 
@@ -219,9 +259,13 @@ def dlpack_nccl_args(dla) -> tuple[RawDataPointer, int, NcclDataType]:
         dltensor.dtype.bits,
         dltensor.dtype.lanes,
     )
+
+    if dtype_key not in DLDataType.REV_MAP:
+        raise ValueError(f"Unsupported dtype: {dtype_key}")
+
     dtype_name = DLDataType.REV_MAP[dtype_key]
-    return (
-        RawDataPointer(data_ptr),
-        nelems,
-        NcclDataType(NcclDataType.TYPE_MAP[dtype_name]),
-    )
+    if dtype_name not in NcclDataType.TYPE_MAP:
+        raise ValueError(f"Unsupported dtype: {dtype_name}")
+    nccl_dtype = NcclDataType.TYPE_MAP[dtype_name]
+
+    return RawDataPointer(data_ptr), nelems, NcclDataType(nccl_dtype)
diff --git a/tests/test_dime2.py b/tests/test_dime2.py
@@ -0,0 +1,84 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import jax
+import jax.numpy as jnp
+import ml_dtypes
+import numpy as np
+from jax.sharding import PartitionSpec as P
+from parameterized import parameterized
+
+import jaxpp.distributed_utils as jppdu
+from jaxpp.dime2 import send_or_recv
+
+
+class SendOrRecvTest(jppdu.JaxDistributedTest):
+    @parameterized.expand(
+        [
+            ("float32", jnp.float32, np.float32),
+            ("bfloat16", jnp.bfloat16, ml_dtypes.bfloat16),
+            ("float8_e4m3fn", jnp.float8_e4m3fn, ml_dtypes.float8_e4m3fn),
+            ("float8_e5m2", jnp.float8_e5m2, ml_dtypes.float8_e5m2),
+        ]
+    )
+    def test_send_or_recv(self, name, jax_dtype, np_dtype):
+        process_count = jax.process_count()
+        process_index = jax.process_index()
+        local_device_count = jax.local_device_count()
+
+        # Sender mesh: row 0 of the (process_count, local_device_count) grid; receiver: row 1
+        devices = np.array(jax.devices()).reshape(process_count, local_device_count)
+        sender_device = devices[0:1]
+        receiver_device = devices[1:2]
+
+        sender_mesh = jax.sharding.Mesh(sender_device, axis_names=("mpmd", "x"))
+        receiver_mesh = jax.sharding.Mesh(receiver_device, axis_names=("mpmd", "x"))
+
+        pspec = P("x")
+        sender_sharding = jax.sharding.NamedSharding(sender_mesh, pspec)
+        receiver_sharding = jax.sharding.NamedSharding(receiver_mesh, pspec)
+
+        global_shape = (8,)
+        expected_values = np.array([1, 2, 3, 4, 5, 6, 7, 8], dtype=np_dtype)
+
+        if process_index == 0:
+            array = jax.device_put(
+                jnp.array(expected_values, dtype=jax_dtype), sender_sharding
+            )
+
+            [wait_send_finish] = send_or_recv(
+                [array], [receiver_sharding], is_send=True
+            )
+            wait_send_finish()
+        else:
+            buffer = jax.device_put(
+                jnp.zeros(global_shape, dtype=jax_dtype), receiver_sharding
+            )
+
+            [enqueue_recv] = send_or_recv([buffer], [sender_sharding], is_send=False)
+            received_array = enqueue_recv()
+
+            received_values = np.array(received_array)
+            np.testing.assert_array_equal(
+                received_values,
+                expected_values,
+                err_msg=f"Received data mismatch for dtype {name}",
+            )
+
+
+if __name__ == "__main__":
+    jppdu.distributed_main(unittest.main)
diff --git a/tests/test_dlpack.py b/tests/test_dlpack.py
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ctypes
+import unittest
+
+import jax.numpy as jnp
+import ml_dtypes
+import numpy as np
+from jax._src.dlpack import to_dlpack
+from parameterized import parameterized
+
+from jaxpp.dlpack import capsule_name, dlpack_nccl_args
+
+_libcudart = ctypes.CDLL("libcudart.so")
+_libcudart.cudaMemcpy.argtypes = [
+    ctypes.c_void_p,  # dst
+    ctypes.c_void_p,  # src
+    ctypes.c_size_t,  # count (bytes)
+    ctypes.c_int,  # kind
+]
+_libcudart.cudaMemcpy.restype = ctypes.c_int
+_cudaMemcpyDeviceToHost = 2
+
+
+def cuda_memcpy_to_host(device_ptr: int, num_bytes: int) -> bytes:
+    host_buffer = (ctypes.c_uint8 * num_bytes)()
+    err = _libcudart.cudaMemcpy(
+        host_buffer, device_ptr, num_bytes, _cudaMemcpyDeviceToHost
+    )
+    if err != 0:
+        raise RuntimeError(f"cudaMemcpy failed with error {err}")
+    return bytes(host_buffer)
+
+
+class TestDlpackExport(unittest.TestCase):
+    @parameterized.expand(
+        [
+            ("float32", jnp.float32, np.float32),
+            ("bfloat16", jnp.bfloat16, ml_dtypes.bfloat16),
+            ("float8_e4m3fn", jnp.float8_e4m3fn, ml_dtypes.float8_e4m3fn),
+            ("float8_e5m2", jnp.float8_e5m2, ml_dtypes.float8_e5m2),
+        ]
+    )
+    def test_dlpack_export(self, name, jax_dtype, np_dtype):
+        x = jnp.array([1, 2, 3], dtype=jax_dtype)
+        capsule = to_dlpack(x)
+        self.assertEqual(capsule_name(capsule), "dltensor")
+        data_ptr, count, nccl_dtype = dlpack_nccl_args(capsule)
+
+        self.assertEqual(count, 3)
+
+        itemsize = np.dtype(np_dtype).itemsize
+        raw_bytes = cuda_memcpy_to_host(data_ptr, count * itemsize)
+        values = np.frombuffer(raw_bytes, dtype=np_dtype)
+
+        np.testing.assert_array_equal(values, np.array([1, 2, 3], dtype=np_dtype))
+
+    def test_unsupported_dtype(self):
+        x = jnp.array([1, 2, 3], dtype=jnp.float8_e4m3b11fnuz)
+        capsule = to_dlpack(x)
+        with self.assertRaises(ValueError) as ctx:
+            dlpack_nccl_args(capsule)
+        self.assertIn("Unsupported dtype", str(ctx.exception))
+
+
+if __name__ == "__main__":
+    unittest.main()

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
	`1`	`+# SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.`
`2`	`2`	`# SPDX-License-Identifier: Apache-2.0`
`3`	`3`	`#`
`4`	`4`	`# Licensed under the Apache License, Version 2.0 (the "License");`