Merge branch 'cuq26-03-2-public' into 'main'

haidarazzam · haidarazzam · commit 9da0df2fbcaf · 2026-05-01T13:31:41.000-07:00
sync with internal repo1 (commit 2fab2fd88)

See merge request cuda-hpc-libraries/cuquantum-sdk/cuquantum-public!52
diff --git a/python/README.md b/python/README.md
@@ -53,9 +53,9 @@ Runtime dependencies of the cuQuantum Python package include:
 * Driver: Linux (525.60.13+ for CUDA 12, 580.65.06+ for CUDA 13)
 * CUDA Toolkit 12.x or 13.x
 * cuStateVec 1.13.1+
-* cuTensorNet 2.12.1+
-* cuDensityMat >=0.5.1, <0.6.0
-* cuPauliProp  >=0.3.1, <0.4.0
+* cuTensorNet 2.12.2+
+* cuDensityMat >=0.5.2, <0.6.0
+* cuPauliProp  >=0.3.2, <0.4.0
 * cuStabilizer >=0.3.0, <0.4.0
 * Python >=3.11, <3.14
 * NumPy v1.21+
@@ -66,6 +66,7 @@ Runtime dependencies of the cuQuantum Python package include:
 * Qiskit v1.4.2+ (optional, see [installation guide](https://qiskit.org/documentation/getting_started.html))
 * Cirq v0.6.0+ (optional, see [installation guide](https://quantumai.google/cirq/install))
 * mpi4py v3.1.0+ (optional, see [installation guide](https://mpi4py.readthedocs.io/en/stable/install.html))
+* nccl4py (optional)
 
 If you install everything from conda-forge, all the required dependencies are taken care for you (except for the driver).
 
diff --git a/python/cuquantum/_version.py b/python/cuquantum/_version.py
@@ -5,4 +5,4 @@
 # Note: cuQuantum Python follows the cuQuantum SDK version, which is now
 # switched to YY.MM and is different from individual libraries' (semantic)
 # versioning scheme.
-__version__ = '26.3.1'
+__version__ = '26.3.2'
diff --git a/python/cuquantum/densitymat/spectrum.py b/python/cuquantum/densitymat/spectrum.py
@@ -66,24 +66,32 @@ class OperatorSpectrumConfig:
             If not specified, a default value will be chosen. Defaults to 1.
         max_buffer_ratio: Maximum ratio of the total number of blocks in the Krylov subspace to the number of requested eigenvalues.
             If not specified, a default value will be chosen. Must be greater than 1. Defaults to 5.
-        max_restarts: Maximum number of restart cycles allowed during the iterative eigenvalue computation.
-            If not specified, a default value will be chosen. Defaults to 20.
+        max_restarts: Maximum number of thick restarts of the block Krylov algorithm.
+            The total number of Krylov-subspace expansions performed is at most
+            ``max_restarts + 1`` (one initial expansion plus up to ``max_restarts``
+            restarted expansions). A value of ``0`` corresponds to a single expansion
+            with no restart. If not specified, a default value will be chosen.
+            Defaults to 19 (i.e. up to 20 expansions).
     """
     min_krylov_block_size: Optional[int] = None
     max_buffer_ratio: Optional[int] = None
     max_restarts: Optional[int] = None
     
     def _check_int(self, attribute, name, min_value=0):
-        message = f"Invalid value ({attribute}) for '{name}'. Expect non-zero integer or None."
+        if min_value == -1:
+            bound_desc = "non-negative integer"
+        else:
+            bound_desc = f"integer greater than {min_value}"
+        message = f"Invalid value ({attribute}) for '{name}'. Expect {bound_desc} or None."
         if not isinstance(attribute, (type(None), int)):
             raise ValueError(message)
-        if isinstance(attribute, int) and not attribute > min_value: 
+        if isinstance(attribute, int) and not attribute > min_value:
             raise ValueError(message)
 
     def __post_init__(self):
         self._check_int(self.min_krylov_block_size, "min_krylov_block_size",0)
-        self._check_int(self.max_buffer_ratio, "max_buffer_ratio",1)
-        self._check_int(self.max_restarts, "max_restarts",0)
+        self._check_int(self.max_buffer_ratio, "max_buffer_ratio",1) 
+        self._check_int(self.max_restarts, "max_restarts", -1)
     
     @classmethod
     def _option_to_enum(cls, name):
diff --git a/python/extensions/cuquantum/densitymat/jax/cppsrc/cudensitymat.h b/python/extensions/cuquantum/densitymat/jax/cppsrc/cudensitymat.h
@@ -64,7 +64,7 @@
 
 #define CUDENSITYMAT_MAJOR 0 //!< cuDensityMat major version.
 #define CUDENSITYMAT_MINOR 5 //!< cuDensityMat minor version.
-#define CUDENSITYMAT_PATCH 1 //!< cuDensityMat patch version.
+#define CUDENSITYMAT_PATCH 2 //!< cuDensityMat patch version.
 #define CUDENSITYMAT_VERSION (CUDENSITYMAT_MAJOR * 10000 + CUDENSITYMAT_MINOR * 100 + CUDENSITYMAT_PATCH)
 
 
@@ -222,7 +222,7 @@ typedef enum
 typedef enum
 {
   CUDENSITYMAT_OPERATOR_SPECTRUM_CONFIG_MAX_EXPANSION = 0,  ///< int32_t: Configures the max ratio of the number of Krylov subspace blocks to the number of requested eigen-pairs (defaults to 5)
-  CUDENSITYMAT_OPERATOR_SPECTRUM_CONFIG_MAX_RESTARTS = 1,   ///< int32_t: Configures the max number of restarted iterations of the block Krylov algorithm (defaults to 20)
+  CUDENSITYMAT_OPERATOR_SPECTRUM_CONFIG_MAX_RESTARTS = 1,   ///< int32_t: Configures the max number of restarted iterations of the block Krylov algorithm (defaults to 19)
   CUDENSITYMAT_OPERATOR_SPECTRUM_CONFIG_MIN_BLOCK_SIZE = 2, ///< int32_t: Configures the min block size of the block Krylov algorithm (defaults to 1)
 } cudensitymatOperatorSpectrumConfig_t;
 
@@ -2032,33 +2032,6 @@ cudensitymatStatus_t cudensitymatAttachBatchedCoefficients(
                       void * operatorProductBatchedCoeffsTmp[],
                       void * operatorProductBatchedCoeffs[]);
 
-/**
- * \brief Configures the operator action on a quantum state.
- *
- * \param[in] handle Library handle.
- * \param[inout] superoperator Operator.
- * \param[in] stateIn Representative input quantum state on which the operator
- * is supposed to act. The actual quantum state acted on during computation
- * may be different, but it has to be of the same shape, kind,
- * and factorization structure (topology, bond dimensions, etc).
- * \param[in] stateOut Representative output quantum state produced by the action
- * of the operator on the input quantum state. The actual quantum state acted on
- * during computation may be different, but it has to be of the same shape,
- * kind, and factorization structure (topology, bond dimensions, etc).
- * \param[in] attribute Configuration attribute.
- * \param[in] attributeValue Pointer to the configuration attribute value (type-erased).
- * \param[in] attributeSize The size of the configuration attribute value.
- * \return cudensitymatStatus_t 
- */
-cudensitymatStatus_t cudensitymatOperatorConfigureAction(
-                    const cudensitymatHandle_t handle,
-                    cudensitymatOperator_t superoperator,
-                    const cudensitymatState_t stateIn,
-                    const cudensitymatState_t stateOut,
-                    //cudensitymatOperatorActionAttributes_t attribute, //`FIXME
-                    const void * attributeValue,
-                    size_t attributeSize);
-
 /**
  * \brief Prepares the operator for an action on a quantum state.
  *
diff --git a/python/samples/densitymat/operator_mpi_nccl_example.py b/python/samples/densitymat/operator_mpi_nccl_example.py
@@ -67,7 +67,11 @@ def ordered_print(msg):
 
 # Setup NCCL communicator (initialized via nvmath.distributed)
 nvmath.distributed.initialize(dev.id, comm, backends=["nccl"])
-nccl_comm_ptr = nvmath.distributed.get_context().nccl_comm
+nccl_comm = nvmath.distributed.get_context().nccl_comm
+if isinstance(nccl_comm, int):
+    nccl_comm_ptr = nccl_comm
+else:
+    nccl_comm_ptr = nccl_comm.ptr
 ctx.set_communicator(nccl_comm_ptr, provider="NCCL")
 ordered_print("Set NCCL communicator on execution context, enabling distributed computation.")
 
diff --git a/python/samples/densitymat/operator_spectrum_example.py b/python/samples/densitymat/operator_spectrum_example.py
@@ -138,7 +138,7 @@ def take_complex_conjugate_transpose(arr):
 max_num_eigvals = 5
 min_block_size = 4
 max_buffer_ratio = 25
-max_restarts = 10
+max_restarts = 9
 
 
 # Create a sequence of pure states |ψ_i⟩
diff --git a/python/setup.py b/python/setup.py
@@ -37,9 +37,9 @@
     'nvmath-python>=0.7.0, <1.0.0',  # ">=0.7.0,<1.0.0"
     # 'torch', # <-- PyTorch is optional; also, the PyPI version does not support GPU...
     f'custatevec-cu{utils.cuda_major_ver}>=1.13.1, <2',  # ">=1.13.1,<2"
-    f'cutensornet-cu{utils.cuda_major_ver}>=2.12.1, <3',  # ">=2.12.0,<3"
-    f'cudensitymat-cu{utils.cuda_major_ver}>=0.5.1, <0.6',  # ">=0.5.1,<0.6.0"
-    f'cupauliprop-cu{utils.cuda_major_ver}>=0.3.1, <0.4',  # ">=0.3.1,<0.4.0"
+    f'cutensornet-cu{utils.cuda_major_ver}>=2.12.2, <3',  # ">=2.12.2,<3"
+    f'cudensitymat-cu{utils.cuda_major_ver}>=0.5.2, <0.6',  # ">=0.5.2,<0.6.0"
+    f'cupauliprop-cu{utils.cuda_major_ver}>=0.3.2, <0.4',  # ">=0.3.2,<0.4.0"
     f'custabilizer-cu{utils.cuda_major_ver}>=0.3.0, <0.4',  # ">=0.3.0,<0.4.0"
 ]
 if utils.cuda_major_ver == '12':
diff --git a/python/tests/cuquantum_tests/bindings/test_internal.py b/python/tests/cuquantum_tests/bindings/test_internal.py
@@ -64,5 +64,5 @@ def test_data_type_alignment_with_nvmath():
             if val.name != ref_val.name:
                 raise ValueError(f"{val.name} from nvmath has a different name than cuquantum.cudaDataType")
         except ValueError:
-            # nvmath.CudaDataType has two additional values that are not captured by cuquantum.cudaDataType
-            assert val.name in {"CUDA_R_8F_E4M3", "CUDA_R_8F_E5M2"}
+            # nvmath.CudaDataType has additional values that are not captured by cuquantum.cudaDataType
+            assert val.name in {"CUDA_R_8F_E4M3", "CUDA_R_8F_E5M2", "CUDA_R_4F_E2M1"}
diff --git a/python/tests/cuquantum_tests/densitymat/distributed_utils.py b/python/tests/cuquantum_tests/densitymat/distributed_utils.py
@@ -42,7 +42,11 @@ def get_available_provider():
 if AVAILABLE_PROVIDER == "NCCL":
     with cp.cuda.Device(device_id):
         nvmath.distributed.initialize(device_id, mpi_comm, backends=["nccl"])
-        NCCL_COMM_PTR = nvmath.distributed.get_context().nccl_comm
+        nccl_comm = nvmath.distributed.get_context().nccl_comm
+        if isinstance(nccl_comm, int):
+            NCCL_COMM_PTR = nccl_comm
+        else:
+            NCCL_COMM_PTR = nccl_comm.ptr
 
 def skip_if_provider_unavailable(provider: str):
     """Skip test if the requested provider doesn't match the loaded interface."""
diff --git a/python/tests/cuquantum_tests/densitymat/test_work_stream_mpi.py b/python/tests/cuquantum_tests/densitymat/test_work_stream_mpi.py
@@ -9,7 +9,9 @@
 from mpi4py import MPI
 import pytest
 
-from nvmath.bindings import nccl
+# nvmath-python >= 0.9 no longer ships its own NCCL bindings; NCCL is now
+# provided by the standalone ``nccl4py`` package (imported as ``nccl.core``).
+import nccl.core as nccl
 
 from .distributed_utils import (
     mpi_comm as comm,
@@ -72,37 +74,42 @@ def test_work_stream_mpi_communicator_from_int_pointer():
         assert rank == ctx.get_proc_rank()
 
 
+def _bootstrap_nccl_communicator(rank, size):
+    """Create an externally-managed NCCL communicator using nccl4py.
+
+    All ranks must call this collectively. A unique id is generated on every
+    rank to obtain a same-sized buffer, then rank 0's bytes are broadcast via
+    MPI so all ranks join the same communicator.
+    """
+    unique_id = nccl.get_unique_id()
+    comm.Bcast(unique_id.as_ndarray.view(np.int8), root=0)
+    return nccl.Communicator.init(nranks=size, rank=rank, unique_id=unique_id)
+
+
 @pytest.mark.parametrize("sequence_type", [tuple, list])
 def test_work_stream_nccl_communicator_from_pointer(sequence_type):
     """Test setting NCCL communicator from (pointer, size) sequence with externally managed ncclComm_t."""
     skip_if_provider_unavailable("NCCL")
-    
+
     rank = comm.Get_rank()
     size = comm.Get_size()
     device_id = CURRENT_DEVICE_ID
 
     with cp.cuda.Device(device_id):
-        # Bootstrap NCCL communicator externally (following library_handle pattern)
-        unique_id = nccl.UniqueId()
-        if rank == 0:
-            nccl.get_unique_id(unique_id.ptr)
-        comm.Bcast(unique_id._data.view(np.int8), root=0)
-
-        nccl_comm_ptr = nccl.comm_init_rank(size, unique_id.ptr, rank)
-
+        nccl_comm = _bootstrap_nccl_communicator(rank, size)
         try:
             ctx = WorkStream(device_id=device_id)
             # Pass (ncclComm_t value, size) - library_handle wraps it in numpy array internally
             # The size value is not actually used (library uses itemsize of internal holder)
             ctx.set_communicator(
-                sequence_type([nccl_comm_ptr, np.dtype(np.intp).itemsize]),
+                sequence_type([nccl_comm.ptr, np.dtype(np.intp).itemsize]),
                 provider="NCCL"
             )
             assert size == ctx.get_num_ranks()
             assert rank == ctx.get_proc_rank()
         finally:
             # Clean up externally managed NCCL communicator
-            nccl.comm_destroy(nccl_comm_ptr)
+            nccl_comm.destroy()
 
 
 def test_work_stream_nccl_communicator_from_int_pointer():
@@ -114,21 +121,14 @@ def test_work_stream_nccl_communicator_from_int_pointer():
     device_id = CURRENT_DEVICE_ID
 
     with cp.cuda.Device(device_id):
-        # Bootstrap NCCL communicator externally (following library_handle pattern)
-        unique_id = nccl.UniqueId()
-        if rank == 0:
-            nccl.get_unique_id(unique_id.ptr)
-        comm.Bcast(unique_id._data.view(np.int8), root=0)
-
-        nccl_comm_ptr = nccl.comm_init_rank(size, unique_id.ptr, rank)
-
+        nccl_comm = _bootstrap_nccl_communicator(rank, size)
         try:
             ctx = WorkStream(device_id=device_id)
-            ctx.set_communicator(int(nccl_comm_ptr), provider="NCCL")
+            ctx.set_communicator(int(nccl_comm.ptr), provider="NCCL")
             assert size == ctx.get_num_ranks()
             assert rank == ctx.get_proc_rank()
         finally:
             ctx = None
             cp.cuda.Device().synchronize()
-            nccl.comm_finalize(nccl_comm_ptr)
-            nccl.comm_destroy(nccl_comm_ptr)
+            nccl_comm.finalize()
+            nccl_comm.destroy()
diff --git a/python/tests/cuquantum_tests/tensornet/experimental/_internal/state_matrix.py b/python/tests/cuquantum_tests/tensornet/experimental/_internal/state_matrix.py
@@ -440,14 +440,14 @@ def get_random_modes():
     ),
     # Non-Hermitian operator (random unitary product terms)
     _exp_grad_config("complex128", "cupy", create_state_factory(4, "complex128", "SDSD", np.random.default_rng(52), backend="cupy", mark_gradients=True),
-        hamiltonian=NetworkOperatorFactory((2, 2, 2, 2), np.random.default_rng(38), "cupy", dtype="complex128", num_repeats=2, real_coefficients=True, use_random_unitary=True, add_mpo=False),
+        hamiltonian=NetworkOperatorFactory((2, 2, 2, 2), np.random.default_rng(38), "cupy", dtype="complex128", num_repeats=2, real_coefficients=False, use_random_unitary=True, add_mpo=False),
         non_hermitian=True,
     ),
 ]
 
 expectation_gradient_L0_torch = [
     _exp_grad_config("complex128", "torch", create_state_factory((2, 3, 2, 4, 2, 5, 2, 3), "complex128", "SDSDSD", np.random.default_rng(43), backend="torch", mark_gradients=True),
-        hamiltonian=NetworkOperatorFactory((2, 3, 2, 4, 2, 5, 2, 3), np.random.default_rng(31), "torch", dtype="complex128", num_repeats=3, real_coefficients=True, use_random_hermitian=True),
+        hamiltonian=NetworkOperatorFactory((2, 3, 2, 4, 2, 5, 2, 3), np.random.default_rng(31), "torch", dtype="complex128", num_repeats=3, real_coefficients=False, use_random_hermitian=True),
     ),
     # Same as above but with an MPO term
     _exp_grad_config("complex128", "torch", create_state_factory((2, 3, 2, 4, 2, 5, 2, 3), "complex128", "SDSDSD", np.random.default_rng(43), backend="torch", mark_gradients=True),
@@ -457,15 +457,15 @@ def get_random_modes():
 
 expectation_gradient_L1 = [
     _exp_grad_config("complex64", "cupy", create_state_factory(8, "complex64", "SDSDDSD", np.random.default_rng(45), backend="cupy", mark_gradients=True),
-        hamiltonian={"ZYIZXZIZ": 5.0, "XZZYIZXZ": 2.0, "ZZYIXZYY": 3.0}
+        hamiltonian={"ZYIZXZIZ": 5.0j, "XZZYIZXZ": 2.0j, "ZZYIXZYY": 4+3.0j}
     ),
     # Same with identity removal (lightcone simplification)
     _exp_grad_config("complex64", "cupy", create_state_factory(8, "complex64", "SDSDDSD", np.random.default_rng(45), backend="cupy", mark_gradients=True),
         hamiltonian={"ZYIZXZIZ": 5.0, "XZZYIZXZ": 2.0, "ZZYIXZYY": 3.0},
         remove_identity=True,
     ),
     _exp_grad_config("complex128", "numpy", create_state_factory((3, 2, 4, 4, 2, 5), "complex128", "SDSDSD", np.random.default_rng(46), backend="numpy", mark_gradients=True),
-        hamiltonian=NetworkOperatorFactory((3, 2, 4, 4, 2, 5), np.random.default_rng(32), "numpy", dtype="complex128", num_repeats=3, real_coefficients=True, use_random_hermitian=True),
+        hamiltonian=NetworkOperatorFactory((3, 2, 4, 4, 2, 5), np.random.default_rng(32), "numpy", dtype="complex128", num_repeats=3, real_coefficients=False, use_random_hermitian=True),
     ),
     # Same as above but with an MPO term
     _exp_grad_config("complex128", "numpy", create_state_factory((3, 2, 4, 4, 2, 5), "complex128", "SDSDSD", np.random.default_rng(46), backend="numpy", mark_gradients=True),
@@ -475,7 +475,7 @@ def get_random_modes():
 
 expectation_gradient_L1_torch = [
     _exp_grad_config("float64", "torch", create_state_factory(6, "float64", "SDSDSDS", np.random.default_rng(44), backend="torch", mark_gradients=True),
-        hamiltonian={"ZXIXZI": 4.0, "IXZIZX": 3.0}
+        hamiltonian={"ZXIXZI": 4.0+2j, "IXZIZX": 3.0}
     ),
     # Same with identity removal (lightcone simplification)
     _exp_grad_config("float64", "torch", create_state_factory(6, "float64", "SDSDSDS", np.random.default_rng(44), backend="torch", mark_gradients=True),
@@ -489,7 +489,7 @@ def get_random_modes():
     ),
     # Non-Hermitian operator with MPO term
     _exp_grad_config("complex64", "torch", create_state_factory((2, 3, 2, 3), "complex64", "SDSDS", np.random.default_rng(53), backend="torch", mark_gradients=True),
-        hamiltonian=NetworkOperatorFactory((2, 3, 2, 3), np.random.default_rng(39), "torch", dtype="complex64", num_repeats=3, real_coefficients=True, use_random_unitary=True, add_mpo=True),
+        hamiltonian=NetworkOperatorFactory((2, 3, 2, 3), np.random.default_rng(39), "torch", dtype="complex64", num_repeats=3, real_coefficients=False, use_random_unitary=True, add_mpo=True),
         non_hermitian=True,
     ),
 ] if torch is not None else []
@@ -500,15 +500,15 @@ def get_random_modes():
     ),
     # Same as above but with an MPO term
     _exp_grad_config("complex128", "cupy", create_state_factory((3, 3, 3, 3, 3), "complex128", "SSDDS", np.random.default_rng(47), backend="cupy", mark_gradients=True),
-        hamiltonian=NetworkOperatorFactory((3, 3, 3, 3, 3), np.random.default_rng(33), "cupy", dtype="complex128", num_repeats=4, real_coefficients=True, use_random_hermitian=True, add_mpo=True),
+        hamiltonian=NetworkOperatorFactory((3, 3, 3, 3, 3), np.random.default_rng(33), "cupy", dtype="complex128", num_repeats=4, real_coefficients=False, use_random_hermitian=True, add_mpo=True),
     ),
     _exp_grad_config("complex64", "numpy", create_state_factory((2, 3, 2, 4, 2, 5, 2, 3), "complex64", "SDSDDSS", np.random.default_rng(49), backend="numpy", mark_gradients=True),
         hamiltonian=NetworkOperatorFactory((2, 3, 2, 4, 2, 5, 2, 3), np.random.default_rng(35), "numpy", dtype="complex64", num_repeats=4, real_coefficients=True, use_random_unitary=True),
         non_hermitian=True,
     ),
     # Same as above but with an MPO term
     _exp_grad_config("complex64", "numpy", create_state_factory((2, 3, 2, 4, 2, 5, 2, 3), "complex64", "SDSDDSS", np.random.default_rng(49), backend="numpy", mark_gradients=True),
-        hamiltonian=NetworkOperatorFactory((2, 3, 2, 4, 2, 5, 2, 3), np.random.default_rng(35), "numpy", dtype="complex64", num_repeats=4, real_coefficients=True, use_random_unitary=True, add_mpo=True),
+        hamiltonian=NetworkOperatorFactory((2, 3, 2, 4, 2, 5, 2, 3), np.random.default_rng(35), "numpy", dtype="complex64", num_repeats=4, real_coefficients=False, use_random_unitary=True, add_mpo=True),
         non_hermitian=True,
     ),
 ]
@@ -523,8 +523,8 @@ def get_random_modes():
         hamiltonian=NetworkOperatorFactory((2, 2, 2, 2, 2, 2), np.random.default_rng(34), "torch", dtype="float64", num_repeats=2, real_coefficients=True, add_mpo=False),
     ),
     # MPO term only (different RNG state than combined case)
-    _exp_grad_config("float64", "torch", create_state_factory(6, "float64", "SDSDD", np.random.default_rng(48), backend="torch", mark_gradients=True),
-        hamiltonian=NetworkOperatorFactory((2, 2, 2, 2, 2, 2), np.random.default_rng(34), "torch", dtype="float64", num_repeats=0, real_coefficients=True, add_mpo=True),
+    _exp_grad_config("complex64", "torch", create_state_factory(6, "complex64", "SDSDD", np.random.default_rng(48), backend="torch", mark_gradients=True),
+        hamiltonian=NetworkOperatorFactory((2, 2, 2, 2, 2, 2), np.random.default_rng(34), "torch", dtype="complex64", num_repeats=0, real_coefficients=False, add_mpo=True),
     ),
 ] if torch is not None else []
 
diff --git a/python/tests/requirements.txt b/python/tests/requirements.txt
@@ -12,3 +12,4 @@ jsonschema == 4.17.3
 networkx
 mpi4py
 stim == 1.15.0
+nccl4py