Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions python/tests/kernel/test_kernel_qvector_state_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,3 +450,66 @@ def kernel(vec: cudaq.State):

counts = cudaq.sample(kernel, c)
assert 'Invalid runtime argument type.' in repr(e)


@skipIfNvidiaFP64NotInstalled
def test_extra_qubit_before_qvector_state_f64():
cudaq.reset_target()
cudaq.set_target('nvidia', option='fp64')

c = np.array([0., 0., 0., 1.], dtype=cudaq.complex())
state = cudaq.State.from_data(c)

@cudaq.kernel
def kernel(vec: cudaq.State):
p = cudaq.qubit()
q = cudaq.qvector(vec)
mz(p)
mz(q)

counts = cudaq.sample(kernel, state)
assert '011' in counts
assert len(counts) == 1


@skipIfNvidiaNotInstalled
def test_extra_qubit_before_qvector_state_f32():
cudaq.reset_target()
cudaq.set_target('nvidia')

c = np.array([0., 0., 0., 1.], dtype=np.complex64)
state = cudaq.State.from_data(c)

@cudaq.kernel
def kernel(vec: cudaq.State):
p = cudaq.qubit()
q = cudaq.qvector(vec)
mz(p)
mz(q)

counts = cudaq.sample(kernel, state)
assert '011' in counts
assert len(counts) == 1


@skipIfNvidiaFP64NotInstalled
def test_extra_qubit_before_qvector_large_state_f64():
cudaq.reset_target()
cudaq.set_target('nvidia', option='fp64')

n = 20
v = np.zeros(2**n, dtype=cudaq.complex())
v[-1] = 1.
state = cudaq.State.from_data(v)

@cudaq.kernel
def kernel(vec: cudaq.State):
p = cudaq.qubit()
q = cudaq.qvector(vec)
mz(p)
mz(q)

counts = cudaq.sample(kernel, state, shots_count=100)
expected = '0' + '1' * n
assert expected in counts
assert len(counts) == 1
12 changes: 10 additions & 2 deletions runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ class CuStateVecCircuitSimulator
std::mt19937 randomEngine;
bool ownsDeviceVector = true;

uint32_t maxGridDimY = 65535;

/// @brief Generate a vector of random values
std::vector<double> randomValues(uint64_t num_samples, double max_value) {
std::vector<double> rs;
Expand Down Expand Up @@ -245,7 +247,7 @@ class CuStateVecCircuitSimulator
"CuStateVecCircuitSimulator::addQubitsToState kronprod");
// Compute the kronecker product
nvqir::kronprod<CudaDataType>(
n_blocks, threads_per_block, previousStateDimension,
maxGridDimY, threads_per_block, previousStateDimension,
deviceStateVector, (1UL << count), otherState, newDeviceStateVector);
HANDLE_CUDA_ERROR(cudaGetLastError());
}
Expand Down Expand Up @@ -292,7 +294,7 @@ class CuStateVecCircuitSimulator
"CuStateVecCircuitSimulator::addQubitsToState kronprod");
// Compute the kronecker product
nvqir::kronprod<CudaDataType>(
n_blocks, threads_per_block, previousStateDimension,
maxGridDimY, threads_per_block, previousStateDimension,
deviceStateVector, (1UL << in_state.getNumQubits()),
casted->getDevicePointer(), newDeviceStateVector);
HANDLE_CUDA_ERROR(cudaGetLastError());
Expand Down Expand Up @@ -405,6 +407,12 @@ class CuStateVecCircuitSimulator

HANDLE_CUDA_ERROR(cudaFree(0));
randomEngine = std::mt19937(randomDevice());

int dev;
HANDLE_CUDA_ERROR(cudaGetDevice(&dev));
cudaDeviceProp prop;
HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, dev));
maxGridDimY = static_cast<uint32_t>(prop.maxGridSize[1]);
}

/// The destructor
Expand Down
27 changes: 17 additions & 10 deletions runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -89,26 +89,33 @@ __global__ void cudaKronprod(size_t tsize1, const CudaDataType *arr1,
#pragma pop

template <typename CudaDataType>
void kronprod(uint32_t n_blocks, int32_t threads_per_block,
void kronprod(uint32_t maxGridDimY, int32_t threads_per_block,
size_t tsize1, const void *arr1,
size_t tsize2, const void *arr2,
size_t tsize2, const void *arr2,
void *arr0) {
cudaKronprod<<<n_blocks, threads_per_block>>>(
tsize1, reinterpret_cast<const CudaDataType *>(arr1),
uint32_t n_blocks_x =
(static_cast<uint32_t>(tsize1) + threads_per_block - 1) / threads_per_block;
uint32_t n_blocks_y = static_cast<uint32_t>(
std::min((tsize2 + static_cast<size_t>(threads_per_block) - 1) /
static_cast<size_t>(threads_per_block),
static_cast<size_t>(maxGridDimY)));
dim3 grid(n_blocks_x, n_blocks_y);
cudaKronprod<<<grid, threads_per_block>>>(
tsize1, reinterpret_cast<const CudaDataType *>(arr1),
tsize2, reinterpret_cast<const CudaDataType *>(arr2),
reinterpret_cast<CudaDataType *>(arr0));
}

template void
kronprod<cuFloatComplex>(uint32_t n_blocks, int32_t threads_per_block,
size_t tsize1, const void *arr1,
size_t tsize2, const void *arr2,
kronprod<cuFloatComplex>(uint32_t maxGridDimY, int32_t threads_per_block,
size_t tsize1, const void *arr1,
size_t tsize2, const void *arr2,
void *arr0);

template void
kronprod<cuDoubleComplex>(uint32_t n_blocks, int32_t threads_per_block,
size_t tsize1, const void *arr1,
size_t tsize2, const void *arr2,
kronprod<cuDoubleComplex>(uint32_t maxGridDimY, int32_t threads_per_block,
size_t tsize1, const void *arr1,
size_t tsize2, const void *arr2,
void *arr0);

/// @brief Kernel to set the first N elements of the state vector sv equal to
Expand Down
2 changes: 1 addition & 1 deletion runtime/nvqir/custatevec/CuStateVecCircuitSimulator.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ void initializeDeviceStateVector(uint32_t n_blocks, int32_t threads_per_block,
size_t stateDimension);

template <typename CudaDataType>
void kronprod(uint32_t n_blocks, int32_t threads_per_block, size_t tsize1,
void kronprod(uint32_t maxGridDimY, int32_t threads_per_block, size_t tsize1,
const void *arr1, size_t tsize2, const void *arr2, void *arr0);

#pragma pack(push, 4)
Expand Down
Loading