Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add native setStateVector support to lightning.gpu #930

Merged
merged 24 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
73308b4
initial commit
multiphaseCFD Oct 1, 2024
573b52b
Auto update version from '0.39.0-dev34' to '0.39.0-dev35'
ringo-but-quantum Oct 1, 2024
e475f26
add changelog
multiphaseCFD Oct 1, 2024
3ae3f25
remove non-necessary state data copy
multiphaseCFD Oct 1, 2024
c2b9a70
Trigger CIs
multiphaseCFD Oct 1, 2024
7cfdcb9
update docstring
multiphaseCFD Oct 1, 2024
04c5816
Merge branch 'master' into direct_setStateVector_lgpu
multiphaseCFD Oct 3, 2024
a76de38
Auto update version from '0.39.0-dev38' to '0.39.0-dev39'
ringo-but-quantum Oct 3, 2024
098df13
update _apply_state_vector based on recent changes
multiphaseCFD Oct 3, 2024
f04b52d
apply alfredo's suggestion
multiphaseCFD Oct 3, 2024
6e7bb17
quick fix
multiphaseCFD Oct 3, 2024
130aed5
revert some changes
multiphaseCFD Oct 4, 2024
4e5b12b
add use_async to setStateVector method
multiphaseCFD Oct 4, 2024
a8d2671
deprecate sync and use use_async only
multiphaseCFD Oct 4, 2024
a1bae77
unify sync and use_async in the python layer
multiphaseCFD Oct 4, 2024
6d1dfc8
set previous setStateVector as private
multiphaseCFD Oct 4, 2024
2b7d224
tidy up C++ unit tests / pybind layer
multiphaseCFD Oct 4, 2024
3d84821
quick fix
multiphaseCFD Oct 4, 2024
b750db3
Merge branch 'master' into direct_setStateVector_lgpu
multiphaseCFD Oct 7, 2024
1b8678e
add docs
multiphaseCFD Oct 7, 2024
e99f774
Auto update version from '0.39.0-dev39' to '0.39.0-dev40'
ringo-but-quantum Oct 7, 2024
c3d57fa
quick update
multiphaseCFD Oct 7, 2024
f2a0bdf
Trigger CIs
multiphaseCFD Oct 7, 2024
93f1ffb
Trigger CIs
multiphaseCFD Oct 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@

### Improvements

* Add `setStateVector(state, wire)` support to the `lightning.gpu` C++ layer.
[(#930)](https://github.com/PennyLaneAI/pennylane-lightning/pull/930)

* Add zero-state initialization to both `StateVectorCudaManaged` and `StateVectorCudaMPI` constructors to remove the `reset_state` in the python layer ctor and refactor `setBasisState(state, wires)` in the C++ layer.
[(#933)](https://github.com/PennyLaneAI/pennylane-lightning/pull/933)

Expand Down
2 changes: 1 addition & 1 deletion pennylane_lightning/core/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@
Version number (major.minor.patch[-label])
"""

__version__ = "0.39.0-dev39"
__version__ = "0.39.0-dev40"
Original file line number Diff line number Diff line change
Expand Up @@ -289,61 +289,48 @@ class StateVectorCudaMPI final
BaseType::getDataBuffer().zeroInit();
setBasisState_(value, index, use_async);
}

/**
* @brief Set values for a batch of elements of the state-vector. This
* method is implemented by the customized CUDA kernel defined in the
* DataBuffer class.
* @brief Set values for a batch of elements of the state-vector.
*
* @param num_indices Number of elements to be passed to the state vector.
* @param values Pointer to values to be set for the target elements.
* @param indices Pointer to indices of the target elements.
* @param async Use an asynchronous memory copy.
* @param state_ptr Pointer to initial state data.
* @param num_states Length of initial state data.
* @param wires Wires.
* @param use_async Use an asynchronous memory copy. Default is false.
*/
template <class index_type, std::size_t thread_per_block = 256>
void setStateVector(const index_type num_indices,
const std::complex<Precision> *values,
const index_type *indices, const bool async = false) {
BaseType::getDataBuffer().zeroInit();

std::vector<index_type> indices_local;
std::vector<std::complex<Precision>> values_local;

for (std::size_t i = 0; i < static_cast<std::size_t>(num_indices);
i++) {
int index = indices[i];
PL_ASSERT(index >= 0);
std::size_t rankId =
static_cast<std::size_t>(index) >> BaseType::getNumQubits();

if (rankId == mpi_manager_.getRank()) {
int local_index = static_cast<int>(
compute_local_index(static_cast<std::size_t>(index),
this->getNumLocalQubits()));
indices_local.push_back(local_index);
values_local.push_back(values[i]);
void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
const std::vector<std::size_t> &wires,
bool use_async = false) {
PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
"Inconsistent state and wires dimensions.");

const auto num_qubits = this->getTotalNumQubits();

PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(),
[&num_qubits](const auto i) {
return i >= num_qubits;
}) == wires.end(),
"Invalid wire index.");

using index_type =
typename std::conditional<std::is_same<PrecisionT, float>::value,
int32_t, int64_t>::type;

// Calculate the indices of the state-vector to be set.
// TODO: Could move to GPU/MPI calculation if the state size is large.
std::vector<index_type> indices(num_states);
constexpr std::size_t one{1U};
for (std::size_t i = 0; i < num_states; i++) {
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
std::size_t index{0U};
for (std::size_t j = 0; j < wires.size(); j++) {
const std::size_t bit = (i & (one << j)) >> j;
const std::size_t wire = wires[wires.size() - 1 - j];
index |= bit << (num_qubits - 1 - wire);
}
indices[i] = static_cast<index_type>(index);
}

auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();

index_type num_elements = indices_local.size();

DataBuffer<index_type, int> d_indices{
static_cast<std::size_t>(num_elements), device_id, stream_id, true};

DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
device_id, stream_id, true};

d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(),
async);
d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(),
async);

setStateVector_CUDA(BaseType::getData(), num_elements,
d_values.getData(), d_indices.getData(),
thread_per_block, stream_id);
PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
setStateVector_<index_type>(num_states, state_ptr, indices.data(),
use_async);
mpi_manager_.Barrier();
}

Expand Down Expand Up @@ -1548,6 +1535,62 @@ class StateVectorCudaMPI final
return t_indices;
}

/**
* @brief Set values for a batch of elements of the state-vector. This
* method is implemented by the customized CUDA kernel defined in the
* DataBuffer class.
*
* @param num_indices Number of elements to be passed to the state vector.
* @param values Pointer to values to be set for the target elements.
* @param indices Pointer to indices of the target elements.
* @param async Use an asynchronous memory copy.
*/
template <class index_type, std::size_t thread_per_block = 256>
void setStateVector_(const index_type num_indices,
const std::complex<Precision> *values,
const index_type *indices, const bool async = false) {
BaseType::getDataBuffer().zeroInit();

std::vector<index_type> indices_local;
std::vector<std::complex<Precision>> values_local;

for (std::size_t i = 0; i < static_cast<std::size_t>(num_indices);
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
i++) {
int index = indices[i];
PL_ASSERT(index >= 0);
std::size_t rankId =
static_cast<std::size_t>(index) >> BaseType::getNumQubits();

if (rankId == mpi_manager_.getRank()) {
int local_index = static_cast<int>(
compute_local_index(static_cast<std::size_t>(index),
this->getNumLocalQubits()));
indices_local.push_back(local_index);
values_local.push_back(values[i]);
}
}

auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();

index_type num_elements = indices_local.size();

DataBuffer<index_type, int> d_indices{
static_cast<std::size_t>(num_elements), device_id, stream_id, true};

DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
device_id, stream_id, true};

d_indices.CopyHostDataToGpu(indices_local.data(), d_indices.getLength(),
async);
d_values.CopyHostDataToGpu(values_local.data(), d_values.getLength(),
async);

setStateVector_CUDA(BaseType::getData(), num_elements,
d_values.getData(), d_indices.getData(),
thread_per_block, stream_id);
}

/**
* @brief Set value for a single element of the state-vector on device. This
* method is implemented by cudaMemcpy.
Expand Down Expand Up @@ -1637,8 +1680,8 @@ class StateVectorCudaMPI final
}

/**
* @brief Apply parametric Pauli gates to local statevector using custateVec
* calls.
* @brief Apply parametric Pauli gates to local statevector using
* custateVec calls.
*
* @param pauli_words List of Pauli words representing operation.
* @param ctrls Control wires
Expand Down Expand Up @@ -1708,7 +1751,8 @@ class StateVectorCudaMPI final
});

// Initialize a vector to store the status of wires and default its
// elements as zeros, which assumes there is no target and control wire.
// elements as zeros, which assumes there is no target and control
// wire.
std::vector<int> statusWires(this->getTotalNumQubits(),
WireStatus::Default);

Expand Down Expand Up @@ -1868,7 +1912,8 @@ class StateVectorCudaMPI final
});

// Initialize a vector to store the status of wires and default its
// elements as zeros, which assumes there is no target and control wire.
// elements as zeros, which assumes there is no target and control
// wire.
std::vector<int> statusWires(this->getTotalNumQubits(),
WireStatus::Default);

Expand Down Expand Up @@ -2009,7 +2054,8 @@ class StateVectorCudaMPI final
});

// Initialize a vector to store the status of wires and default its
// elements as zeros, which assumes there is no target and control wire.
// elements as zeros, which assumes there is no target and control
// wire.
std::vector<int> statusWires(this->getTotalNumQubits(),
WireStatus::Default);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -215,36 +215,46 @@ class StateVectorCudaManaged
}

/**
* @brief Set values for a batch of elements of the state-vector. This
* method is implemented by the customized CUDA kernel defined in the
* DataBuffer class.
* @brief Set values for a batch of elements of the state-vector.
*
* @param num_indices Number of elements to be passed to the state vector.
* @param values Pointer to values to be set for the target elements.
* @param indices Pointer to indices of the target elements.
* @param async Use an asynchronous memory copy.
* @param state_ptr Pointer to the initial state data.
* @param num_states Length of the initial state data.
* @param wires Wires.
* @param use_async Use an asynchronous memory copy. Default is false.
*/
template <class index_type, std::size_t thread_per_block = 256>
void setStateVector(const index_type num_indices,
const std::complex<Precision> *values,
const index_type *indices, const bool async = false) {
BaseType::getDataBuffer().zeroInit();

auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();

index_type num_elements = num_indices;
DataBuffer<index_type, int> d_indices{
static_cast<std::size_t>(num_elements), device_id, stream_id, true};
DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
device_id, stream_id, true};
void setStateVector(const ComplexT *state_ptr, const std::size_t num_states,
const std::vector<std::size_t> &wires,
bool use_async = false) {
PL_ABORT_IF_NOT(num_states == Pennylane::Util::exp2(wires.size()),
"Inconsistent state and wires dimensions.");

d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async);
d_values.CopyHostDataToGpu(values, d_values.getLength(), async);
const auto num_qubits = BaseType::getNumQubits();

setStateVector_CUDA(BaseType::getData(), num_elements,
d_values.getData(), d_indices.getData(),
thread_per_block, stream_id);
PL_ABORT_IF_NOT(std::find_if(wires.begin(), wires.end(),
[&num_qubits](const auto i) {
return i >= num_qubits;
}) == wires.end(),
"Invalid wire index.");

using index_type =
typename std::conditional<std::is_same<PrecisionT, float>::value,
int32_t, int64_t>::type;

// Calculate the indices of the state-vector to be set.
// TODO: Could move to GPU calculation if the state size is large.
std::vector<index_type> indices(num_states);
constexpr std::size_t one{1U};
for (std::size_t i = 0; i < num_states; i++) {
std::size_t index{0U};
for (std::size_t j = 0; j < wires.size(); j++) {
const std::size_t bit = (i & (one << j)) >> j;
const std::size_t wire = wires[wires.size() - 1 - j];
index |= bit << (num_qubits - 1 - wire);
}
indices[i] = static_cast<index_type>(index);
}
setStateVector_<index_type>(num_states, state_ptr, indices.data(),
use_async);
}

/**
Expand Down Expand Up @@ -1346,9 +1356,8 @@ class StateVectorCudaManaged
return t_indices;
}

/**
* @brief Set value for a single element of the state-vector on device. This
* method is implemented by cudaMemcpy.
/** @brief Set value for a single element of the state-vector on device.
* This method is implemented by cudaMemcpy.
*
* @param value Value to be set for the target element.
* @param index Index of the target element.
Expand All @@ -1362,6 +1371,40 @@ class StateVectorCudaManaged
stream_id);
}

/**
* @brief Set values for a batch of elements of the state-vector. This
* method is implemented by the customized CUDA kernel defined in the
* DataBuffer class.
*
* @param num_indices Number of elements to be passed to the state vector.
* @param values Pointer to values to be set for the target elements.
* @param indices Pointer to indices of the target elements.
* @param async Use an asynchronous memory copy.
*/
template <class index_type, std::size_t thread_per_block = 256>
void setStateVector_(const index_type num_indices,
const std::complex<Precision> *values,
const index_type *indices, const bool async = false) {
BaseType::getDataBuffer().zeroInit();

auto device_id = BaseType::getDataBuffer().getDevTag().getDeviceID();
auto stream_id = BaseType::getDataBuffer().getDevTag().getStreamID();

index_type num_elements = num_indices;
DataBuffer<index_type, int> d_indices{
static_cast<std::size_t>(num_elements), device_id, stream_id, true};
DataBuffer<CFP_t, int> d_values{static_cast<std::size_t>(num_elements),
device_id, stream_id, true};

d_indices.CopyHostDataToGpu(indices, d_indices.getLength(), async);
d_values.CopyHostDataToGpu(values, d_values.getLength(), async);

setStateVector_CUDA(BaseType::getData(), num_elements,
d_values.getData(), d_indices.getData(),
thread_per_block, stream_id);
PL_CUDA_IS_SUCCESS(cudaDeviceSynchronize());
}

/**
* @brief Apply parametric Pauli gates using custateVec calls.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,10 +63,6 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
using ParamT = PrecisionT; // Parameter's data precision
using np_arr_c = py::array_t<std::complex<ParamT>,
py::array::c_style | py::array::forcecast>;
using np_arr_sparse_ind = typename std::conditional<
std::is_same<ParamT, float>::value,
py::array_t<int32_t, py::array::c_style | py::array::forcecast>,
py::array_t<int64_t, py::array::c_style | py::array::forcecast>>::type;

registerGatesForStateVector<StateVectorT>(pyclass);

Expand All @@ -91,20 +87,15 @@ void registerBackendClassSpecificBindings(PyClass &pyclass) {
"Set the state vector to a basis state on GPU.")
.def(
"setStateVector",
[](StateVectorT &sv, const np_arr_sparse_ind &indices,
const np_arr_c &state, const bool use_async) {
using index_type = typename std::conditional<
std::is_same<ParamT, float>::value, int32_t, int64_t>::type;

sv.template setStateVector<index_type>(
static_cast<index_type>(indices.request().size),
static_cast<std::complex<PrecisionT> *>(
state.request().ptr),
static_cast<index_type *>(indices.request().ptr),
use_async);
[](StateVectorT &sv, const np_arr_c &state,
const std::vector<std::size_t> &wires, const bool async) {
const auto state_buffer = state.request();
const auto state_ptr =
static_cast<const std::complex<ParamT> *>(state_buffer.ptr);
sv.setStateVector(state_ptr, state_buffer.size, wires, async);
},
"Set State Vector on GPU with values and their corresponding "
"indices for the state vector on device")
"Set State Vector on GPU with values for the state vector and "
"wires on the host memory.")
.def(
"DeviceToDevice",
[](StateVectorT &sv, const StateVectorT &other, bool async) {
Expand Down
Loading
Loading