From 897e4b7fc065db7d906fddd6f5fdfe566a204854 Mon Sep 17 00:00:00 2001 From: chensgit169 Date: Thu, 30 Nov 2023 15:36:53 +0800 Subject: [PATCH 1/5] fet: use oracle in circuit --- quafu/circuits/quantum_circuit.py | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/quafu/circuits/quantum_circuit.py b/quafu/circuits/quantum_circuit.py index a73837f..7ec4f3b 100644 --- a/quafu/circuits/quantum_circuit.py +++ b/quafu/circuits/quantum_circuit.py @@ -381,27 +381,12 @@ def wrap_to_gate(self, name: str): """ Wrap the circuit to a subclass of QuantumGate, create by metaclass. """ - from quafu.elements.quantum_gate import customize_gate + from quafu.elements.oracle import customize_gate + from copy import deepcopy - gate_structure = [] - qubit_mapping = {q: i for i, q in enumerate(self.used_qubits)} - for gate in self.gates: - if isinstance(gate, QuantumGate): - gate = copy.deepcopy(gate) - # TODO: handel control pos - if isinstance(gate.pos, int): - gate.pos = qubit_mapping[gate.pos] - else: - gate.pos = [qubit_mapping[p] for p in gate.pos] - gate_structure.append(gate) - else: - raise ValueError() - - # TODO: error check - - customized = customize_gate(cls_name=name.lower(), - qubit_num=len(self.used_qubits), - gate_structure=gate_structure) + # TODO: check validity of instructions + gate_structure = [deepcopy(ins) for ins in self.instructions] + customized = customize_gate(name, gate_structure, self.num) return customized def id(self, pos: int) -> "QuantumCircuit": From 4aba7aa34844a6fd9a4adcd1d15c3caa17110a74 Mon Sep 17 00:00:00 2001 From: chensgit169 Date: Thu, 30 Nov 2023 15:37:21 +0800 Subject: [PATCH 2/5] chore: (mistake) remove redundant files --- src/qfvm/circuit.hpp | 241 --- src/qfvm/operators.hpp | 97 -- src/qfvm/qasm.hpp | 42 - src/qfvm/qfvm.cpp | 164 -- src/qfvm/simulator.hpp | 116 -- src/qfvm/statevector.hpp | 1355 ----------------- src/qfvm/types.hpp | 33 - src/qfvm/util.h | 99 -- src/qfvm_gpu/apply_gate_custate.cuh | 46 - src/qfvm_gpu/apply_gate_gpu.cuh | 411 ----- src/qfvm_gpu/cuda_simulator.cuh | 91 -- src/qfvm_gpu/cuda_statevector.cuh | 58 - src/qfvm_gpu/cuda_utils/CudaTexture.h | 39 - src/qfvm_gpu/cuda_utils/helper_cuda.h | 953 ------------ src/qfvm_gpu/cuda_utils/helper_custatevec.hpp | 31 - src/qfvm_gpu/cuda_utils/helper_string.h | 683 --------- src/qfvm_gpu/cuda_utils/ticktock.h | 9 - src/qfvm_gpu/custate_simu.cuh | 35 - 18 files changed, 4503 deletions(-) delete mode 100644 src/qfvm/circuit.hpp delete mode 100644 src/qfvm/operators.hpp delete mode 100644 src/qfvm/qasm.hpp delete mode 100644 src/qfvm/qfvm.cpp delete mode 100644 src/qfvm/simulator.hpp delete mode 100644 src/qfvm/statevector.hpp delete mode 100644 src/qfvm/types.hpp delete mode 100644 src/qfvm/util.h delete mode 100644 src/qfvm_gpu/apply_gate_custate.cuh delete mode 100644 src/qfvm_gpu/apply_gate_gpu.cuh delete mode 100644 src/qfvm_gpu/cuda_simulator.cuh delete mode 100644 src/qfvm_gpu/cuda_statevector.cuh delete mode 100644 src/qfvm_gpu/cuda_utils/CudaTexture.h delete mode 100644 src/qfvm_gpu/cuda_utils/helper_cuda.h delete mode 100644 src/qfvm_gpu/cuda_utils/helper_custatevec.hpp delete mode 100644 src/qfvm_gpu/cuda_utils/helper_string.h delete mode 100644 src/qfvm_gpu/cuda_utils/ticktock.h delete mode 100644 src/qfvm_gpu/custate_simu.cuh diff --git a/src/qfvm/circuit.hpp b/src/qfvm/circuit.hpp deleted file mode 100644 index cd08bea..0000000 --- a/src/qfvm/circuit.hpp +++ /dev/null @@ -1,241 +0,0 @@ -#pragma once -#include "operators.hpp" -#include "qasm.hpp" -#include -#include -#include -#include -#include "util.h" -#include -#include -#include - -namespace py = pybind11; -using namespace pybind11::literals; - - -void check_operator(QuantumOperator &op){ - std::cout << "-------------" << std::endl; - - std::cout << "name: " << op.name() << std::endl; - std::cout << "pos: "; - Qfutil::printVector(op.positions()); - - std::cout << "paras: "; - Qfutil::printVector(op.paras()); - - std::cout << "control number: "; - std::cout << op.control_num() << std::endl; - - std::cout << "matrix: " << std::endl; - std::cout << op.mat() << std::endl; - - std::cout << "flatten matrix: " << std::endl; - auto mat = op.mat(); - // Eigen::Map v1(mat.data(), mat.size()); - // std::cout << "v1: " << v1 << std::endl; - auto matv = mat.data(); - for (auto i = 0;i < mat.size();i++){ - std::cout << matv[i] << " "; - } - std::cout << std::endl; - std::cout << "-------------" << std::endl; -} - - -class Circuit{ - private: - uint qubit_num_; - vector instructions_; - uint max_targe_num_; - uint cbit_num_; - // to sample count - vector> measure_vec_; - bool final_measure_ = true; - - public: - Circuit(); - explicit Circuit(uint qubit_num); - explicit Circuit(vector &ops); - explicit Circuit(py::object const&pycircuit); - - void add_op(QuantumOperator &op); - void compress_instructions(); - uint qubit_num() const { return qubit_num_; } - uint cbit_num() const { return cbit_num_; } - uint max_targe_num() const { return max_targe_num_; } - bool final_measure() const { return final_measure_; } - vector gates(); - vector> measure_vec() { return measure_vec_; } - vector instructions() const { return instructions_; } - QuantumOperator from_pyops(py::object const &obj); -}; - -void Circuit::add_op(QuantumOperator &op){ - for (pos_t pos : op.positions()){ - if (pos > qubit_num_) { - throw "invalid position on quantum registers"; - } - else{ - instructions_.push_back(op); - } - } -} - - Circuit::Circuit(){}; - Circuit::Circuit(uint qubit_num) - : - qubit_num_(qubit_num){ } - - Circuit::Circuit(vector &ops) - : - instructions_(ops), - max_targe_num_(0){ - qubit_num_ = 0; - for (auto op : ops){ - for (pos_t pos : op.positions()){ - if (op.targe_num() > max_targe_num_) - max_targe_num_ = op.targe_num(); - if (pos+1 > qubit_num_){ qubit_num_ = pos+1; } - } - } -} - -vector Circuit::gates(){ - // provide gates for gpu and custate - std::vector classics = {"measure", "cif", "reset"}; - vector gates; - for(auto op : instructions_){ - if(std::find(classics.begin(), classics.end(), op.name()) == classics.end()){ - gates.push_back(op); - } - } - return gates; -} - -// Construct C++ operators from pygates -QuantumOperator Circuit::from_pyops(py::object const &obj){ - string name; - vector positions; - vector qbits; - vector cbits; - vector paras; - uint control_num = 0; - RowMatrixXcd mat; - - name = obj.attr("name").attr("lower")().cast(); - if (!(name == "barrier" || name == "delay" || name == "id" || name == "measure" || name == "reset" || name == "cif")) - { - if (py::isinstance(obj.attr("pos"))){ - positions = obj.attr("pos").cast>(); - } - else if(py::isinstance(obj.attr("pos"))){ - positions = vector{obj.attr("pos").cast()}; - } - - if (py::isinstance(obj.attr("paras"))){ - paras = obj.attr("paras").cast>(); - } - else if(py::isinstance(obj.attr("paras")) || py::isinstance(obj.attr("paras"))){ - paras = vector{obj.attr("paras").cast()}; - } - - if (py::hasattr(obj, "ctrls")){ - control_num = py::len(obj.attr("ctrls")); - } - - //Reverse order for multi-target gate - if (py::hasattr(obj, "_targ_matrix")){ - mat = obj.attr("get_targ_matrix")("reverse_order"_a=true).cast(); - } - else{ //Single target gate - mat = obj.attr("matrix").cast(); - } - return QuantumOperator(name, paras, positions, control_num, mat); - - }else if(name == "measure"){ - if (py::isinstance(obj.attr("qbits"))){ - qbits = obj.attr("qbits").cast>(); - }else if(py::isinstance(obj.attr("qbits"))){ - qbits = vector{obj.attr("qbits").cast()}; - } - - if (py::isinstance(obj.attr("cbits"))){ - cbits = obj.attr("cbits").cast>(); - }else if(py::isinstance(obj.attr("cbits"))){ - cbits = vector{obj.attr("cbits").cast()}; - } - //record qbit-cbit measure map - for(uint i = 0; i < qbits.size(); i++){ - measure_vec_.push_back(std::make_pair(qbits[i], cbits[i])); - } - return QuantumOperator(name, qbits, cbits); - - }else if(name == "reset"){ - if (py::isinstance(obj.attr("pos"))){ - positions = obj.attr("pos").cast>(); - } - else if(py::isinstance(obj.attr("pos"))){ - positions = vector{obj.attr("pos").cast()}; - } - return QuantumOperator(name, positions); - - }else if(name == "cif"){ - uint condition = 0; - vector instructions; - if (py::isinstance(obj.attr("cbits"))){ - cbits = obj.attr("cbits").cast>(); - }else if(py::isinstance(obj.attr("cbits"))){ - cbits = vector{obj.attr("cbits").cast()}; - } - - if(py::isinstance(obj.attr("condition"))){ - condition = obj.attr("condition").cast(); - } - - // Recursively handdle instruction - if (py::isinstance(obj.attr("instructions"))){ - auto pyops = obj.attr("instructions"); - for(auto pyop_h : pyops){ - py::object pyop = py::reinterpret_borrow(pyop_h); - QuantumOperator op = from_pyops(pyop); - if (op){ - if (op.targe_num() > max_targe_num_) - max_targe_num_ = op.targe_num(); - instructions.push_back(std::move(op)); - } - } - } - return QuantumOperator(name, cbits, condition, instructions); - - }else{ - return QuantumOperator(); - } - -} - -Circuit::Circuit(py::object const&pycircuit) -: -max_targe_num_(0) -{ - // auto pygates = pycircuit.attr("gates"); - auto pyops = pycircuit.attr("instructions"); - auto used_qubits = pycircuit.attr("used_qubits").cast>(); - cbit_num_ = pycircuit.attr("cbits_num").cast(); - qubit_num_ = *std::max_element(used_qubits.begin(), used_qubits.end())+1; - // judge wheather op qubit after measure - bool measured = false; - for (auto pyop_h : pyops){ - py::object pyop = py::reinterpret_borrow(pyop_h); - QuantumOperator op = from_pyops(pyop); - if (op){ - if (op.targe_num() > max_targe_num_) - max_targe_num_ = op.targe_num(); - if(op.name() == "measure") {measured = true;} - else if(measured == true) {final_measure_ = false; } - instructions_.push_back(std::move(op)); - } - } -} - -void Circuit::compress_instructions(){} diff --git a/src/qfvm/operators.hpp b/src/qfvm/operators.hpp deleted file mode 100644 index cb9fd66..0000000 --- a/src/qfvm/operators.hpp +++ /dev/null @@ -1,97 +0,0 @@ -#pragma once - -#include -#include "statevector.hpp" - -class QuantumOperator{ - protected: - string name_; - vector positions_; - vector paras_; - uint control_num_; - uint targe_num_; - bool diag_; - bool real_; - RowMatrixXcd mat_; - vector qbits_; - vector cbits_; - vector instructions_; - uint condition_; - public: - //Constructor - QuantumOperator(); - QuantumOperator(string name, vector const &qbits); - QuantumOperator(string name, vector const &qbits, vector const &cbits); - QuantumOperator(string name, vector const &cbits, const uint condition, vector const &ins); - QuantumOperator(string name, vector paras, vector const &control_qubits, vector const &targe_qubits, RowMatrixXcd const &mat, bool diag=false, bool real=false); - QuantumOperator(string name,vector paras, vector const &positions, uint control_num, RowMatrixXcd const &mat, bool diag=false, bool real=false); - - //data accessor - string name() const {return name_;} - vector paras() const {return paras_;} - bool has_control() const{return control_num_ == 0 ? false : true;} - bool is_real() const{ return real_; } - bool is_diag() const{ return diag_; } - RowMatrixXcd mat() const { return mat_;} - uint control_num() const { return control_num_; } - uint targe_num() const { return targe_num_; } - uint condition() const { return condition_; } - vector positions(){ return positions_; } - explicit operator bool() const { - return !(name_ == "empty"); - } - vector qbits(){ return qbits_; } - vector cbits(){ return cbits_; } - vector instructions(){ return instructions_; } - //Apply method - virtual void apply_to_state(StateVector & state){ }; -}; - - -QuantumOperator::QuantumOperator() : name_("empty"){ }; - -QuantumOperator::QuantumOperator(string name, vector const &qbits) -: -name_(name), -targe_num_(0), -qbits_(qbits){} - -QuantumOperator::QuantumOperator(string name, vector const &qbits, vector const &cbits) -: -name_(name), -targe_num_(0), -qbits_(qbits), -cbits_(cbits){} - -QuantumOperator::QuantumOperator(string name, vector const &cbits, const uint condition, vector const &ins) -: -name_(name), -targe_num_(0), -cbits_(cbits), -instructions_(ins), -condition_(condition){} - -QuantumOperator::QuantumOperator(string name, vector paras, vector const &positions, uint control_num, RowMatrixXcd const &mat, bool diag, bool real) -: -name_(name), -paras_(paras), -positions_(positions), -control_num_(control_num), -targe_num_(positions.size()-control_num), -diag_(diag), -real_(real), -mat_(mat){ } - -QuantumOperator::QuantumOperator(string name, vector paras, vector const &control_qubits, vector const &targe_qubits, RowMatrixXcd const &mat, bool diag, bool real) -: -name_(name), -paras_(paras), -diag_(diag), -real_(real), -mat_(mat){ - positions_ = control_qubits; - positions_.insert(positions_.end(), targe_qubits.begin(), targe_qubits.end()); - control_num_ = control_qubits.size(); - targe_num_ = targe_qubits.size(); -} - diff --git a/src/qfvm/qasm.hpp b/src/qfvm/qasm.hpp deleted file mode 100644 index e0a2285..0000000 --- a/src/qfvm/qasm.hpp +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once - -#include -#include "types.hpp" -#include "util.h" - -using Qfutil::split_string; -using Qfutil::find_numbers; -#define Pair(name) {#name, Opname::name} - -enum class Opname{ - creg, x, y, z, h, s, sdg, t, tdg, p, rx, ry, rz, cnot, cx, cz, crx, cp, ccx, toffoli, swap, iswap, rxx, ryy, rzz, measure, reset, cif -}; - -std::unordered_map OPMAP{Pair(creg), Pair(x), Pair(y), Pair(z), Pair(h), Pair(s), Pair(sdg), Pair(t), - Pair(tdg), Pair(p), Pair(rx), Pair(ry), Pair(rz), Pair(cnot), Pair(cx), Pair(cz), - Pair(crx), Pair(cp), Pair(ccx), Pair(swap), Pair(iswap), Pair(rxx), Pair(ryy), - Pair(rzz), Pair(measure), Pair(reset), Pair(cif)}; - -struct Operation{ - string name; - vector positions; - vector params; - - void print_info(){ - std::cout << "name " << name << std::endl; - std::cout << "positions: "; - for (auto pos : positions){ - std::cout << pos << " "; - } - std::cout << std::endl; - - if (params.size() > 0){ - printf("parameters: "); - for (auto para : params){ - printf("%.6f ", para); - } - } - printf("\n"); - printf("-----\n"); - } -} ; diff --git a/src/qfvm/qfvm.cpp b/src/qfvm/qfvm.cpp deleted file mode 100644 index f318da6..0000000 --- a/src/qfvm/qfvm.cpp +++ /dev/null @@ -1,164 +0,0 @@ -#include -#include -#include "simulator.hpp" -#include -#include -#ifdef _USE_GPU -#include -#endif - -#ifdef _USE_CUQUANTUM -#include -#endif - -namespace py = pybind11; - -template -py::array_t to_numpy(const std::tuple &src) { - auto src_ptr = std::get<0>(src); - auto src_size = std::get<1>(src); - - auto capsule = py::capsule(src_ptr, [](void* p) { - delete [] reinterpret_cast(p); - }); - return py::array_t( - src_size, - src_ptr, - capsule - ); -} - -std::pair, py::array_t> > simulate_circuit(py::object const&pycircuit, py::array_t> &np_inputstate, const int &shots){ - auto circuit = Circuit(pycircuit); - py::buffer_info buf = np_inputstate.request(); - auto* data_ptr = reinterpret_cast*>(buf.ptr); - size_t data_size = buf.size; - // If measure all at the end, simulate once - uint actual_shots = shots; - if (circuit.final_measure()) actual_shots = 1; - StateVector global_state; - vector>measures = circuit.measure_vec(); - std::mapcbit_measured; - for(auto &pair: measures){ - cbit_measured[pair.second] = true; - } - // Store outcome's count - std::map outcount; - for(uint i =0; i < actual_shots; i++){ - StateVector state; - if(data_size == 0){ - simulate(circuit, state); - }else{ - //deepcopy state - vector> data_copy(data_ptr, data_ptr + data_size); - state = std::move(StateVector(data_copy.data(), data_copy.size())); - simulate(circuit, state); - } - if(!circuit.final_measure()){ - // store reg - vector tmpcreg = state.creg(); - uint outcome = 0; - for(uint j=0;j tmpcount(global_state.size(), 0); - vector probs = global_state.probabilities(); - std::random_device rd; - std::mt19937 global_rng(rd()); - for(uint i = 0; i < shots; i++){ - uint outcome = std::discrete_distribution(probs.begin(), probs.end())(global_rng); - tmpcount[outcome]++; - } - // map to reg - for(uint i = 0; i < global_state.size(); i++){ - if(tmpcount[i] == 0) continue; - vector tmpcreg(global_state.cbit_num(), 0); - vector tmpout = int2vec(i, 2); - if(tmpout.size() < global_state.num()) - tmpout.resize(global_state.num()); - for(auto &pair: measures){ - tmpcreg[pair.second] = tmpout[pair.first]; - } - uint outcome = 0; - for(uint j=0;j> &np_inputstate){ - auto circuit = Circuit(pycircuit); - py::buffer_info buf = np_inputstate.request(); - auto* data_ptr = reinterpret_cast*>(buf.ptr); - size_t data_size = buf.size; - - - if (data_size == 0){ - StateVector state; - simulate_gpu(circuit, state); - return to_numpy(state.move_data_to_python()); - } - else{ - StateVector state(data_ptr, buf.size); - simulate_gpu(circuit, state); - state.move_data_to_python(); - return np_inputstate; - } -} -#endif - -#ifdef _USE_CUQUANTUM -py::object simulate_circuit_custate(py::object const&pycircuit, py::array_t> &np_inputstate){ - auto circuit = Circuit(pycircuit); - py::buffer_info buf = np_inputstate.request(); - auto* data_ptr = reinterpret_cast*>(buf.ptr); - size_t data_size = buf.size; - - - if (data_size == 0){ - StateVector state; - simulate_custate(circuit, state); - return to_numpy(state.move_data_to_python()); - } - else{ - StateVector state(data_ptr, buf.size); - simulate_custate(circuit, state); - state.move_data_to_python(); - return np_inputstate; - } -} -#endif - - - -PYBIND11_MODULE(qfvm, m) { - m.doc() = "Qfvm simulator"; - m.def("simulate_circuit", &simulate_circuit, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t>(0), py::arg("shots")); - - #ifdef _USE_GPU - m.def("simulate_circuit_gpu", &simulate_circuit_gpu, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t>(0)); - #endif - - #ifdef _USE_CUQUANTUM - m.def("simulate_circuit_custate", &simulate_circuit_custate, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t>(0)); - #endif -} - diff --git a/src/qfvm/simulator.hpp b/src/qfvm/simulator.hpp deleted file mode 100644 index 1499922..0000000 --- a/src/qfvm/simulator.hpp +++ /dev/null @@ -1,116 +0,0 @@ -#pragma once - -#include "statevector.hpp" -#include "circuit.hpp" - -void apply_op(QuantumOperator &op, StateVector &state){ - bool matched = false; - switch (OPMAP[op.name()]){ - //Named gate - case Opname::x: - state.apply_x(op.positions()[0]); - break; - case Opname::y: - state.apply_y(op.positions()[0]); - break; - case Opname::z: - state.apply_z(op.positions()[0]); - break; - case Opname::h: - state.apply_h(op.positions()[0]); - break; - case Opname::s: - state.apply_s(op.positions()[0]); - break; - case Opname::sdg: - state.apply_sdag(op.positions()[0]); - break; - case Opname::t: - state.apply_t(op.positions()[0]); - break; - case Opname::tdg: - state.apply_tdag(op.positions()[0]); - break; - case Opname::p: - state.apply_p(op.positions()[0], op.paras()[0]); - break; - case Opname::rx: - state.apply_rx(op.positions()[0], op.paras()[0]); - break; - case Opname::ry: - state.apply_ry(op.positions()[0], op.paras()[0]); - break; - case Opname::rz: - state.apply_rz(op.positions()[0], op.paras()[0]); - break; - case Opname::cx: - state.apply_cnot(op.positions()[0], op.positions()[1]); - break; - case Opname::cnot: - state.apply_cnot(op.positions()[0], op.positions()[1]); - break; - case Opname::cp: - state.apply_cp(op.positions()[0], op.positions()[1], op.paras()[0]); - break; - case Opname::cz: - state.apply_cz(op.positions()[0], op.positions()[1]); - break; - case Opname::ccx: - state.apply_ccx(op.positions()[0], op.positions()[1], op.positions()[2]); - break; - case Opname::toffoli: - state.apply_ccx(op.positions()[0], op.positions()[1], op.positions()[2]); - case Opname::rzz: - state.apply_cnot(op.positions()[0], op.positions()[1]); - state.apply_rz(op.positions()[1], op.paras()[0]); - state.apply_cnot(op.positions()[0], op.positions()[1]); - break; - case Opname::measure: - state.apply_measure(op.qbits(), op.cbits()); - break; - case Opname::reset: - state.apply_reset(op.qbits()); - break; - case Opname::cif: - // check cbits and condition - matched = state.check_cif(op.cbits(), op.condition()); - // apply op in instructions - if(matched){ - for(auto op_h :op.instructions()){ - apply_op(op_h, state); - } - } - break; - //Other general gate - default: - { - if (op.targe_num() == 1){ - auto mat_temp = op.mat(); - complex *mat = mat_temp.data(); - if (op.control_num() == 0){ - state.apply_one_targe_gate_general<0>(op.positions(), mat); - }else if (op.control_num() == 1){ - state.apply_one_targe_gate_general<1>(op.positions(), mat); - }else{ - state.apply_one_targe_gate_general<2>(op.positions(), mat); - } - }else if(op.targe_num() > 1){ - state.apply_multi_targe_gate_general(op.positions(), op.control_num(), op.mat()); - }else{ - throw "Invalid target number"; - } - } - } -} - -void simulate(Circuit const& circuit, StateVector & state){ - state.set_num(circuit.qubit_num()); - state.set_creg(circuit.cbit_num()); - // skip measure and handle it in qfvm.cpp - bool skip_measure = circuit.final_measure(); - for (auto op : circuit.instructions()){ - if(skip_measure == true && op.name() == "measure") continue; - apply_op(op , state); - } -} - diff --git a/src/qfvm/statevector.hpp b/src/qfvm/statevector.hpp deleted file mode 100644 index 99cf5f7..0000000 --- a/src/qfvm/statevector.hpp +++ /dev/null @@ -1,1355 +0,0 @@ -#pragma once -#include "types.hpp" -#include "util.h" -#include -#include -#include -#include -#include -#include -#include -#ifdef USE_SIMD -#ifdef _MSC_VER -#include -#else -#include -#endif -#endif - -template -class StateVector{ - private: - uint num_; - // classical bit - uint cbit_num_; - vector creg_; - size_t size_; - std::unique_ptr[]> data_; - //random engine - std::mt19937_64 rng_; - - public: - //construct function - StateVector(); - explicit StateVector(uint num); - explicit StateVector(complex *data, size_t data_size); - //move assign - // StateVector& operator=(StateVector&& other){ - // if(this != &other){ - // data_ = std::move(other.data_); - // creg_ = std::move(other.creg_); - // num_ = other.num_; - // cbit_num_ = other.cbit_num_; - // size_ = other.size_; - - // } - // return *this; - // } - - //Named gate function - void apply_x(pos_t pos); - void apply_y(pos_t pos); - void apply_z(pos_t pos); - void apply_h(pos_t pos); - void apply_s(pos_t pos); - void apply_sdag(pos_t pos); - void apply_t(pos_t pos); - void apply_tdag(pos_t pos); - void apply_p(pos_t pos, real_t phase); - void apply_rx(pos_t pos, real_t theta); - void apply_ry(pos_t pos, real_t theta); - void apply_rz(pos_t pos, real_t theta); - void apply_cnot(pos_t control, pos_t targe); - void apply_cz(pos_t control, pos_t targe); - void apply_cp(pos_t control, pos_t targe, real_t phase); - void apply_crx(pos_t control, pos_t targe, real_t theta); - void apply_cry(pos_t control, pos_t targe, real_t theta); - void apply_ccx(pos_t control1, pos_t control2, pos_t targe); - void apply_swap(pos_t q1, pos_t q2); - - //General implementation - //One-target gate, ctrl_num equal 2 represent multi-controlled gate - template - void apply_one_targe_gate_general(vector const& posv, complex *mat); - template - void apply_one_targe_gate_diag(vector const& posv, complex *mat); - template - void apply_one_targe_gate_real(vector const& posv, complex *mat); - template - void apply_one_targe_gate_x(vector const& posv); - - //Multiple-target gate - void apply_multi_targe_gate_general(vector const& posv, uint control_num, RowMatrixXcd const&mat); - - // Measure and Reset - std::pair sample_measure_probs(vector const& qbits); - vector probabilities() const; - void apply_diagonal_matrix(vector const& qbits, vector > const& mdiag); - void update(vector const& qbits, const uint final_state, const uint meas_state, const double meas_prob); - void apply_measure(vector const& qbits,const vector &cbits); - void apply_reset(vector const& qbits); - - // cif check - bool check_cif(const vector &cbits, const uint condition); - - complex operator[] (size_t j) const ; - void set_num(uint num); - void set_creg(uint num){ - if(num > 0){ - cbit_num_ = num; - creg_.resize(cbit_num_, 0); - }else{ - throw std::logic_error("The number of cbit must be positive."); - } - } - - vector creg(){ - return creg_; - } - - void set_rng(){ - std::random_device rd; - rng_.seed(rd()); - } - - void print_state(); - std::tuple*, size_t> move_data_to_python() { - auto data_ptr = data_.release(); - return std::make_tuple(std::move(data_ptr), size_); - } - - complex* data(){ return data_.get(); } - size_t size(){ return size_; } - uint num(){ return num_; } - uint cbit_num(){ return cbit_num_; } -}; - - -//////// constructors /////// - -template -StateVector::StateVector(uint num) -: num_(num), -size_(1ULL<[]>(size_); - data_[0] = complex(1., 0); -}; - -template -StateVector::StateVector() : StateVector(0){ } - -template -StateVector::StateVector(complex *data, size_t data_size) -: -data_(data), -size_(data_size) -{ - num_ = static_cast(std::log2(size_)); -} - - - -//// useful functions ///// -template -std::complex StateVector::operator[] (size_t j) const{ - return data_[j]; -} - -template -void StateVector::set_num(uint num){ - if (num_ > 0) { - // Initialized from statevector, - // should not resize - return; - } - num_ = num; - - if (size_ != 1ULL << num) { - data_.reset(); - size_ = 1ULL << num; - data_ = std::make_unique[]>(size_); - data_[0] = complex(1, 0); - } -} -template -bool StateVector::check_cif(const vector &cbits, const uint condition){ - uint out = 0; - for(uint i = 0; i < cbits.size(); i++){ - out *= 2; - out += creg_[cbits[i]]; - } - return out == condition; -} - -template -void StateVector::print_state(){ - std::cout << "state_data: "; - for (auto i=0;i -void StateVector::apply_x(pos_t pos){ - const size_t offset = 1<>1; - if (pos == 0){ //single step -#ifdef USE_SIMD -#pragma omp parallel for - for(omp_i j = 0;j < size_;j+=2){ - double* ptr = (double*)(data_.get() + j); - __m256d data = _mm256_loadu_pd(ptr); - data = _mm256_permute4x64_pd(data, 78); - _mm256_storeu_pd(ptr, data); - } -#else -#pragma omp parallel for - for(omp_i j = 0;j < size_;j+=2){ - std::swap(data_[j], data_[j+1]); - } -#endif - } - else{ -#ifdef USE_SIMD -#pragma omp parallel for - for(omp_i j = 0;j < rsize;j += 2){ - size_t i = (j&(offset-1)) | (j>>pos<>pos< -void StateVector::apply_y(pos_t pos){ - const size_t offset = 1<>1; - const complex im = imag_I; - if (pos == 0){ //single step -#ifdef USE_SIMD - __m256d minus_half = _mm256_set_pd(1, -1, -1, 1); -#pragma omp parallel for - for(omp_i j = 0;j < size_;j+=2){ - double* ptr = (double*)(data_.get() + j); - __m256d data = _mm256_loadu_pd(ptr); - data = _mm256_permute4x64_pd(data, 27); - data = _mm256_mul_pd(data, minus_half); - _mm256_storeu_pd(ptr, data); - } -#else -#pragma omp parallel for - for(omp_i j = 0;j < size_;j+=2){ - complex temp = data_[j]; - data_[j] = -im*data_[j+1]; - data_[j+1] = im*temp; - } -#endif - } - else{ -#ifdef USE_SIMD - __m256d minus_even = _mm256_set_pd(1, -1, 1, -1); - __m256d minus_odd = _mm256_set_pd(-1, 1, -1, 1); - -#pragma omp parallel for - for(omp_i j = 0;j < rsize;j += 2){ - size_t i = (j&(offset-1)) | (j>>pos<>pos< temp = data_[i]; - data_[i] = -im*data_[i+offset]; - data_[i+offset] = im*temp; - complex temp1 = data_[i1]; - data_[i1] = -im*data_[i1+offset]; - data_[i1+offset] = im*temp1; - } -#endif - } -} - -template -void StateVector::apply_z(pos_t pos){ - const size_t offset = 1<>1; - if (pos == 0){ //single step -#pragma omp parallel for - for(omp_i j = 1;j < size_;j+=2){ - data_[j] *= -1; - } - } - else{ -#ifdef USE_SIMD - __m256d minus_one = _mm256_set_pd(-1, -1, -1, -1); -#pragma omp parallel for - for(omp_i j = 0;j < rsize;j += 2){ - size_t i = (j&(offset-1)) | (j>>pos<>pos< -void StateVector::apply_h(pos_t pos){ - const double sqrt2inv = 1. / std::sqrt(2.); - complex mat[4] = {sqrt2inv, sqrt2inv, sqrt2inv, -sqrt2inv}; - apply_one_targe_gate_real<0>(vector{pos}, mat); - -} - -template -void StateVector::apply_s(pos_t pos){ - complex mat[2] = {1., imag_I}; - apply_one_targe_gate_diag<0>(vector{pos}, mat); -} - -template -void StateVector::apply_sdag(pos_t pos){ - complex mat[2] = {1., -imag_I}; - apply_one_targe_gate_diag<0>(vector{pos}, mat); - -} - -template -void StateVector::apply_t(pos_t pos){ - complex p = imag_I*PI/4.; - complex mat[2] = {1., std::exp(p)}; - apply_one_targe_gate_diag<0>(vector{pos}, mat); - -} - -template -void StateVector::apply_tdag(pos_t pos){ - complex p = -imag_I*PI/4.; - complex mat[2] = {1., std::exp(p)}; - apply_one_targe_gate_diag<0>(vector{pos}, mat); - -} - -template -void StateVector::apply_p(pos_t pos, real_t phase){ - complex p = imag_I*phase; - complex mat[2] = {1., std::exp(p)}; - apply_one_targe_gate_diag<0>(vector{pos}, mat); -} - - -template -void StateVector::apply_rx(pos_t pos, real_t theta){ - complex mat[4] = {std::cos(theta/2), -imag_I*std::sin(theta/2), -imag_I*std::sin(theta/2), std::cos(theta/2)}; - apply_one_targe_gate_general<0>(vector{pos}, mat); -} - - -template -void StateVector::apply_ry(pos_t pos, real_t theta){ - complex mat[4] = {std::cos(theta/2), -std::sin(theta/2),std::sin(theta/2), std::cos(theta/2)}; - apply_one_targe_gate_real<0>(vector{pos}, mat); -} - -template -void StateVector::apply_rz(pos_t pos, real_t theta){ - complex z0 = -imag_I*theta/2.; - complex z1 = imag_I*theta/2.; - complex mat[2] = {std::exp(z0), std::exp(z1)}; - apply_one_targe_gate_diag<0>(vector{pos}, mat); -} - -template -void StateVector::apply_cnot(pos_t control, pos_t targe){ - apply_one_targe_gate_x<1>(vector{control, targe}); -} - -template -void StateVector::apply_cz(pos_t control, pos_t targe){ - complex mat[2] = {1., -1.}; - apply_one_targe_gate_diag<1>(vector{control, targe}, mat); -} - -template -void StateVector::apply_cp(pos_t control, pos_t targe, real_t phase){ - complex p = imag_I*phase; - complex mat[2] = {1., std::exp(p)}; - apply_one_targe_gate_diag<1>(vector{control, targe}, mat); -} - -template -void StateVector::apply_crx(pos_t control, pos_t targe, real_t theta){ - complex mat[4] = {std::cos(theta/2), -imag_I*std::sin(theta/2), -imag_I*std::sin(theta/2), std::cos(theta/2)}; - - apply_one_targe_gate_general<1>(vector{control, targe}, mat); -} - -template -void StateVector::apply_cry(pos_t control, pos_t targe, real_t theta){ - complex mat[4] = {std::cos(theta/2), -std::sin(theta/2),std::sin(theta/2), std::cos(theta/2)}; - - apply_one_targe_gate_real<1>(vector{control, targe}, mat); -} - -template -void StateVector::apply_ccx(pos_t control1, pos_t control2, pos_t targe){ - apply_one_targe_gate_x<2>(vector{control1, control2, targe}); -} - -/////// General implementation ///////// - -template -template -void StateVector::apply_one_targe_gate_general(vector const& posv, complex *mat) -{ - std::function getind_func_near; - std::function getind_func; - size_t rsize; - size_t offset; - size_t targe; - size_t control = 0; - size_t setbit; - size_t poffset; - bool has_control=false; - vector posv_sorted = posv; - if (ctrl_num == 0){ - targe = posv[0]; - offset = 1ll<>1; - getind_func_near = [&](size_t j)-> size_t { - return 2*j; - }; - - getind_func = [&](size_t j)-> size_t { - return (j&(offset-1)) | (j>>targe<targe) { - control--; - } - poffset=1ll<>2; - getind_func = [&](size_t j) -> size_t { - size_t i = (j>>control<<(control+1))|(j&(poffset-1)); - i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit; - return i; - }; - - getind_func_near = getind_func; - - - } - else if(ctrl_num == 2){ - has_control = true; - control = *min_element(posv.begin(), posv.end()-1); - targe = *(posv.end()-1); - offset = 1ll<>posv.size(); - getind_func = [&](size_t j)-> size_t{ - size_t i = j; - for (size_t k=0;k < posv.size();k++) - { - size_t _pos = posv_sorted[k]; - i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1); - } - for (size_t k=0;k < posv.size()-1;k++){ - i |= 1ll< mat00 = mat[0]; - const complex mat01 = mat[1]; - const complex mat10 = mat[2]; - const complex mat11 = mat[3]; - if (targe == 0){ -#pragma omp parallel for - for(omp_i j = 0;j < rsize;j++){ - size_t i = getind_func_near(j); - complex temp = data_[i]; - data_[i] = mat00*data_[i] + mat01*data_[i+1]; - data_[i+1] = mat10*temp + mat11*data_[i+1]; - } - }else if (has_control && control == 0){ //single step -#pragma omp parallel for - for(omp_i j = 0;j < rsize;j++){ - size_t i = getind_func(j); - complex temp = data_[i]; - data_[i] = mat00*data_[i] + mat01*data_[i+offset]; - data_[i+offset] = mat10*temp + mat11*data_[i+offset]; - } - - }else{//unroll to 2 -#ifdef USE_SIMD - __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(), mat[0].real()); - __m256d m_00im = _mm256_set_pd(mat[0].imag(), -mat[0].imag(), mat[0].imag(), -mat[0].imag()); - __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(), mat[1].real(), mat[1].real()); - __m256d m_01im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(), mat[1].imag(), -mat[1].imag()); - - __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(), mat[2].real()); - __m256d m_10im = _mm256_set_pd(mat[2].imag(), -mat[2].imag(),mat[2].imag(), -mat[2].imag()); - __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(), mat[3].real()); - __m256d m_11im = _mm256_set_pd(mat[3].imag(), -mat[3].imag(), mat[3].imag(), -mat[3].imag()); -#pragma omp parallel for - for(omp_i j = 0;j < rsize; j+= 2){ - size_t i = getind_func(j); - - double* p0 = (double*)(data_.get()+i); - double* p1 = (double*)(data_.get()+i+offset); - //load data - __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0 - __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1 - __m256d data0_p = _mm256_permute_pd(data0, 5); - __m256d data1_p = _mm256_permute_pd(data1, 5); - - //row0 - __m256d temp00re = _mm256_mul_pd(m_00re, data0); - __m256d temp00im = _mm256_mul_pd(m_00im, data0_p); - __m256d temp00 = _mm256_add_pd(temp00re, temp00im); - __m256d temp01re = _mm256_mul_pd(m_01re, data1); - __m256d temp01im = _mm256_mul_pd(m_01im, data1_p); - __m256d temp01 = _mm256_add_pd(temp01re, temp01im); - __m256d temp0 = _mm256_add_pd(temp00, temp01); - - //row1 - __m256d temp10re = _mm256_mul_pd(m_10re, data0); - __m256d temp10im = _mm256_mul_pd(m_10im, data0_p); - __m256d temp10 = _mm256_add_pd(temp10re, temp10im); - __m256d temp11re = _mm256_mul_pd(m_11re, data1); - __m256d temp11im = _mm256_mul_pd(m_11im, data1_p); - __m256d temp11 = _mm256_add_pd(temp11re, temp11im); - __m256d temp1 = _mm256_add_pd(temp10, temp11); - - _mm256_storeu_pd(p0, temp0); - _mm256_storeu_pd(p1, temp1); - } -#else -#pragma omp parallel for - for(omp_i j = 0;j < rsize;j += 2){ - size_t i = getind_func(j); - size_t i1 = i+1; - complex temp = data_[i]; - complex temp1 = data_[i1]; - data_[i] = mat00*data_[i] + mat01*data_[i+offset]; - data_[i+offset] = mat10*temp + mat11*data_[i+offset]; - data_[i1] = mat00*data_[i1] + mat01*data_[i1+offset]; - data_[i1+offset] = mat10*temp1 + mat11*data_[i1+offset]; - } -#endif - } -} - - -template -template -void StateVector::apply_one_targe_gate_x(vector const& posv) -{ - std::function getind_func_near; - std::function getind_func; - size_t rsize; - size_t offset; - size_t targe; - size_t control; - size_t setbit; - size_t poffset; - vector posv_sorted = posv; - bool has_control=false; - if (ctrl_num == 0){ - targe = posv[0]; - offset = 1ll<>1; - getind_func_near = [&](size_t j)-> size_t { - return 2*j; - }; - - getind_func = [&](size_t j)-> size_t { - return (j&(offset-1)) | (j>>targe<targe) { - control--; - } - poffset=1ll<>2; - getind_func = [&](size_t j) -> size_t { - size_t i = (j>>control<<(control+1))|(j&(poffset-1)); - i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit; - return i; - }; - getind_func_near = getind_func; - } - else if(ctrl_num == 2){ - has_control = true; - control = *min_element(posv.begin(), posv.end()-1); - targe = *(posv.end()-1); - offset = 1ll<>posv.size(); - - getind_func = [&](size_t j) -> size_t{ - size_t i = j; - for (size_t k=0;k < posv.size();k++){ - size_t _pos = posv_sorted[k]; - i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1); - } - for (size_t k=0;k < posv.size()-1;k++){ - i |= 1ll< -template -void StateVector::apply_one_targe_gate_real(vector const& posv, complex *mat) -{ - std::function getind_func_near; - std::function getind_func; - size_t rsize; - size_t offset; - size_t targe; - size_t control = 0; - size_t setbit; - size_t poffset; - bool has_control=false; - vector posv_sorted = posv; - if (ctrl_num == 0){ - targe = posv[0]; - offset = 1ll<>1; - getind_func_near = [&](size_t j)-> size_t { - return 2*j; - }; - - getind_func = [&](size_t j)-> size_t { - return (j&(offset-1)) | (j>>targe<targe) { - control--; - } - poffset=1ll<>2; - getind_func = [&](size_t j) -> size_t { - size_t i = (j>>control<<(control+1))|(j&(poffset-1)); - i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit; - return i; - }; - - getind_func_near = getind_func; - } - else if(ctrl_num == 2){ - has_control = true; - control = *min_element(posv.begin(), posv.end()-1); - targe = *(posv.end()-1); - offset = 1ll<>posv.size(); - getind_func = [&](size_t j)-> size_t{ - size_t i = j; - for (size_t k=0;k < posv.size();k++){ - size_t _pos = posv_sorted[k]; - i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1); - } - for (size_t k=0;k < posv.size()-1;k++){ - i |= 1ll< temp = data_[i]; - data_[i] = mat00*data_[i] + mat01*data_[i+1]; - data_[i+1] = mat10*temp + mat11*data_[i+1]; - } - - }else if (has_control && control == 0){ //single step - -#pragma omp parallel for - for(omp_i j = 0;j < rsize;j++){ - size_t i = getind_func(j); - complex temp = data_[i]; - data_[i] = mat00*data_[i] + mat01*data_[i+offset]; - data_[i+offset] = mat10*temp + mat11*data_[i+offset]; - } - }else{//unroll to 2 -#ifdef USE_SIMD - __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(), mat[0].real()); - __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(), mat[1].real(), mat[1].real()); - __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(), mat[2].real()); - __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(), mat[3].real()); -#pragma omp parallel for - for(omp_i j = 0;j < rsize; j+= 2){ - size_t i = getind_func(j); - - double* p0 = (double*)(data_.get()+i); - double* p1 = (double*)(data_.get()+i+offset); - //load data - __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0 - __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1 - __m256d data0_p = _mm256_permute_pd(data0, 5); - __m256d data1_p = _mm256_permute_pd(data1, 5); - - //row0 - __m256d temp00re = _mm256_mul_pd(m_00re, data0); - __m256d temp01re = _mm256_mul_pd(m_01re, data1); - __m256d temp0 = _mm256_add_pd(temp00re, temp01re); - - //row1 - __m256d temp10re = _mm256_mul_pd(m_10re, data0); - __m256d temp11re = _mm256_mul_pd(m_11re, data1); - __m256d temp1 = _mm256_add_pd(temp10re, temp11re); - - _mm256_storeu_pd(p0, temp0); - _mm256_storeu_pd(p1, temp1); - } -#else -#pragma omp parallel for - for(omp_i j = 0;j < rsize;j += 2){ - size_t i = getind_func(j); - size_t i1 = i+1; - complex temp = data_[i]; - complex temp1 = data_[i1]; - data_[i] = mat00*data_[i] + mat01*data_[i+offset]; - data_[i+offset] = mat10*temp + mat11*data_[i+offset]; - data_[i1] = mat00*data_[i1] + mat01*data_[i1+offset]; - data_[i1+offset] = mat10*temp1 + mat11*data_[i1+offset]; - } -#endif - } -} - - -template -template -void StateVector::apply_one_targe_gate_diag(vector const& posv, complex *mat) -{ - std::function getind_func_near; - std::function getind_func; - size_t rsize; - size_t offset; - size_t targe; - size_t control = 0; - size_t setbit; - size_t poffset; - bool has_control=false; - vector posv_sorted = posv; - if (ctrl_num == 0){ - targe = posv[0]; - offset = 1ll<>1; - getind_func_near = [&](size_t j)-> size_t { - return 2*j; - }; - - getind_func = [&](size_t j)-> size_t { - return (j&(offset-1)) | (j>>targe<targe) { - control--; - } - poffset=1ll<>2; - getind_func = [&](size_t j) -> size_t { - size_t i = (j>>control<<(control+1))|(j&(poffset-1)); - i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit; - return i; - }; - - getind_func_near = getind_func; - - } - else if(ctrl_num == 2){ - has_control = true; - control = *min_element(posv.begin(), posv.end()-1); - targe = *(posv.end()-1); - offset = 1ll<>posv.size(); - getind_func = [&](size_t j)-> size_t - { - size_t i = j; - for (size_t k=0;k < posv.size();k++){ - size_t _pos = posv_sorted[k]; - i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1); - } - for (size_t k=0;k < posv.size()-1;k++){ - i |= 1ll< temp = data_[i]; - data_[i] *= mat[0]; - data_[i+offset] *= mat[1]; - } - - }else{//unroll to 2 -#ifdef USE_SIMD - __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(), mat[0].real()); - __m256d m_00im = _mm256_set_pd(mat[0].imag(), -mat[0].imag(), mat[0].imag(), -mat[0].imag()); - __m256d m_11re = _mm256_set_pd(mat[1].real(), mat[1].real(), mat[1].real(), mat[1].real()); - __m256d m_11im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(), mat[1].imag(), -mat[1].imag()); -#pragma omp parallel for - for(omp_i j = 0;j < rsize; j+= 2){ - size_t i = getind_func(j); - - double* p0 = (double*)(data_.get()+i); - double* p1 = (double*)(data_.get()+i+offset); - - //load data - __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0 - __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1 - __m256d data0_p = _mm256_permute_pd(data0, 5); - __m256d data1_p = _mm256_permute_pd(data1, 5); - - //row0 - __m256d temp00re = _mm256_mul_pd(m_00re, data0); - __m256d temp00im = _mm256_mul_pd(m_00im, data0_p); - __m256d temp00 = _mm256_add_pd(temp00re, temp00im); - - //row1 - __m256d temp11re = _mm256_mul_pd(m_11re, data1); - __m256d temp11im = _mm256_mul_pd(m_11im, data1_p); - __m256d temp11 = _mm256_add_pd(temp11re, temp11im); - - _mm256_storeu_pd(p0, temp00); - _mm256_storeu_pd(p1, temp11); - } -#else -#pragma omp parallel for - for(omp_i j = 0;j < rsize;j += 2){ - size_t i = getind_func(j); - size_t i1 = i+1; - data_[i] *= mat[0]; - data_[i+offset] *= mat[1]; - data_[i1] *= mat[0]; - data_[i1+offset] *= mat[1]; - } -#endif - } -} - -template -void StateVector::apply_multi_targe_gate_general(vector const& posv, uint control_num, RowMatrixXcd const& mat) -{ - auto posv_sorted = posv; - auto targs = vector(posv.begin()+control_num, posv.end()); - sort(posv_sorted.begin(), posv_sorted.end()); - size_t rsize = size_ >> posv.size(); - uint targe_num = targs.size(); - size_t matsize= 1<< targe_num; - std::vector targ_mask(matsize); - //create target mask - for (size_t m = 0; m < matsize;m++){ - for (size_t j = 0; j < targe_num; j++){ - if ((m>>j)&1){ - auto mask_pos = targs[j]; - targ_mask[m] |= 1ll<>_pos<<_pos<<1); - } - // Set control - for (size_t k=0; k < control_num;k++){ - i |= 1ll< const& qubits_sorted, const uint k) { - uint lowbits, retval = k; - for (size_t j = 0; j < qubits_sorted.size(); j++) { - lowbits = retval & ((1LL << qubits_sorted[j]) -1); - retval >>= qubits_sorted[j]; - retval <<= qubits_sorted[j] + 1; - retval |= lowbits; - } - return retval; -} - -using indexes_t = vector; -inline indexes_t indexes(vector const& qbits, vector const& qubits_sorted, const uint k) { - const auto N = qubits_sorted.size(); - indexes_t ret(1LL << N, 0); - // Get index0 - ret[0] = index0(qubits_sorted, k); - for (size_t i = 0; i < N; i++) { - const auto n = 1LL << i; - const auto bit = 1ll << qbits[i]; - for (size_t j = 0; j < n; j++) - ret[n + j] = ret[j] | bit; - } - return ret; -} - -template -vector StateVector::probabilities() const { - const int len = 1LL << num_; - vector probs(len, 0.); -#pragma omp parallel for - for (int j = 0; j < len; j++) { - probs[j] = std::real(data_[j] * std::conj(data_[j])); - } - return probs; -} - -vector> convert(const vector> &v){ - vector> ret(v.size(), 0.); - for (size_t i = 0; i< v.size(); ++i) - ret[i] = v[i]; - return ret; -} - -template -void StateVector::apply_diagonal_matrix(vector const&qbits, vector > const&diag){ - // just one qubit - if(qbits.size() == 1){ - const uint qubit = qbits[0]; - vector qbit0{qubit}; - if (diag[0] == 1.0) { // [[1, 0], [0, z]] matrix - if (diag[1] == 1.0) - return; // Identity - if (diag[1] == std::complex(0., -1.)) { // [[1, 0], [0, -i]] - auto func = [&](const indexes_t &inds) -> void { - const auto k = inds[1]; - double cache = data_[k].imag(); - data_[k].imag(data_[k].real() * -1.); - data_[k].real(cache); - }; -#pragma omp parallel for - for (int k = 0; k < (size_ >> 1); k+=1){ - const auto inds = indexes(qbit0, qbit0, k); - func(inds); - } - return; - } - if (diag[1] == std::complex(0., 1.)) { - // [[1, 0], [0, i]] - auto func = [&](const indexes_t &inds) -> void { - const auto k = inds[1]; - double cache = data_[k].imag(); - data_[k].imag(data_[k].real()); - data_[k].real(cache * -1.); - }; -#pragma omp parallel for - for (int k = 0; k < (size_ >> 1); k+=1){ - const auto inds = indexes(qbit0, qbit0, k); - func(inds); - } - return; - } - if (diag[0] == 0.0) { - // [[1, 0], [0, 0]] - auto func = [&](const indexes_t &inds) -> void { - data_[inds[1]] = 0.0; - }; -#pragma omp parallel for - for (int k = 0; k < (size_ >> 1); k+=1){ - const auto inds = indexes(qbit0, qbit0, k); - func(inds); - } - return; - } - // general [[1, 0], [0, z]] - auto func = [&](const indexes_t &inds, const vector > &_mat) -> void { - const auto k = inds[1]; - data_[k] *= _mat[1]; - }; -#pragma omp parallel for - for (int k = 0; k < (size_ >> 1); k+=1){ - const auto inds = indexes(qbit0, qbit0, k); - func(inds, convert(diag)); - } - return; - } else if (diag[1] == 1.0) { - // [[z, 0], [0, 1]] matrix - if (diag[0] == std::complex(0., -1.)) { - // [[-i, 0], [0, 1]] - auto func = [&](const indexes_t &inds) -> void { - const auto k = inds[1]; - double cache = data_[k].imag(); - data_[k].imag(data_[k].real() * -1.); - data_[k].real(cache); - }; -#pragma omp parallel for - for (int k = 0; k < (size_ >> 1); k+=1){ - const auto inds = indexes(qbit0, qbit0, k); - func(inds); - } - return; - } - if (diag[0] == std::complex(0., 1.)) { - // [[i, 0], [0, 1]] - auto func = [&](const indexes_t &inds) -> void { - const auto k = inds[1]; - double cache = data_[k].imag(); - data_[k].imag(data_[k].real()); - data_[k].real(cache * -1.); - }; -#pragma omp parallel for - for (int k = 0; k < (size_ >> 1); k+=1){ - const auto inds = indexes(qbit0, qbit0, k); - func(inds); - } - return; - } - if (diag[0] == 0.0) { - // [[0, 0], [0, 1]] - auto func = [&](const indexes_t &inds) -> void { - data_[inds[0]] = 0.0; - }; -#pragma omp parallel for - for (int k = 0; k < (size_ >> 1); k+=1){ - const auto inds = indexes(qbit0, qbit0, k); - func(inds); - } - return; - } - // general [[z, 0], [0, 1]] - auto func = [&](const indexes_t &inds, const vector > &_mat) -> void { - const auto k = inds[0]; - data_[k] *= _mat[0]; - }; -#pragma omp parallel for - for (int k = 0; k < (size_ >> 1); k+=1){ - const auto inds = indexes(qbit0, qbit0, k); - func(inds, convert(diag)); - } - return; - } else { - // Lambda function for diagonal matrix multiplication - auto func = [&](const indexes_t &inds, const vector > &_mat) -> void { - const auto k0 = inds[0]; - const auto k1 = inds[1]; - data_[k0] *= _mat[0]; - data_[k1] *= _mat[1]; - }; -#pragma omp parallel for - for (int k = 0; k < (size_ >> 1); k+=1){ - const auto inds = indexes(qbit0, qbit0, k); - func(inds, convert(diag)); - } - } - return; - } - const uint N = qbits.size(); - auto func = [&](const indexes_t &inds, const vector> &_diag) -> void { - for(int i=0; i<2; ++i){ - const int k = inds[i]; - int iv = 0; - for(int j=0;j qbit0{qbits[0]}; -#pragma omp parallel for - for (int k = 0; k < (size_ >> 1); k+=1){ - const auto inds = indexes(qbit0, qbit0, k); - func(inds, convert(diag)); - } -} - -template -void StateVector::update(vector const& qbits, const uint final_state, const uint meas_state, const double meas_prob){ - const uint dim = 1ULL << qbits.size(); - vector > matdiag(dim, 0.); - matdiag[meas_state] = 1./ std::sqrt(meas_prob); - apply_diagonal_matrix(qbits, matdiag); - - //TODO: Add reset - // for reset - if(final_state != meas_state){ - if(qbits.size() == 1){ - // apply a x gate - apply_x(qbits[0]); - }else{ - // Diagonal matrix for projecting and renormalizing to measurement outcome - vector > perm(dim*dim, 0.); - perm[final_state * dim + meas_state] = 1.; - perm[meas_state * dim + final_state] = 1.; - for (uint j = 0; j < dim; j++){ - if(j != final_state && j != meas_state) - perm[j * dim + j] = 1.; - } - // apply permutation to swap state - const uint N = qbits.size(); - const uint DIM = 1ULL << N; - auto func = [&](const indexes_t &inds, const vector > &_mat) -> void { - // std::array, 1ULL << N > cache; - vector > cache(1ULL << N, 0.); - for(uint i = 0; i< DIM; i++){ - const auto ii = inds[i]; - cache[i] = data_[ii]; - data_[ii] = 0.; - } - for(uint i=0; i< DIM; i++) - for(uint j=0; j qs(qbits.begin(), qbits.end()); - vector qs_sorted(qs.begin(), qs.end()); - std::sort(qs_sorted.begin(), qs_sorted.end()); - uint END = size_ >> qs.size(); -#pragma omp parallel for - for (int k = 0; k < END; k+=1){ - const auto inds = indexes(qs, qs_sorted, k); - func(inds, convert(perm)); - } - } - } -} - -template -void printVector(const std::vector& vec) { - for (const T& element : vec) { - std::cout << element << " "; - } - std::cout << std::endl; -} - -template -std::pair StateVector::sample_measure_probs(vector const& qbits){ - // 1. caculate actual measurement outcome - const int64_t N = qbits.size(); - const int64_t DIM = 1LL << N; - const int64_t END = 1LL << (num_ - N); - vector probs(DIM, 0.); - vector qubits_sorted(qbits.begin(), qbits.end()); - - std::sort(qubits_sorted.begin(), qubits_sorted.end()); - if ((num_ == N) && ( qubits_sorted == qbits )){ - probs = probabilities(); - }else{ - vector probs_private(DIM, 0.); -#pragma omp parallel for - for (int64_t k = 0; k < END; k++){ - auto idx = indexes(qbits, qubits_sorted, k); - // std::cout<<"indexes"<(probs.begin(), probs.end())(rng_); - return std::make_pair(outcome, probs[outcome]); -} - -// change to bit endian -vector int2vec(uint n, uint base){ - vector ret; - while(n >= base){ - ret.push_back(n%base); - n /= base; - } - ret.push_back(n); - return ret; -} - -template -void StateVector::apply_measure(vector const& qbits, vector const& cbits){ - // 1. caculate actual measurement outcome - const auto meas = sample_measure_probs(qbits); - //2. update statevector - update(qbits, meas.first, meas.first, meas.second); - //3. store measure - vector outcome = int2vec(meas.first, 2); - if(outcome.size() < qbits.size()){ - outcome.resize(qbits.size()); - } - for(uint j=0; j < outcome.size(); j++){ - creg_[cbits[j]] = outcome[j]; - } -} - -template -void StateVector::apply_reset(vector const& qbits){ - const auto meas = sample_measure_probs(qbits); - update(qbits, 0, meas.first, meas.second); -} \ No newline at end of file diff --git a/src/qfvm/types.hpp b/src/qfvm/types.hpp deleted file mode 100644 index 5ece0f7..0000000 --- a/src/qfvm/types.hpp +++ /dev/null @@ -1,33 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -#ifdef _MSC_VER -using omp_i = signed long long; -#else -using omp_i = size_t; -#endif - -#ifdef _MSC_VER -#else -#ifndef __AVX2__ -#undef USE_SIMD -#endif -#endif - - -typedef unsigned int uint; -using pos_t = uint; -using data_t = double; -using std::complex; -using std::vector; -using std::string; -using RowMatrixXcd = Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>; - -const complex imag_I = complex(0, 1.); -const double PI = 3.14159265358979323846; diff --git a/src/qfvm/util.h b/src/qfvm/util.h deleted file mode 100644 index 950b48e..0000000 --- a/src/qfvm/util.h +++ /dev/null @@ -1,99 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -namespace Qfutil{ - -std::vector randomArr(size_t length, size_t max){ - srand((unsigned)time(NULL)); - std::vector arr(length); - for (size_t i=0;i < arr.size();i++){ - arr[i] = rand()%max; - } - return arr; -} - -int randomint(int min, int max){ - srand((unsigned)time(NULL)); - return (rand()%(max - min + 1)) + min; -} - -static uint32_t randomize(uint32_t i){ - i = (i^61)^(i>>16); - i *=9; - i ^= i<<4; - i *= 0x27d4eb2d; - i ^= i >> 15; - return i; -} - -template -void printVector(std::vector const &arr){ - for (auto i : arr){ - std::cout << i << " "; - } - std::cout << std::endl; -} - - -std::vector split_string(const std::string& str, char delim) { - std::vector elems; - auto lastPos = str.find_first_not_of(delim, 0); - auto pos = str.find_first_of(delim, lastPos); - while (pos != std::string::npos || lastPos != std::string::npos) { - elems.push_back(str.substr(lastPos, pos - lastPos)); - lastPos = str.find_first_not_of(delim, pos); - pos = str.find_first_of(delim, lastPos); - } - return elems; -} - -std::vector split_string(const std::string& str, char delim, uint num){ - auto end = str.length(); - std::vector elems; - auto lastPos = str.find_first_not_of(delim, 0); - auto pos = str.find_first_of(delim, lastPos); - while ((pos != std::string::npos || lastPos != std::string::npos) && elems.size() < num) { - elems.push_back(str.substr(lastPos, pos - lastPos)); - lastPos = str.find_first_not_of(delim, pos); - pos = str.find_first_of(delim, lastPos); - } - - if ((pos != std::string::npos || lastPos != std::string::npos)){ - elems.push_back(str.substr(lastPos, end)); - } - return elems; -} - -template -std::vector find_numbers(const std::string &str){ - std::smatch matchs; - std::vector res; - std::regex pattern; - if (std::is_unsigned::value){ - pattern = std::regex("\\d+"); - }else if(std::is_floating_point::value){ - pattern = std::regex("-?(([1-9]\\d*\\.\\d*)|(0\\.\\d*[1-9]\\d*))|\\d+"); - } - - auto begin = std::sregex_iterator(str.begin(), str.end(), pattern); - const std::sregex_iterator end; - - for (std::sregex_iterator i = begin; i != end; ++i) { - std::string match_str = i->str(); - if (std::is_unsigned::value){ - res.push_back(std::stoi(match_str)); - }else if (std::is_floating_point::value){ - res.push_back(std::stod(match_str)); - } - } - - return res; -} - -}//namespace Qfutil \ No newline at end of file diff --git a/src/qfvm_gpu/apply_gate_custate.cuh b/src/qfvm_gpu/apply_gate_custate.cuh deleted file mode 100644 index 2d9c2e7..0000000 --- a/src/qfvm_gpu/apply_gate_custate.cuh +++ /dev/null @@ -1,46 +0,0 @@ - -#pragma once -#include -#include - - -void apply_gate_custate(cuDoubleComplex *psi_d, QuantumOperator &op, int n) -{ - - //get information form op - auto pos = op.positions(); - const int nTargets = op.targe_num(); - const int nControls = op.control_num(); - const int adjoint = 0; - - vector targets{pos.begin()+nControls, pos.end()}; - vector controls{pos.begin(), pos.begin()+nControls}; - - auto mat_temp = op.mat(); - cuDoubleComplex *mat = - reinterpret_cast(mat_temp.data()); - - // custatevec handle initialization - custatevecHandle_t handle; - custatevecCreate(&handle) ; - void* extraWorkspace = nullptr; - size_t extraWorkspaceSizeInBytes = 0; - - // check the size of external workspace - custatevecApplyMatrixGetWorkspaceSize( - handle, CUDA_C_64F, n, mat, CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW, - adjoint, nTargets, nControls, CUSTATEVEC_COMPUTE_64F, &extraWorkspaceSizeInBytes) ; - - // allocate external workspace if necessary - if (extraWorkspaceSizeInBytes > 0) - cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes); - - custatevecApplyMatrix( - handle, psi_d, CUDA_C_64F, n, mat, CUDA_C_64F, - CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, targets.data(), nTargets, controls.data(), nullptr, - nControls, CUSTATEVEC_COMPUTE_64F, extraWorkspace, extraWorkspaceSizeInBytes); - - // destroy handle - custatevecDestroy(handle); -} - \ No newline at end of file diff --git a/src/qfvm_gpu/apply_gate_gpu.cuh b/src/qfvm_gpu/apply_gate_gpu.cuh deleted file mode 100644 index e2d7be0..0000000 --- a/src/qfvm_gpu/apply_gate_gpu.cuh +++ /dev/null @@ -1,411 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include - -struct targeIndex -{ - size_t ind0; - size_t ind1; -}; - - -__constant__ uint posv_d[50]; -__constant__ uint posv_sorted_d[50]; -__constant__ cuDoubleComplex mat_d_const[32*32]; //If target qubit < 5, use const memory; -__constant__ uint mat_mask_d_const[32]; - - -//-------------Single-target gate------------------------------- -template -__global__ void apply_one_targe_gate_kernel(cuDoubleComplex *psi_d, Func get_index, int rsize){ - cuDoubleComplex mat00 = mat_d_const[0]; - cuDoubleComplex mat01 = mat_d_const[1]; - cuDoubleComplex mat10 = mat_d_const[2]; - cuDoubleComplex mat11 = mat_d_const[3]; - - unsigned int gridSize = blockDim.x*gridDim.x; - for (int j = blockDim.x * blockIdx.x + threadIdx.x; j < rsize;j+= gridSize){ - targeIndex ind = get_index(j); - cuDoubleComplex temp = psi_d[ind.ind0]; - psi_d[ind.ind0] = cuCadd(cuCmul(mat00, psi_d[ind.ind0]), cuCmul(mat01, psi_d[ind.ind1])); - psi_d[ind.ind1] = cuCadd(cuCmul(mat10, temp), cuCmul(mat11, psi_d[ind.ind1])); - } -} - -template -void apply_one_targe_gate_gpu(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){ - //copy mat to device - auto mat_temp = op.mat(); - cuDoubleComplex *mat = - reinterpret_cast(mat_temp.data()); - checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, 4*sizeof(cuDoubleComplex))); - size_t rsize; - size_t offset; - size_t targe; - size_t control; - size_t setbit; - size_t poffset; - if (ctrl_num == 0){ - targe = op.positions()[0]; - offset = 1ll<>1; - auto getind_func = [offset, targe] __device__ (size_t j)-> targeIndex { - size_t ind0 = (j&(offset-1)) | (j>>targe<>>(psi_d, getind_func, rsize); - } - else if(ctrl_num == 1){ - control = op.positions()[0]; - targe = op.positions()[1]; - offset = 1ll<targe) { - control--; - } - poffset=1ll<>2; - auto getind_func = [control, targe, poffset, offset, setbit] __device__ (size_t j) -> targeIndex { - size_t ind0 = (j>>control<<(control+1))|(j&(poffset-1)); - ind0 = (ind0 >> targe << (targe+1))|(ind0 &(offset-1))|setbit; - size_t ind1 = ind0 + offset; - return {ind0, ind1}; - }; - - - size_t blockdim = rsize <= 1024 ? rsize : 1024; - size_t griddim = rsize / blockdim; - - apply_one_targe_gate_kernel<<>>(psi_d, getind_func, rsize); - } - else if(ctrl_num == 2){ - targe = op.positions().back(); - offset = 1ll<>psize; - - vector posv_sorted = op.positions(); - std::sort(posv_sorted.begin(), posv_sorted.end()); - //Copy pos to device - checkCudaErrors(cudaMemcpyToSymbol(posv_d, op.positions().data(), psize*sizeof(uint))); - checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint))); - - auto getind_func = [offset, psize] __device__ (size_t j)-> targeIndex{ - size_t ind0 = j; - for (pos_t k=0;k < psize;k++) - { - pos_t _pos = posv_sorted_d[k]; - ind0 = (ind0&((1ll<<_pos)-1)) | (ind0>>_pos<<_pos<<1); - } - for (pos_t k=0;k < psize-1;k++){ - ind0 |= 1ll<>>(psi_d, getind_func, rsize); - } -} - -template -__global__ void apply_2to4_targe_gate_kernel(cuDoubleComplex *psi_d, uint ctrlnum, int psize){ - constexpr uint matlen = 1<>_pos<<_pos<<1); - } - // Set control - for (size_t k=0; k < ctrlnum;k++){ - i |= 1ll< -void apply_2to4_targe_gate_gpu_const(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){ - // uint targe_num = op.targe_num(); - uint matlen = 1<(pos.begin()+op.control_num(), pos.end()); - vector targ_mask(matlen); - //create target mask - for (size_t m = 0; m < matlen;m++){ - for (size_t j = 0; j < targe_num; j++){ - if ((m>>j)&1){ - auto mask_pos = targs[j]; - targ_mask[m] |= 1ll< posv_sorted = op.positions(); - uint psize = pos.size(); - std::sort(posv_sorted.begin(),posv_sorted.end()); - //Copy pos to device - checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint))); - checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint))); - - //copy mat to const memory - auto mat_temp = op.mat(); - cuDoubleComplex *mat = reinterpret_cast(mat_temp.data()); - - checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, matlen*matlen*sizeof(cuDoubleComplex))); - checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(), matlen*sizeof(uint))); - size_t rsize = size>>psize; - - uint max_thread_num = targe_num < 4 ? 1024 : 512; - size_t blockdim = rsize <= max_thread_num ? rsize : max_thread_num; - size_t griddim = rsize / blockdim; - apply_2to4_targe_gate_kernel<<>>(psi_d, op.control_num(), psize); -} - - - -// ------------Large target number gate--------------- - -__global__ void apply_5_targe_gate_kernel_const(cuDoubleComplex *psi_d, uint ctrlnum, int psize, size_t size){ - uint rsize = size>>psize; - uint targnum = psize-ctrlnum; - uint matlen = (1<>_pos<<_pos<<1); - } - // Set control - for (size_t k=0; k < ctrlnum;k++){ - i |= 1ll<>1;offset > 0;offset >>=1){ - v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset); - v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset); - } - __syncthreads(); - if (!idx) psi_d[ i | mat_mask_d_const[idy]] = v; -} - - -void apply_5_targe_gate_gpu_const(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){ - uint targe_num = op.targe_num(); - uint matlen = 1<(pos.begin()+op.control_num(), pos.end()); - vector targ_mask(matlen); - //create target mask - for (size_t m = 0; m < matlen;m++){ - for (size_t j = 0; j < targe_num; j++){ - if ((m>>j)&1){ - auto mask_pos = targs[j]; - targ_mask[m] |= 1ll< posv_sorted = op.positions(); - uint psize = pos.size(); - std::sort(posv_sorted.begin(),posv_sorted.end()); - //Copy pos to device - checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint))); - checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint))); - - //copy mat to const memory - auto mat_temp = op.mat(); - cuDoubleComplex *mat = reinterpret_cast(mat_temp.data()); - - checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, matlen*matlen*sizeof(cuDoubleComplex))); - checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(), matlen*sizeof(uint))); - size_t rsize = size>>psize; - uint thread_num = matlen > 32 ? 32 : matlen; - dim3 blockdim = dim3(thread_num, thread_num); - apply_5_targe_gate_kernel_const<<>>(psi_d, op.control_num(), psize, size); -} - - -//For target number 6-10 -__global__ void apply_multi_targe_gate_kernel_shared(cuDoubleComplex *psi_d, uint ctrlnum, cuDoubleComplex *mat_d, uint *mat_mask_d, int psize, size_t size){ - - uint rsize = size>>psize; - uint targnum = psize-ctrlnum; - uint matlen = (1<>_pos<<_pos<<1); - } - // Set control - for (size_t k=0; k < ctrlnum;k++){ - i |= 1ll<>1;offset > 0;offset >>=1){ - v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset); - v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset); - } - __syncthreads(); - if (!idx) local_sum[y] = cuCadd(local_sum[y], v); - } - } - - for (int y = idy; y < matlen;y+=blockDim.y){ - if (!idx) psi_d[ i | mat_mask_d[y]] = local_sum[y]; - } -} - - -void apply_multi_targe_gate_gpu_shared(cuDoubleComplex *psi_d, QuantumOperator &op, cuDoubleComplex *mat_d, uint *mat_mask_d, size_t size){ - uint targe_num = op.targe_num(); - uint matlen = 1<(pos.begin()+op.control_num(), pos.end()); - vector targ_mask(matlen); - //create target mask - for (size_t m = 0; m < matlen;m++){ - for (size_t j = 0; j < targe_num; j++){ - if ((m>>j)&1){ - auto mask_pos = targs[j]; - targ_mask[m] |= 1ll< posv_sorted = pos; - std::sort(posv_sorted.begin(),posv_sorted.end()); - //Copy pos to device - checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint))); - checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint))); - - - //copy mat to global memory - auto mat_temp = op.mat(); - cuDoubleComplex *mat = reinterpret_cast(mat_temp.data()); - checkCudaErrors(cudaMemcpy(mat_d, mat, matlen*matlen*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(), matlen*sizeof(uint), cudaMemcpyHostToDevice)); - - size_t rsize = size>>psize; - uint thread_num = matlen > 32 ? 32 : matlen; - dim3 blockdim = dim3(thread_num, thread_num); - - apply_multi_targe_gate_kernel_shared<<>>(psi_d, op.control_num(), mat_d, mat_mask_d, psize, size); -} - -//For target number > 10 -__global__ void apply_multi_targe_gate_kernel_global(cuDoubleComplex *psi_d, cuDoubleComplex *psi_d_copy, uint ctrlnum, cuDoubleComplex *mat_d, uint *mat_mask_d, int psize, size_t size){ - uint rsize = size>>psize; - uint targnum = psize-ctrlnum; - uint matlen = (1<>_pos<<_pos<<1); - } - // Set control - for (size_t k=0; k < ctrlnum;k++){ - i |= 1ll<>1;offset > 0;offset >>=1){ - v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset); - v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset); - } - __syncthreads(); - if (!idx) v_sum = cuCadd(v_sum, v); - } - if (!idx) psi_d[ i | mat_mask_d[y]] = v_sum; - } -} - -void apply_multi_targe_gate_gpu_global(cuDoubleComplex *psi_d, cuDoubleComplex *psi_d_copy, QuantumOperator &op, cuDoubleComplex *mat_d, uint *mat_mask_d, size_t size){ - uint targe_num = op.targe_num(); - uint matlen = 1<(pos.begin()+op.control_num(), pos.end()); - vector targ_mask(matlen); - //create target mask - for (size_t m = 0; m < matlen;m++){ - for (size_t j = 0; j < targe_num; j++){ - if ((m>>j)&1){ - auto mask_pos = targs[j]; - targ_mask[m] |= 1ll< posv_sorted = pos; - std::sort(posv_sorted.begin(),posv_sorted.end()); - //Copy pos to device - checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint))); - checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint))); - - - //copy mat to global memory - auto mat_temp = op.mat(); - cuDoubleComplex *mat = reinterpret_cast(mat_temp.data()); - checkCudaErrors(cudaMemcpy(mat_d, mat, matlen*matlen*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(), matlen*sizeof(uint), cudaMemcpyHostToDevice)); - - size_t rsize = size>>psize; - uint thread_num = matlen > 32 ? 32 : matlen; - dim3 blockdim = dim3(thread_num, thread_num); - - apply_multi_targe_gate_kernel_global<<>>(psi_d, psi_d_copy, op.control_num(), mat_d, mat_mask_d, psize, size); - checkCudaErrors(cudaMemcpy(psi_d_copy, psi_d, size*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice)); -} \ No newline at end of file diff --git a/src/qfvm_gpu/cuda_simulator.cuh b/src/qfvm_gpu/cuda_simulator.cuh deleted file mode 100644 index 99ee5e4..0000000 --- a/src/qfvm_gpu/cuda_simulator.cuh +++ /dev/null @@ -1,91 +0,0 @@ -#pragma once -#include "cuda_statevector.cuh" -#include -#include -#include -#include -#include "apply_gate_gpu.cuh" - -void simulate_gpu(Circuit & circuit, CudaStateVector & psi_d){ - size_t size = psi_d.size(); - //initialize mat - cuDoubleComplex *mat_d; - uint *mat_mask_d; - CudaStateVector psi_d_copy{}; - if (circuit.max_targe_num() > 5) - { - uint max_matlen = 1< 10){ - psi_d_copy = psi_d; - } - } - - //apply_gate - for (auto gate : circuit.gates()){ - uint targnum = gate.targe_num(); - uint ctrlnum = gate.control_num(); - - if (targnum == 1){ - if (ctrlnum == 0){ - apply_one_targe_gate_gpu<0>(psi_d.data(), gate, size); - } - else if (ctrlnum == 1){ - apply_one_targe_gate_gpu<1>(psi_d.data(), gate, size); - } - else{ - apply_one_targe_gate_gpu<2>(psi_d.data(), gate, size); - } - } - else if(targnum > 1){ - if (targnum == 2){ - apply_2to4_targe_gate_gpu_const<2>(psi_d.data(), gate, size); - } - else if(targnum == 3){ - apply_2to4_targe_gate_gpu_const<3>(psi_d.data(), gate, size); - } - else if(targnum == 4){ - apply_2to4_targe_gate_gpu_const<4>(psi_d.data(), gate, size); - } - else if (targnum == 5){ - apply_5_targe_gate_gpu_const(psi_d.data(), gate, size); - }else if (targnum > 5 && targnum <= 10){ - apply_multi_targe_gate_gpu_shared(psi_d.data(), gate, mat_d, mat_mask_d, size); - } - else{ - apply_multi_targe_gate_gpu_global(psi_d.data(), psi_d_copy.data(), gate, mat_d, mat_mask_d, size); - } - } - else{ - throw "Invalid target number"; - } - } - - //free source - if (circuit.max_targe_num() > 5){ - checkCudaErrors(cudaFree(mat_d)); - checkCudaErrors(cudaFree(mat_mask_d)); - } -} - -void simulate_gpu(Circuit & circuit, StateVector & state){ - //initialize psi - state.set_num(circuit.qubit_num()); - size_t size = state.size(); - CudaStateVector psi_d(state); - - simulate_gpu(circuit, psi_d); - cudaDeviceSynchronize(); - - //copy back - complex* psi = reinterpret_cast*>(psi_d.data()); - checkCudaErrors(cudaMemcpy(state.data(), psi, size*sizeof(complex), cudaMemcpyDeviceToHost)); - psi=nullptr; -} - -StateVector simulate_gpu(Circuit & circuit){ - StateVector state(circuit.qubit_num()); - simulate_gpu(circuit, state); - return std::move(state); -} \ No newline at end of file diff --git a/src/qfvm_gpu/cuda_statevector.cuh b/src/qfvm_gpu/cuda_statevector.cuh deleted file mode 100644 index 19e8cb4..0000000 --- a/src/qfvm_gpu/cuda_statevector.cuh +++ /dev/null @@ -1,58 +0,0 @@ - -#pragma once -#include -#include -#include -#include - -class CudaStateVector -{ - protected: - uint num_; - size_t size_; - cuDoubleComplex *data_; - - public: - //construct function - CudaStateVector(){checkCudaErrors(cudaMalloc(&data_, 0));} - CudaStateVector(CudaStateVector const &other); - - explicit CudaStateVector(StateVector &sv); - ~CudaStateVector() { - checkCudaErrors(cudaFree(data_)); - } - - CudaStateVector &operator=(CudaStateVector const &other) - { - num_ = other.num(); - size_ = other.size(); - checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex))); - checkCudaErrors(cudaMemcpy(data_, other.data(), size_*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice)); - return *this; - } - - cuDoubleComplex* data() const { return data_;} - size_t size() const { return size_; } - uint num() const { return num_; } -}; - - -CudaStateVector::CudaStateVector(StateVector &sv) -: -num_(sv.num()), -size_(sv.size()) -{ - cuDoubleComplex *psi_h = reinterpret_cast(sv.data()); - checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex))); - checkCudaErrors(cudaMemcpy(data_, psi_h, size_*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); -} - -CudaStateVector::CudaStateVector(CudaStateVector const &other) -: -num_(other.num()), -size_(other.size()) -{ - checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex))); - checkCudaErrors(cudaMemcpy(data_, other.data(), size_*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice)); -} - diff --git a/src/qfvm_gpu/cuda_utils/CudaTexture.h b/src/qfvm_gpu/cuda_utils/CudaTexture.h deleted file mode 100644 index 0661d0d..0000000 --- a/src/qfvm_gpu/cuda_utils/CudaTexture.h +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - - -#include "helper_cuda.h" -#include - - -struct CudaTexture { - cudaTextureObject_t tex; - - CudaTexture(CudaTexture const &) = delete; - CudaTexture(CudaTexture &&) = default; - CudaTexture &operator=(CudaTexture const &) = delete; - CudaTexture &operator=(CudaTexture &&) = default; - - template - CudaTexture(T *dataDev, int width, int height) { - cudaTextureObject_t tex; - cudaResourceDesc resDesc; - memset(&resDesc, 0, sizeof(resDesc)); - resDesc.resType = cudaResourceTypePitch2D; - resDesc.res.pitch2D.devPtr = dataDev; - resDesc.res.pitch2D.width = width; - resDesc.res.pitch2D.height = height; - resDesc.res.pitch2D.desc = cudaCreateChannelDesc(); - resDesc.res.pitch2D.pitchInBytes = width * sizeof(T); - cudaTextureDesc texDesc; - memset(&texDesc, 0, sizeof(texDesc)); - checkCudaErrors(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL)); - } - - ~CudaTexture() { - checkCudaErrors(cudaDestroyTextureObject(tex)); - } - - constexpr operator cudaTextureObject_t() const { - return tex; - } -}; diff --git a/src/qfvm_gpu/cuda_utils/helper_cuda.h b/src/qfvm_gpu/cuda_utils/helper_cuda.h deleted file mode 100644 index 516dd57..0000000 --- a/src/qfvm_gpu/cuda_utils/helper_cuda.h +++ /dev/null @@ -1,953 +0,0 @@ -/** - * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. - * - */ - -//////////////////////////////////////////////////////////////////////////////// -// These are CUDA Helper functions for initialization and error checking - -#ifndef COMMON_HELPER_CUDA_H_ -#define COMMON_HELPER_CUDA_H_ - -#pragma once - -#include -#include -#include -#include - -#include "helper_string.h" - -#ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 -#endif - -// Note, it is required that your SDK sample to include the proper header -// files, please refer the CUDA examples for examples of the needed CUDA -// headers, which may change depending on which CUDA functions are used. - -// CUDA Runtime error messages -#ifdef __DRIVER_TYPES_H__ -static const char *_cudaGetErrorEnum(cudaError_t error) { - return cudaGetErrorName(error); -} -#endif - -#ifdef CUDA_DRIVER_API -// CUDA Driver API errors -static const char *_cudaGetErrorEnum(CUresult error) { - static char unknown[] = ""; - const char *ret = NULL; - cuGetErrorName(error, &ret); - return ret ? ret : unknown; -} -#endif - -#ifdef CUBLAS_API_H_ -// cuBLAS API errors -static const char *_cudaGetErrorEnum(cublasStatus_t error) { - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - } - - return ""; -} -#endif - -#ifdef _CUFFT_H_ -// cuFFT API errors -static const char *_cudaGetErrorEnum(cufftResult error) { - switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - - case CUFFT_INVALID_DEVICE: - return "CUFFT_INVALID_DEVICE"; - - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; - - case CUFFT_NO_WORKSPACE: - return "CUFFT_NO_WORKSPACE"; - - case CUFFT_NOT_IMPLEMENTED: - return "CUFFT_NOT_IMPLEMENTED"; - - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; - - case CUFFT_NOT_SUPPORTED: - return "CUFFT_NOT_SUPPORTED"; - } - - return ""; -} -#endif - -#ifdef CUSPARSEAPI -// cuSPARSE API errors -static const char *_cudaGetErrorEnum(cusparseStatus_t error) { - switch (error) { - case CUSPARSE_STATUS_SUCCESS: - return "CUSPARSE_STATUS_SUCCESS"; - - case CUSPARSE_STATUS_NOT_INITIALIZED: - return "CUSPARSE_STATUS_NOT_INITIALIZED"; - - case CUSPARSE_STATUS_ALLOC_FAILED: - return "CUSPARSE_STATUS_ALLOC_FAILED"; - - case CUSPARSE_STATUS_INVALID_VALUE: - return "CUSPARSE_STATUS_INVALID_VALUE"; - - case CUSPARSE_STATUS_ARCH_MISMATCH: - return "CUSPARSE_STATUS_ARCH_MISMATCH"; - - case CUSPARSE_STATUS_MAPPING_ERROR: - return "CUSPARSE_STATUS_MAPPING_ERROR"; - - case CUSPARSE_STATUS_EXECUTION_FAILED: - return "CUSPARSE_STATUS_EXECUTION_FAILED"; - - case CUSPARSE_STATUS_INTERNAL_ERROR: - return "CUSPARSE_STATUS_INTERNAL_ERROR"; - - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - } - - return ""; -} -#endif - -#ifdef CUSOLVER_COMMON_H_ -// cuSOLVER API errors -static const char *_cudaGetErrorEnum(cusolverStatus_t error) { - switch (error) { - case CUSOLVER_STATUS_SUCCESS: - return "CUSOLVER_STATUS_SUCCESS"; - case CUSOLVER_STATUS_NOT_INITIALIZED: - return "CUSOLVER_STATUS_NOT_INITIALIZED"; - case CUSOLVER_STATUS_ALLOC_FAILED: - return "CUSOLVER_STATUS_ALLOC_FAILED"; - case CUSOLVER_STATUS_INVALID_VALUE: - return "CUSOLVER_STATUS_INVALID_VALUE"; - case CUSOLVER_STATUS_ARCH_MISMATCH: - return "CUSOLVER_STATUS_ARCH_MISMATCH"; - case CUSOLVER_STATUS_MAPPING_ERROR: - return "CUSOLVER_STATUS_MAPPING_ERROR"; - case CUSOLVER_STATUS_EXECUTION_FAILED: - return "CUSOLVER_STATUS_EXECUTION_FAILED"; - case CUSOLVER_STATUS_INTERNAL_ERROR: - return "CUSOLVER_STATUS_INTERNAL_ERROR"; - case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSOLVER_STATUS_NOT_SUPPORTED: - return "CUSOLVER_STATUS_NOT_SUPPORTED "; - case CUSOLVER_STATUS_ZERO_PIVOT: - return "CUSOLVER_STATUS_ZERO_PIVOT"; - case CUSOLVER_STATUS_INVALID_LICENSE: - return "CUSOLVER_STATUS_INVALID_LICENSE"; - } - - return ""; -} -#endif - -#ifdef CURAND_H_ -// cuRAND API errors -static const char *_cudaGetErrorEnum(curandStatus_t error) { - switch (error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; - - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; - } - - return ""; -} -#endif - -#ifdef NVJPEGAPI -// nvJPEG API errors -static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { - switch (error) { - case NVJPEG_STATUS_SUCCESS: - return "NVJPEG_STATUS_SUCCESS"; - - case NVJPEG_STATUS_NOT_INITIALIZED: - return "NVJPEG_STATUS_NOT_INITIALIZED"; - - case NVJPEG_STATUS_INVALID_PARAMETER: - return "NVJPEG_STATUS_INVALID_PARAMETER"; - - case NVJPEG_STATUS_BAD_JPEG: - return "NVJPEG_STATUS_BAD_JPEG"; - - case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: - return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; - - case NVJPEG_STATUS_ALLOCATOR_FAILURE: - return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; - - case NVJPEG_STATUS_EXECUTION_FAILED: - return "NVJPEG_STATUS_EXECUTION_FAILED"; - - case NVJPEG_STATUS_ARCH_MISMATCH: - return "NVJPEG_STATUS_ARCH_MISMATCH"; - - case NVJPEG_STATUS_INTERNAL_ERROR: - return "NVJPEG_STATUS_INTERNAL_ERROR"; - } - - return ""; -} -#endif - -#ifdef NV_NPPIDEFS_H -// NPP API errors -static const char *_cudaGetErrorEnum(NppStatus error) { - switch (error) { - case NPP_NOT_SUPPORTED_MODE_ERROR: - return "NPP_NOT_SUPPORTED_MODE_ERROR"; - - case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; - - case NPP_RESIZE_NO_OPERATION_ERROR: - return "NPP_RESIZE_NO_OPERATION_ERROR"; - - case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: - return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - - case NPP_BAD_ARG_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; - - case NPP_COEFF_ERROR: - return "NPP_COEFFICIENT_ERROR"; - - case NPP_RECT_ERROR: - return "NPP_RECTANGLE_ERROR"; - - case NPP_QUAD_ERROR: - return "NPP_QUADRANGLE_ERROR"; - - case NPP_MEM_ALLOC_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; - - case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - - case NPP_INVALID_INPUT: - return "NPP_INVALID_INPUT"; - - case NPP_POINTER_ERROR: - return "NPP_POINTER_ERROR"; - - case NPP_WARNING: - return "NPP_WARNING"; - - case NPP_ODD_ROI_WARNING: - return "NPP_ODD_ROI_WARNING"; -#else - - // These are for CUDA 5.5 or higher - case NPP_BAD_ARGUMENT_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; - - case NPP_COEFFICIENT_ERROR: - return "NPP_COEFFICIENT_ERROR"; - - case NPP_RECTANGLE_ERROR: - return "NPP_RECTANGLE_ERROR"; - - case NPP_QUADRANGLE_ERROR: - return "NPP_QUADRANGLE_ERROR"; - - case NPP_MEMORY_ALLOCATION_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; - - case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - - case NPP_INVALID_HOST_POINTER_ERROR: - return "NPP_INVALID_HOST_POINTER_ERROR"; - - case NPP_INVALID_DEVICE_POINTER_ERROR: - return "NPP_INVALID_DEVICE_POINTER_ERROR"; -#endif - - case NPP_LUT_NUMBER_OF_LEVELS_ERROR: - return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; - - case NPP_TEXTURE_BIND_ERROR: - return "NPP_TEXTURE_BIND_ERROR"; - - case NPP_WRONG_INTERSECTION_ROI_ERROR: - return "NPP_WRONG_INTERSECTION_ROI_ERROR"; - - case NPP_NOT_EVEN_STEP_ERROR: - return "NPP_NOT_EVEN_STEP_ERROR"; - - case NPP_INTERPOLATION_ERROR: - return "NPP_INTERPOLATION_ERROR"; - - case NPP_RESIZE_FACTOR_ERROR: - return "NPP_RESIZE_FACTOR_ERROR"; - - case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: - return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - - case NPP_MEMFREE_ERR: - return "NPP_MEMFREE_ERR"; - - case NPP_MEMSET_ERR: - return "NPP_MEMSET_ERR"; - - case NPP_MEMCPY_ERR: - return "NPP_MEMCPY_ERROR"; - - case NPP_MIRROR_FLIP_ERR: - return "NPP_MIRROR_FLIP_ERR"; -#else - - case NPP_MEMFREE_ERROR: - return "NPP_MEMFREE_ERROR"; - - case NPP_MEMSET_ERROR: - return "NPP_MEMSET_ERROR"; - - case NPP_MEMCPY_ERROR: - return "NPP_MEMCPY_ERROR"; - - case NPP_MIRROR_FLIP_ERROR: - return "NPP_MIRROR_FLIP_ERROR"; -#endif - - case NPP_ALIGNMENT_ERROR: - return "NPP_ALIGNMENT_ERROR"; - - case NPP_STEP_ERROR: - return "NPP_STEP_ERROR"; - - case NPP_SIZE_ERROR: - return "NPP_SIZE_ERROR"; - - case NPP_NULL_POINTER_ERROR: - return "NPP_NULL_POINTER_ERROR"; - - case NPP_CUDA_KERNEL_EXECUTION_ERROR: - return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; - - case NPP_NOT_IMPLEMENTED_ERROR: - return "NPP_NOT_IMPLEMENTED_ERROR"; - - case NPP_ERROR: - return "NPP_ERROR"; - - case NPP_SUCCESS: - return "NPP_SUCCESS"; - - case NPP_WRONG_INTERSECTION_QUAD_WARNING: - return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; - - case NPP_MISALIGNED_DST_ROI_WARNING: - return "NPP_MISALIGNED_DST_ROI_WARNING"; - - case NPP_AFFINE_QUAD_INCORRECT_WARNING: - return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; - - case NPP_DOUBLE_SIZE_WARNING: - return "NPP_DOUBLE_SIZE_WARNING"; - - case NPP_WRONG_INTERSECTION_ROI_WARNING: - return "NPP_WRONG_INTERSECTION_ROI_WARNING"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 - /* These are 6.0 or higher */ - case NPP_LUT_PALETTE_BITSIZE_ERROR: - return "NPP_LUT_PALETTE_BITSIZE_ERROR"; - - case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; - - case NPP_QUALITY_INDEX_ERROR: - return "NPP_QUALITY_INDEX_ERROR"; - - case NPP_CHANNEL_ORDER_ERROR: - return "NPP_CHANNEL_ORDER_ERROR"; - - case NPP_ZERO_MASK_VALUE_ERROR: - return "NPP_ZERO_MASK_VALUE_ERROR"; - - case NPP_NUMBER_OF_CHANNELS_ERROR: - return "NPP_NUMBER_OF_CHANNELS_ERROR"; - - case NPP_COI_ERROR: - return "NPP_COI_ERROR"; - - case NPP_DIVISOR_ERROR: - return "NPP_DIVISOR_ERROR"; - - case NPP_CHANNEL_ERROR: - return "NPP_CHANNEL_ERROR"; - - case NPP_STRIDE_ERROR: - return "NPP_STRIDE_ERROR"; - - case NPP_ANCHOR_ERROR: - return "NPP_ANCHOR_ERROR"; - - case NPP_MASK_SIZE_ERROR: - return "NPP_MASK_SIZE_ERROR"; - - case NPP_MOMENT_00_ZERO_ERROR: - return "NPP_MOMENT_00_ZERO_ERROR"; - - case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: - return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; - - case NPP_THRESHOLD_ERROR: - return "NPP_THRESHOLD_ERROR"; - - case NPP_CONTEXT_MATCH_ERROR: - return "NPP_CONTEXT_MATCH_ERROR"; - - case NPP_FFT_FLAG_ERROR: - return "NPP_FFT_FLAG_ERROR"; - - case NPP_FFT_ORDER_ERROR: - return "NPP_FFT_ORDER_ERROR"; - - case NPP_SCALE_RANGE_ERROR: - return "NPP_SCALE_RANGE_ERROR"; - - case NPP_DATA_TYPE_ERROR: - return "NPP_DATA_TYPE_ERROR"; - - case NPP_OUT_OFF_RANGE_ERROR: - return "NPP_OUT_OFF_RANGE_ERROR"; - - case NPP_DIVIDE_BY_ZERO_ERROR: - return "NPP_DIVIDE_BY_ZERO_ERROR"; - - case NPP_RANGE_ERROR: - return "NPP_RANGE_ERROR"; - - case NPP_NO_MEMORY_ERROR: - return "NPP_NO_MEMORY_ERROR"; - - case NPP_ERROR_RESERVED: - return "NPP_ERROR_RESERVED"; - - case NPP_NO_OPERATION_WARNING: - return "NPP_NO_OPERATION_WARNING"; - - case NPP_DIVIDE_BY_ZERO_WARNING: - return "NPP_DIVIDE_BY_ZERO_WARNING"; -#endif - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 - /* These are 7.0 or higher */ - case NPP_OVERFLOW_ERROR: - return "NPP_OVERFLOW_ERROR"; - - case NPP_CORRUPTED_DATA_ERROR: - return "NPP_CORRUPTED_DATA_ERROR"; -#endif - } - - return ""; -} -#endif - -template -void check(T result, char const *const func, const char *const file, - int const line) { - if (result) { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, - static_cast(result), _cudaGetErrorEnum(result), func); - exit(EXIT_FAILURE); - } -} - -#ifdef __DRIVER_TYPES_H__ -// This will output the proper CUDA error strings in the event -// that a CUDA host call returns an error -#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) - -// This will output the proper error string when calling cudaGetLastError -#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) - -inline void __getLastCudaError(const char *errorMessage, const char *file, - const int line) { - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) { - fprintf(stderr, - "%s(%i) : getLastCudaError() CUDA error :" - " %s : (%d) %s.\n", - file, line, errorMessage, static_cast(err), - cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } -} - -// This will only print the proper error string when calling cudaGetLastError -// but not exit program incase error detected. -#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) - -inline void __printLastCudaError(const char *errorMessage, const char *file, - const int line) { - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) { - fprintf(stderr, - "%s(%i) : getLastCudaError() CUDA error :" - " %s : (%d) %s.\n", - file, line, errorMessage, static_cast(err), - cudaGetErrorString(err)); - } -} -#endif - -#ifndef MAX -#define MAX(a, b) (a > b ? a : b) -#endif - -// Float To Int conversion -inline int ftoi(float value) { - return (value >= 0 ? static_cast(value + 0.5) - : static_cast(value - 0.5)); -} - -// Beginning of GPU Architecture definitions -inline int _ConvertSMVer2Cores(int major, int minor) { - // Defines for GPU Architecture types (using the SM version to determine - // the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, - // and m = SM minor version - int Cores; - } sSMtoCores; - - sSMtoCores nGpuArchCoresPerSM[] = { - {0x30, 192}, - {0x32, 192}, - {0x35, 192}, - {0x37, 192}, - {0x50, 128}, - {0x52, 128}, - {0x53, 128}, - {0x60, 64}, - {0x61, 128}, - {0x62, 128}, - {0x70, 64}, - {0x72, 64}, - {0x75, 64}, - {0x80, 64}, - {0x86, 128}, - {0x87, 128}, - {-1, -1}}; - - int index = 0; - - while (nGpuArchCoresPerSM[index].SM != -1) { - if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { - return nGpuArchCoresPerSM[index].Cores; - } - - index++; - } - - // If we don't find the values, we default use the previous one - // to run properly - printf( - "MapSMtoCores for SM %d.%d is undefined." - " Default to use %d Cores/SM\n", - major, minor, nGpuArchCoresPerSM[index - 1].Cores); - return nGpuArchCoresPerSM[index - 1].Cores; -} - -inline const char* _ConvertSMVer2ArchName(int major, int minor) { - // Defines for GPU Architecture types (using the SM version to determine - // the GPU Arch name) - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, - // and m = SM minor version - const char* name; - } sSMtoArchName; - - sSMtoArchName nGpuArchNameSM[] = { - {0x30, "Kepler"}, - {0x32, "Kepler"}, - {0x35, "Kepler"}, - {0x37, "Kepler"}, - {0x50, "Maxwell"}, - {0x52, "Maxwell"}, - {0x53, "Maxwell"}, - {0x60, "Pascal"}, - {0x61, "Pascal"}, - {0x62, "Pascal"}, - {0x70, "Volta"}, - {0x72, "Xavier"}, - {0x75, "Turing"}, - {0x80, "Ampere"}, - {0x86, "Ampere"}, - {0x87, "Ampere"}, - {-1, "Graphics Device"}}; - - int index = 0; - - while (nGpuArchNameSM[index].SM != -1) { - if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { - return nGpuArchNameSM[index].name; - } - - index++; - } - - // If we don't find the values, we default use the previous one - // to run properly - printf( - "MapSMtoArchName for SM %d.%d is undefined." - " Default to use %s\n", - major, minor, nGpuArchNameSM[index - 1].name); - return nGpuArchNameSM[index - 1].name; -} - // end of GPU Architecture definitions - -#ifdef __CUDA_RUNTIME_H__ -// General GPU Device CUDA Initialization -inline int gpuDeviceInit(int devID) { - int device_count; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf(stderr, - "gpuDeviceInit() CUDA error: " - "no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - if (devID < 0) { - devID = 0; - } - - if (devID > device_count - 1) { - fprintf(stderr, "\n"); - fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", - device_count); - fprintf(stderr, - ">> gpuDeviceInit (-device=%d) is not a valid" - " GPU device. <<\n", - devID); - fprintf(stderr, "\n"); - return -devID; - } - - int computeMode = -1, major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); - if (computeMode == cudaComputeModeProhibited) { - fprintf(stderr, - "Error: device is running in , no threads can use cudaSetDevice().\n"); - return -1; - } - - if (major < 1) { - fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); - exit(EXIT_FAILURE); - } - - checkCudaErrors(cudaSetDevice(devID)); - printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); - - return devID; -} - -// This function returns the best GPU (with maximum GFLOPS) -inline int gpuGetMaxGflopsDeviceId() { - int current_device = 0, sm_per_multiproc = 0; - int max_perf_device = 0; - int device_count = 0; - int devices_prohibited = 0; - - uint64_t max_compute_perf = 0; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf(stderr, - "gpuGetMaxGflopsDeviceId() CUDA error:" - " no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - // Find the best CUDA capable GPU device - current_device = 0; - - while (current_device < device_count) { - int computeMode = -1, major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); - - // If this GPU is not running on Compute Mode prohibited, - // then we can add it to the list - if (computeMode != cudaComputeModeProhibited) { - if (major == 9999 && minor == 9999) { - sm_per_multiproc = 1; - } else { - sm_per_multiproc = - _ConvertSMVer2Cores(major, minor); - } - int multiProcessorCount = 0, clockRate = 0; - checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device)); - cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device); - if (result != cudaSuccess) { - // If cudaDevAttrClockRate attribute is not supported we - // set clockRate as 1, to consider GPU with most SMs and CUDA Cores. - if(result == cudaErrorInvalidValue) { - clockRate = 1; - } - else { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__, - static_cast(result), _cudaGetErrorEnum(result)); - exit(EXIT_FAILURE); - } - } - uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; - - if (compute_perf > max_compute_perf) { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } else { - devices_prohibited++; - } - - ++current_device; - } - - if (devices_prohibited == device_count) { - fprintf(stderr, - "gpuGetMaxGflopsDeviceId() CUDA error:" - " all devices have compute mode prohibited.\n"); - exit(EXIT_FAILURE); - } - - return max_perf_device; -} - -// Initialization code to find the best CUDA Device -inline int findCudaDevice(int argc, const char **argv) { - int devID = 0; - - // If the command-line has a device number specified, use it - if (checkCmdLineFlag(argc, argv, "device")) { - devID = getCmdLineArgumentInt(argc, argv, "device="); - - if (devID < 0) { - printf("Invalid command line parameter\n "); - exit(EXIT_FAILURE); - } else { - devID = gpuDeviceInit(devID); - - if (devID < 0) { - printf("exiting...\n"); - exit(EXIT_FAILURE); - } - } - } else { - // Otherwise pick the device with highest Gflops/s - devID = gpuGetMaxGflopsDeviceId(); - checkCudaErrors(cudaSetDevice(devID)); - int major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - devID, _ConvertSMVer2ArchName(major, minor), major, minor); - - } - - return devID; -} - -inline int findIntegratedGPU() { - int current_device = 0; - int device_count = 0; - int devices_prohibited = 0; - - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - // Find the integrated GPU which is compute capable - while (current_device < device_count) { - int computeMode = -1, integrated = -1; - checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device)); - // If GPU is integrated and is not running on Compute Mode prohibited, - // then cuda can map to GLES resource - if (integrated && (computeMode != cudaComputeModeProhibited)) { - checkCudaErrors(cudaSetDevice(current_device)); - - int major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", - current_device, _ConvertSMVer2ArchName(major, minor), major, minor); - - return current_device; - } else { - devices_prohibited++; - } - - current_device++; - } - - if (devices_prohibited == device_count) { - fprintf(stderr, - "CUDA error:" - " No GLES-CUDA Interop capable GPU found.\n"); - exit(EXIT_FAILURE); - } - - return -1; -} - -// General check for CUDA GPU SM Capabilities -inline bool checkCudaCapabilities(int major_version, int minor_version) { - int dev; - int major = 0, minor = 0; - - checkCudaErrors(cudaGetDevice(&dev)); - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev)); - - if ((major > major_version) || - (major == major_version && - minor >= minor_version)) { - printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, - _ConvertSMVer2ArchName(major, minor), major, minor); - return true; - } else { - printf( - " No GPU device was found that can support " - "CUDA compute capability %d.%d.\n", - major_version, minor_version); - return false; - } -} -#endif - - // end of CUDA Helper Functions - -#endif // COMMON_HELPER_CUDA_H_ diff --git a/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp b/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp deleted file mode 100644 index 59fdbc3..0000000 --- a/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. - * - * SPDX-License-Identifier: BSD-3-Clause - */ - -#define HANDLE_ERROR(x) \ -{ const auto err = x; \ - if (err != CUSTATEVEC_STATUS_SUCCESS ) { \ - printf("Error: %s in line %d\n", \ - custatevecGetErrorString(err), __LINE__); return err; } \ -}; - -#define HANDLE_CUDA_ERROR(x) \ -{ const auto err = x; \ - if (err != cudaSuccess ) { \ - printf("Error: %s in line %d\n", \ - cudaGetErrorString(err), __LINE__); return err; } \ -}; - -bool almost_equal(cuDoubleComplex x, cuDoubleComplex y) { - const double eps = 1.0e-5; - const cuDoubleComplex diff = cuCsub(x, y); - return (cuCabs(diff) < eps); -} - -bool almost_equal(double x, double y) { - const double eps = 1.0e-5; - const double diff = x - y; - return (abs(diff) < eps); -} diff --git a/src/qfvm_gpu/cuda_utils/helper_string.h b/src/qfvm_gpu/cuda_utils/helper_string.h deleted file mode 100644 index 77864b8..0000000 --- a/src/qfvm_gpu/cuda_utils/helper_string.h +++ /dev/null @@ -1,683 +0,0 @@ -/** - * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. - * - * Please refer to the NVIDIA end user license agreement (EULA) associated - * with this source code for terms and conditions that govern your use of - * this software. Any use, reproduction, disclosure, or distribution of - * this software and related documentation outside the terms of the EULA - * is strictly prohibited. - * - */ - -// These are helper functions for the SDK samples (string parsing, timers, etc) -#ifndef COMMON_HELPER_STRING_H_ -#define COMMON_HELPER_STRING_H_ - -#include -#include -#include -#include - -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -#ifndef _CRT_SECURE_NO_DEPRECATE -#define _CRT_SECURE_NO_DEPRECATE -#endif -#ifndef STRCASECMP -#define STRCASECMP _stricmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP _strnicmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) -#endif - -#ifndef FOPEN -#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result != 0) -#endif -#ifndef SSCANF -#define SSCANF sscanf_s -#endif -#ifndef SPRINTF -#define SPRINTF sprintf_s -#endif -#else // Linux Includes -#include -#include - -#ifndef STRCASECMP -#define STRCASECMP strcasecmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP strncasecmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) -#endif - -#ifndef FOPEN -#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result == NULL) -#endif -#ifndef SSCANF -#define SSCANF sscanf -#endif -#ifndef SPRINTF -#define SPRINTF sprintf -#endif -#endif - -#ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 -#endif - -// CUDA Utility Helper Functions -inline int stringRemoveDelimiter(char delimiter, const char *string) { - int string_start = 0; - - while (string[string_start] == delimiter) { - string_start++; - } - - if (string_start >= static_cast(strlen(string) - 1)) { - return 0; - } - - return string_start; -} - -inline int getFileExtension(char *filename, char **extension) { - int string_length = static_cast(strlen(filename)); - - while (filename[string_length--] != '.') { - if (string_length == 0) break; - } - - if (string_length > 0) string_length += 2; - - if (string_length == 0) - *extension = NULL; - else - *extension = &filename[string_length]; - - return string_length; -} - -inline bool checkCmdLineFlag(const int argc, const char **argv, - const char *string_ref) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - - const char *equal_pos = strchr(string_argv, '='); - int argv_length = static_cast( - equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); - - int length = static_cast(strlen(string_ref)); - - if (length == argv_length && - !STRNCASECMP(string_argv, string_ref, length)) { - bFound = true; - continue; - } - } - } - - return bFound; -} - -// This function wraps the CUDA Driver API into a template function -template -inline bool getCmdLineArgumentValue(const int argc, const char **argv, - const char *string_ref, T *value) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = static_cast(strlen(string_ref)); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= static_cast(strlen(string_argv))) { - int auto_inc = (string_argv[length] == '=') ? 1 : 0; - *value = (T)atoi(&string_argv[length + auto_inc]); - } - - bFound = true; - i = argc; - } - } - } - - return bFound; -} - -inline int getCmdLineArgumentInt(const int argc, const char **argv, - const char *string_ref) { - bool bFound = false; - int value = -1; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = static_cast(strlen(string_ref)); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= static_cast(strlen(string_argv))) { - int auto_inc = (string_argv[length] == '=') ? 1 : 0; - value = atoi(&string_argv[length + auto_inc]); - } else { - value = 0; - } - - bFound = true; - continue; - } - } - } - - if (bFound) { - return value; - } else { - return 0; - } -} - -inline float getCmdLineArgumentFloat(const int argc, const char **argv, - const char *string_ref) { - bool bFound = false; - float value = -1; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = static_cast(strlen(string_ref)); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= static_cast(strlen(string_argv))) { - int auto_inc = (string_argv[length] == '=') ? 1 : 0; - value = static_cast(atof(&string_argv[length + auto_inc])); - } else { - value = 0.f; - } - - bFound = true; - continue; - } - } - } - - if (bFound) { - return value; - } else { - return 0; - } -} - -inline bool getCmdLineArgumentString(const int argc, const char **argv, - const char *string_ref, - char **string_retval) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - char *string_argv = const_cast(&argv[i][string_start]); - int length = static_cast(strlen(string_ref)); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - *string_retval = &string_argv[length + 1]; - bFound = true; - continue; - } - } - } - - if (!bFound) { - *string_retval = NULL; - } - - return bFound; -} - -////////////////////////////////////////////////////////////////////////////// -//! Find the path for a file assuming that -//! files are found in the searchPath. -//! -//! @return the path if succeeded, otherwise 0 -//! @param filename name of the file -//! @param executable_path optional absolute path of the executable -////////////////////////////////////////////////////////////////////////////// -inline char *sdkFindFilePath(const char *filename, - const char *executable_path) { - // defines a variable that is replaced with the name of the - // executable - - // Typical relative search paths to locate needed companion files (e.g. sample - // input data, or JIT source files) The origin for the relative search may be - // the .exe file, a .bat file launching an .exe, a browser .exe launching the - // .exe or .bat, etc - const char *searchPath[] = { - "./", // same dir - "./_data_files/", - "./common/", // "/common/" subdir - "./common/data/", // "/common/data/" subdir - "./data/", // "/data/" subdir - "./src/", // "/src/" subdir - "./src//data/", // "/src//data/" subdir - "./inc/", // "/inc/" subdir - "./0_Simple/", // "/0_Simple/" subdir - "./1_Utilities/", // "/1_Utilities/" subdir - "./2_Graphics/", // "/2_Graphics/" subdir - "./3_Imaging/", // "/3_Imaging/" subdir - "./4_Finance/", // "/4_Finance/" subdir - "./5_Simulations/", // "/5_Simulations/" subdir - "./6_Advanced/", // "/6_Advanced/" subdir - "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir - "./8_Android/", // "/8_Android/" subdir - "./samples/", // "/samples/" subdir - - "./0_Simple//data/", // "/0_Simple//data/" - // subdir - "./1_Utilities//data/", // "/1_Utilities//data/" - // subdir - "./2_Graphics//data/", // "/2_Graphics//data/" - // subdir - "./3_Imaging//data/", // "/3_Imaging//data/" - // subdir - "./4_Finance//data/", // "/4_Finance//data/" - // subdir - "./5_Simulations//data/", // "/5_Simulations//data/" - // subdir - "./6_Advanced//data/", // "/6_Advanced//data/" - // subdir - "./7_CUDALibraries//", // "/7_CUDALibraries//" - // subdir - "./7_CUDALibraries//data/", // "/7_CUDALibraries//data/" - // subdir - - "../", // up 1 in tree - "../common/", // up 1 in tree, "/common/" subdir - "../common/data/", // up 1 in tree, "/common/data/" subdir - "../data/", // up 1 in tree, "/data/" subdir - "../src/", // up 1 in tree, "/src/" subdir - "../inc/", // up 1 in tree, "/inc/" subdir - - "../0_Simple//data/", // up 1 in tree, - // "/0_Simple//" - // subdir - "../1_Utilities//data/", // up 1 in tree, - // "/1_Utilities//" - // subdir - "../2_Graphics//data/", // up 1 in tree, - // "/2_Graphics//" - // subdir - "../3_Imaging//data/", // up 1 in tree, - // "/3_Imaging//" - // subdir - "../4_Finance//data/", // up 1 in tree, - // "/4_Finance//" - // subdir - "../5_Simulations//data/", // up 1 in tree, - // "/5_Simulations//" - // subdir - "../6_Advanced//data/", // up 1 in tree, - // "/6_Advanced//" - // subdir - "../7_CUDALibraries//data/", // up 1 in tree, - // "/7_CUDALibraries//" - // subdir - "../8_Android//data/", // up 1 in tree, - // "/8_Android//" - // subdir - "../samples//data/", // up 1 in tree, - // "/samples//" - // subdir - "../../", // up 2 in tree - "../../common/", // up 2 in tree, "/common/" subdir - "../../common/data/", // up 2 in tree, "/common/data/" subdir - "../../data/", // up 2 in tree, "/data/" subdir - "../../src/", // up 2 in tree, "/src/" subdir - "../../inc/", // up 2 in tree, "/inc/" subdir - "../../sandbox//data/", // up 2 in tree, - // "/sandbox//" - // subdir - "../../0_Simple//data/", // up 2 in tree, - // "/0_Simple//" - // subdir - "../../1_Utilities//data/", // up 2 in tree, - // "/1_Utilities//" - // subdir - "../../2_Graphics//data/", // up 2 in tree, - // "/2_Graphics//" - // subdir - "../../3_Imaging//data/", // up 2 in tree, - // "/3_Imaging//" - // subdir - "../../4_Finance//data/", // up 2 in tree, - // "/4_Finance//" - // subdir - "../../5_Simulations//data/", // up 2 in tree, - // "/5_Simulations//" - // subdir - "../../6_Advanced//data/", // up 2 in tree, - // "/6_Advanced//" - // subdir - "../../7_CUDALibraries//data/", // up 2 in tree, - // "/7_CUDALibraries//" - // subdir - "../../8_Android//data/", // up 2 in tree, - // "/8_Android//" - // subdir - "../../samples//data/", // up 2 in tree, - // "/samples//" - // subdir - "../../../", // up 3 in tree - "../../../src//", // up 3 in tree, - // "/src//" subdir - "../../../src//data/", // up 3 in tree, - // "/src//data/" - // subdir - "../../../src//src/", // up 3 in tree, - // "/src//src/" - // subdir - "../../../src//inc/", // up 3 in tree, - // "/src//inc/" - // subdir - "../../../sandbox//", // up 3 in tree, - // "/sandbox//" - // subdir - "../../../sandbox//data/", // up 3 in tree, - // "/sandbox//data/" - // subdir - "../../../sandbox//src/", // up 3 in tree, - // "/sandbox//src/" - // subdir - "../../../sandbox//inc/", // up 3 in tree, - // "/sandbox//inc/" - // subdir - "../../../0_Simple//data/", // up 3 in tree, - // "/0_Simple//" - // subdir - "../../../1_Utilities//data/", // up 3 in tree, - // "/1_Utilities//" - // subdir - "../../../2_Graphics//data/", // up 3 in tree, - // "/2_Graphics//" - // subdir - "../../../3_Imaging//data/", // up 3 in tree, - // "/3_Imaging//" - // subdir - "../../../4_Finance//data/", // up 3 in tree, - // "/4_Finance//" - // subdir - "../../../5_Simulations//data/", // up 3 in tree, - // "/5_Simulations//" - // subdir - "../../../6_Advanced//data/", // up 3 in tree, - // "/6_Advanced//" - // subdir - "../../../7_CUDALibraries//data/", // up 3 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../8_Android//data/", // up 3 in tree, - // "/8_Android//" - // subdir - "../../../0_Simple//", // up 3 in tree, - // "/0_Simple//" - // subdir - "../../../1_Utilities//", // up 3 in tree, - // "/1_Utilities//" - // subdir - "../../../2_Graphics//", // up 3 in tree, - // "/2_Graphics//" - // subdir - "../../../3_Imaging//", // up 3 in tree, - // "/3_Imaging//" - // subdir - "../../../4_Finance//", // up 3 in tree, - // "/4_Finance//" - // subdir - "../../../5_Simulations//", // up 3 in tree, - // "/5_Simulations//" - // subdir - "../../../6_Advanced//", // up 3 in tree, - // "/6_Advanced//" - // subdir - "../../../7_CUDALibraries//", // up 3 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../8_Android//", // up 3 in tree, - // "/8_Android//" - // subdir - "../../../samples//data/", // up 3 in tree, - // "/samples//" - // subdir - "../../../common/", // up 3 in tree, "../../../common/" subdir - "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir - "../../../data/", // up 3 in tree, "../../../data/" subdir - "../../../../", // up 4 in tree - "../../../../src//", // up 4 in tree, - // "/src//" subdir - "../../../../src//data/", // up 4 in tree, - // "/src//data/" - // subdir - "../../../../src//src/", // up 4 in tree, - // "/src//src/" - // subdir - "../../../../src//inc/", // up 4 in tree, - // "/src//inc/" - // subdir - "../../../../sandbox//", // up 4 in tree, - // "/sandbox//" - // subdir - "../../../../sandbox//data/", // up 4 in tree, - // "/sandbox//data/" - // subdir - "../../../../sandbox//src/", // up 4 in tree, - // "/sandbox//src/" - // subdir - "../../../../sandbox//inc/", // up 4 in tree, - // "/sandbox//inc/" - // subdir - "../../../../0_Simple//data/", // up 4 in tree, - // "/0_Simple//" - // subdir - "../../../../1_Utilities//data/", // up 4 in tree, - // "/1_Utilities//" - // subdir - "../../../../2_Graphics//data/", // up 4 in tree, - // "/2_Graphics//" - // subdir - "../../../../3_Imaging//data/", // up 4 in tree, - // "/3_Imaging//" - // subdir - "../../../../4_Finance//data/", // up 4 in tree, - // "/4_Finance//" - // subdir - "../../../../5_Simulations//data/", // up 4 in tree, - // "/5_Simulations//" - // subdir - "../../../../6_Advanced//data/", // up 4 in tree, - // "/6_Advanced//" - // subdir - "../../../../7_CUDALibraries//data/", // up 4 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../../8_Android//data/", // up 4 in tree, - // "/8_Android//" - // subdir - "../../../../0_Simple//", // up 4 in tree, - // "/0_Simple//" - // subdir - "../../../../1_Utilities//", // up 4 in tree, - // "/1_Utilities//" - // subdir - "../../../../2_Graphics//", // up 4 in tree, - // "/2_Graphics//" - // subdir - "../../../../3_Imaging//", // up 4 in tree, - // "/3_Imaging//" - // subdir - "../../../../4_Finance//", // up 4 in tree, - // "/4_Finance//" - // subdir - "../../../../5_Simulations//", // up 4 in tree, - // "/5_Simulations//" - // subdir - "../../../../6_Advanced//", // up 4 in tree, - // "/6_Advanced//" - // subdir - "../../../../7_CUDALibraries//", // up 4 in tree, - // "/7_CUDALibraries//" - // subdir - "../../../../8_Android//", // up 4 in tree, - // "/8_Android//" - // subdir - "../../../../samples//data/", // up 4 in tree, - // "/samples//" - // subdir - "../../../../common/", // up 4 in tree, "../../../common/" subdir - "../../../../common/data/", // up 4 in tree, "../../../common/data/" - // subdir - "../../../../data/", // up 4 in tree, "../../../data/" subdir - "../../../../../", // up 5 in tree - "../../../../../src//", // up 5 in tree, - // "/src//" - // subdir - "../../../../../src//data/", // up 5 in tree, - // "/src//data/" - // subdir - "../../../../../src//src/", // up 5 in tree, - // "/src//src/" - // subdir - "../../../../../src//inc/", // up 5 in tree, - // "/src//inc/" - // subdir - "../../../../../sandbox//", // up 5 in tree, - // "/sandbox//" - // subdir - "../../../../../sandbox//data/", // up 5 in tree, - // "/sandbox//data/" - // subdir - "../../../../../sandbox//src/", // up 5 in tree, - // "/sandbox//src/" - // subdir - "../../../../../sandbox//inc/", // up 5 in tree, - // "/sandbox//inc/" - // subdir - "../../../../../0_Simple//data/", // up 5 in tree, - // "/0_Simple//" - // subdir - "../../../../../1_Utilities//data/", // up 5 in tree, - // "/1_Utilities//" - // subdir - "../../../../../2_Graphics//data/", // up 5 in tree, - // "/2_Graphics//" - // subdir - "../../../../../3_Imaging//data/", // up 5 in tree, - // "/3_Imaging//" - // subdir - "../../../../../4_Finance//data/", // up 5 in tree, - // "/4_Finance//" - // subdir - "../../../../../5_Simulations//data/", // up 5 in tree, - // "/5_Simulations//" - // subdir - "../../../../../6_Advanced//data/", // up 5 in tree, - // "/6_Advanced//" - // subdir - "../../../../../7_CUDALibraries//data/", // up 5 in - // tree, - // "/7_CUDALibraries//" - // subdir - "../../../../../8_Android//data/", // up 5 in tree, - // "/8_Android//" - // subdir - "../../../../../samples//data/", // up 5 in tree, - // "/samples//" - // subdir - "../../../../../common/", // up 5 in tree, "../../../common/" subdir - "../../../../../common/data/", // up 5 in tree, "../../../common/data/" - // subdir - }; - - // Extract the executable name - std::string executable_name; - - if (executable_path != 0) { - executable_name = std::string(executable_path); - -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - // Windows path delimiter - size_t delimiter_pos = executable_name.find_last_of('\\'); - executable_name.erase(0, delimiter_pos + 1); - - if (executable_name.rfind(".exe") != std::string::npos) { - // we strip .exe, only if the .exe is found - executable_name.resize(executable_name.size() - 4); - } - -#else - // Linux & OSX path delimiter - size_t delimiter_pos = executable_name.find_last_of('/'); - executable_name.erase(0, delimiter_pos + 1); -#endif - } - - // Loop over all search paths and return the first hit - for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { - std::string path(searchPath[i]); - size_t executable_name_pos = path.find(""); - - // If there is executable_name variable in the searchPath - // replace it with the value - if (executable_name_pos != std::string::npos) { - if (executable_path != 0) { - path.replace(executable_name_pos, strlen(""), - executable_name); - } else { - // Skip this path entry if no executable argument is given - continue; - } - } - -#ifdef _DEBUG - printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); -#endif - - // Test if the file exists - path.append(filename); - FILE *fp; - FOPEN(fp, path.c_str(), "rb"); - - if (fp != NULL) { - fclose(fp); - // File found - // returning an allocated array here for backwards compatibility reasons - char *file_path = reinterpret_cast(malloc(path.length() + 1)); - STRCPY(file_path, path.length() + 1, path.c_str()); - return file_path; - } - - if (fp) { - fclose(fp); - } - } - - // File not found - return 0; -} - -#endif // COMMON_HELPER_STRING_H_ diff --git a/src/qfvm_gpu/cuda_utils/ticktock.h b/src/qfvm_gpu/cuda_utils/ticktock.h deleted file mode 100644 index 1adb8a9..0000000 --- a/src/qfvm_gpu/cuda_utils/ticktock.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once - -//#include -//#define TICK(x) auto bench_##x = std::chrono::steady_clock::now(); -//#define TOCK(x) std::cout << #x ": " << std::chrono::duration_cast>(std::chrono::steady_clock::now() - bench_##x).count() << "s" << std::endl; - -#include -#define TICK(x) auto bench_##x = tbb::tick_count::now(); -#define TOCK(x) std::cout << #x ": " << (tbb::tick_count::now() - bench_##x).seconds() << "s" << std::endl; diff --git a/src/qfvm_gpu/custate_simu.cuh b/src/qfvm_gpu/custate_simu.cuh deleted file mode 100644 index 6456c7e..0000000 --- a/src/qfvm_gpu/custate_simu.cuh +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once -#include "cuda_statevector.cuh" -#include -#include -#include -#include "apply_gate_custate.cuh" - -void simulate_custate(Circuit & circuit, CudaStateVector & psi_d){ - size_t size = psi_d.size(); - int n = psi_d.num(); - for (auto gate : circuit.gates()){ - apply_gate_custate(psi_d.data(), gate, n); - } -} - -void simulate_custate(Circuit & circuit, StateVector & state){ - //initialize psi - state.set_num(circuit.qubit_num()); - size_t size = state.size(); - CudaStateVector psi_d(state); - - simulate_custate(circuit, psi_d); - cudaDeviceSynchronize(); - - //copy back - complex* psi = reinterpret_cast*>(psi_d.data()); - checkCudaErrors(cudaMemcpy(state.data(), psi, size*sizeof(complex), cudaMemcpyDeviceToHost)); - psi=nullptr; -} - -StateVector simulate_custate(Circuit & circuit){ - StateVector state(circuit.qubit_num()); - simulate_custate(circuit, state); - return std::move(state); -} \ No newline at end of file From 2177df4b0aef500d91642f1a67f89e8f9965289a Mon Sep 17 00:00:00 2001 From: chensgit169 Date: Thu, 30 Nov 2023 16:09:35 +0800 Subject: [PATCH 3/5] Revert "remove redundant files" This reverts commit 4efd9b53b12b7c0faf54462101474e8c4810a07a. --- src/qfvm/circuit.hpp | 241 +++ src/qfvm/operators.hpp | 97 ++ src/qfvm/qasm.hpp | 42 + src/qfvm/qfvm.cpp | 164 ++ src/qfvm/simulator.hpp | 116 ++ src/qfvm/statevector.hpp | 1355 +++++++++++++++++ src/qfvm/types.hpp | 33 + src/qfvm/util.h | 99 ++ src/qfvm_gpu/apply_gate_custate.cuh | 46 + src/qfvm_gpu/apply_gate_gpu.cuh | 411 +++++ src/qfvm_gpu/cuda_simulator.cuh | 91 ++ src/qfvm_gpu/cuda_statevector.cuh | 58 + src/qfvm_gpu/cuda_utils/CudaTexture.h | 39 + src/qfvm_gpu/cuda_utils/helper_cuda.h | 953 ++++++++++++ src/qfvm_gpu/cuda_utils/helper_custatevec.hpp | 31 + src/qfvm_gpu/cuda_utils/helper_string.h | 683 +++++++++ src/qfvm_gpu/cuda_utils/ticktock.h | 9 + src/qfvm_gpu/custate_simu.cuh | 35 + 18 files changed, 4503 insertions(+) create mode 100644 src/qfvm/circuit.hpp create mode 100644 src/qfvm/operators.hpp create mode 100644 src/qfvm/qasm.hpp create mode 100644 src/qfvm/qfvm.cpp create mode 100644 src/qfvm/simulator.hpp create mode 100644 src/qfvm/statevector.hpp create mode 100644 src/qfvm/types.hpp create mode 100644 src/qfvm/util.h create mode 100644 src/qfvm_gpu/apply_gate_custate.cuh create mode 100644 src/qfvm_gpu/apply_gate_gpu.cuh create mode 100644 src/qfvm_gpu/cuda_simulator.cuh create mode 100644 src/qfvm_gpu/cuda_statevector.cuh create mode 100644 src/qfvm_gpu/cuda_utils/CudaTexture.h create mode 100644 src/qfvm_gpu/cuda_utils/helper_cuda.h create mode 100644 src/qfvm_gpu/cuda_utils/helper_custatevec.hpp create mode 100644 src/qfvm_gpu/cuda_utils/helper_string.h create mode 100644 src/qfvm_gpu/cuda_utils/ticktock.h create mode 100644 src/qfvm_gpu/custate_simu.cuh diff --git a/src/qfvm/circuit.hpp b/src/qfvm/circuit.hpp new file mode 100644 index 0000000..cd08bea --- /dev/null +++ b/src/qfvm/circuit.hpp @@ -0,0 +1,241 @@ +#pragma once +#include "operators.hpp" +#include "qasm.hpp" +#include +#include +#include +#include +#include "util.h" +#include +#include +#include + +namespace py = pybind11; +using namespace pybind11::literals; + + +void check_operator(QuantumOperator &op){ + std::cout << "-------------" << std::endl; + + std::cout << "name: " << op.name() << std::endl; + std::cout << "pos: "; + Qfutil::printVector(op.positions()); + + std::cout << "paras: "; + Qfutil::printVector(op.paras()); + + std::cout << "control number: "; + std::cout << op.control_num() << std::endl; + + std::cout << "matrix: " << std::endl; + std::cout << op.mat() << std::endl; + + std::cout << "flatten matrix: " << std::endl; + auto mat = op.mat(); + // Eigen::Map v1(mat.data(), mat.size()); + // std::cout << "v1: " << v1 << std::endl; + auto matv = mat.data(); + for (auto i = 0;i < mat.size();i++){ + std::cout << matv[i] << " "; + } + std::cout << std::endl; + std::cout << "-------------" << std::endl; +} + + +class Circuit{ + private: + uint qubit_num_; + vector instructions_; + uint max_targe_num_; + uint cbit_num_; + // to sample count + vector> measure_vec_; + bool final_measure_ = true; + + public: + Circuit(); + explicit Circuit(uint qubit_num); + explicit Circuit(vector &ops); + explicit Circuit(py::object const&pycircuit); + + void add_op(QuantumOperator &op); + void compress_instructions(); + uint qubit_num() const { return qubit_num_; } + uint cbit_num() const { return cbit_num_; } + uint max_targe_num() const { return max_targe_num_; } + bool final_measure() const { return final_measure_; } + vector gates(); + vector> measure_vec() { return measure_vec_; } + vector instructions() const { return instructions_; } + QuantumOperator from_pyops(py::object const &obj); +}; + +void Circuit::add_op(QuantumOperator &op){ + for (pos_t pos : op.positions()){ + if (pos > qubit_num_) { + throw "invalid position on quantum registers"; + } + else{ + instructions_.push_back(op); + } + } +} + + Circuit::Circuit(){}; + Circuit::Circuit(uint qubit_num) + : + qubit_num_(qubit_num){ } + + Circuit::Circuit(vector &ops) + : + instructions_(ops), + max_targe_num_(0){ + qubit_num_ = 0; + for (auto op : ops){ + for (pos_t pos : op.positions()){ + if (op.targe_num() > max_targe_num_) + max_targe_num_ = op.targe_num(); + if (pos+1 > qubit_num_){ qubit_num_ = pos+1; } + } + } +} + +vector Circuit::gates(){ + // provide gates for gpu and custate + std::vector classics = {"measure", "cif", "reset"}; + vector gates; + for(auto op : instructions_){ + if(std::find(classics.begin(), classics.end(), op.name()) == classics.end()){ + gates.push_back(op); + } + } + return gates; +} + +// Construct C++ operators from pygates +QuantumOperator Circuit::from_pyops(py::object const &obj){ + string name; + vector positions; + vector qbits; + vector cbits; + vector paras; + uint control_num = 0; + RowMatrixXcd mat; + + name = obj.attr("name").attr("lower")().cast(); + if (!(name == "barrier" || name == "delay" || name == "id" || name == "measure" || name == "reset" || name == "cif")) + { + if (py::isinstance(obj.attr("pos"))){ + positions = obj.attr("pos").cast>(); + } + else if(py::isinstance(obj.attr("pos"))){ + positions = vector{obj.attr("pos").cast()}; + } + + if (py::isinstance(obj.attr("paras"))){ + paras = obj.attr("paras").cast>(); + } + else if(py::isinstance(obj.attr("paras")) || py::isinstance(obj.attr("paras"))){ + paras = vector{obj.attr("paras").cast()}; + } + + if (py::hasattr(obj, "ctrls")){ + control_num = py::len(obj.attr("ctrls")); + } + + //Reverse order for multi-target gate + if (py::hasattr(obj, "_targ_matrix")){ + mat = obj.attr("get_targ_matrix")("reverse_order"_a=true).cast(); + } + else{ //Single target gate + mat = obj.attr("matrix").cast(); + } + return QuantumOperator(name, paras, positions, control_num, mat); + + }else if(name == "measure"){ + if (py::isinstance(obj.attr("qbits"))){ + qbits = obj.attr("qbits").cast>(); + }else if(py::isinstance(obj.attr("qbits"))){ + qbits = vector{obj.attr("qbits").cast()}; + } + + if (py::isinstance(obj.attr("cbits"))){ + cbits = obj.attr("cbits").cast>(); + }else if(py::isinstance(obj.attr("cbits"))){ + cbits = vector{obj.attr("cbits").cast()}; + } + //record qbit-cbit measure map + for(uint i = 0; i < qbits.size(); i++){ + measure_vec_.push_back(std::make_pair(qbits[i], cbits[i])); + } + return QuantumOperator(name, qbits, cbits); + + }else if(name == "reset"){ + if (py::isinstance(obj.attr("pos"))){ + positions = obj.attr("pos").cast>(); + } + else if(py::isinstance(obj.attr("pos"))){ + positions = vector{obj.attr("pos").cast()}; + } + return QuantumOperator(name, positions); + + }else if(name == "cif"){ + uint condition = 0; + vector instructions; + if (py::isinstance(obj.attr("cbits"))){ + cbits = obj.attr("cbits").cast>(); + }else if(py::isinstance(obj.attr("cbits"))){ + cbits = vector{obj.attr("cbits").cast()}; + } + + if(py::isinstance(obj.attr("condition"))){ + condition = obj.attr("condition").cast(); + } + + // Recursively handdle instruction + if (py::isinstance(obj.attr("instructions"))){ + auto pyops = obj.attr("instructions"); + for(auto pyop_h : pyops){ + py::object pyop = py::reinterpret_borrow(pyop_h); + QuantumOperator op = from_pyops(pyop); + if (op){ + if (op.targe_num() > max_targe_num_) + max_targe_num_ = op.targe_num(); + instructions.push_back(std::move(op)); + } + } + } + return QuantumOperator(name, cbits, condition, instructions); + + }else{ + return QuantumOperator(); + } + +} + +Circuit::Circuit(py::object const&pycircuit) +: +max_targe_num_(0) +{ + // auto pygates = pycircuit.attr("gates"); + auto pyops = pycircuit.attr("instructions"); + auto used_qubits = pycircuit.attr("used_qubits").cast>(); + cbit_num_ = pycircuit.attr("cbits_num").cast(); + qubit_num_ = *std::max_element(used_qubits.begin(), used_qubits.end())+1; + // judge wheather op qubit after measure + bool measured = false; + for (auto pyop_h : pyops){ + py::object pyop = py::reinterpret_borrow(pyop_h); + QuantumOperator op = from_pyops(pyop); + if (op){ + if (op.targe_num() > max_targe_num_) + max_targe_num_ = op.targe_num(); + if(op.name() == "measure") {measured = true;} + else if(measured == true) {final_measure_ = false; } + instructions_.push_back(std::move(op)); + } + } +} + +void Circuit::compress_instructions(){} diff --git a/src/qfvm/operators.hpp b/src/qfvm/operators.hpp new file mode 100644 index 0000000..cb9fd66 --- /dev/null +++ b/src/qfvm/operators.hpp @@ -0,0 +1,97 @@ +#pragma once + +#include +#include "statevector.hpp" + +class QuantumOperator{ + protected: + string name_; + vector positions_; + vector paras_; + uint control_num_; + uint targe_num_; + bool diag_; + bool real_; + RowMatrixXcd mat_; + vector qbits_; + vector cbits_; + vector instructions_; + uint condition_; + public: + //Constructor + QuantumOperator(); + QuantumOperator(string name, vector const &qbits); + QuantumOperator(string name, vector const &qbits, vector const &cbits); + QuantumOperator(string name, vector const &cbits, const uint condition, vector const &ins); + QuantumOperator(string name, vector paras, vector const &control_qubits, vector const &targe_qubits, RowMatrixXcd const &mat, bool diag=false, bool real=false); + QuantumOperator(string name,vector paras, vector const &positions, uint control_num, RowMatrixXcd const &mat, bool diag=false, bool real=false); + + //data accessor + string name() const {return name_;} + vector paras() const {return paras_;} + bool has_control() const{return control_num_ == 0 ? false : true;} + bool is_real() const{ return real_; } + bool is_diag() const{ return diag_; } + RowMatrixXcd mat() const { return mat_;} + uint control_num() const { return control_num_; } + uint targe_num() const { return targe_num_; } + uint condition() const { return condition_; } + vector positions(){ return positions_; } + explicit operator bool() const { + return !(name_ == "empty"); + } + vector qbits(){ return qbits_; } + vector cbits(){ return cbits_; } + vector instructions(){ return instructions_; } + //Apply method + virtual void apply_to_state(StateVector & state){ }; +}; + + +QuantumOperator::QuantumOperator() : name_("empty"){ }; + +QuantumOperator::QuantumOperator(string name, vector const &qbits) +: +name_(name), +targe_num_(0), +qbits_(qbits){} + +QuantumOperator::QuantumOperator(string name, vector const &qbits, vector const &cbits) +: +name_(name), +targe_num_(0), +qbits_(qbits), +cbits_(cbits){} + +QuantumOperator::QuantumOperator(string name, vector const &cbits, const uint condition, vector const &ins) +: +name_(name), +targe_num_(0), +cbits_(cbits), +instructions_(ins), +condition_(condition){} + +QuantumOperator::QuantumOperator(string name, vector paras, vector const &positions, uint control_num, RowMatrixXcd const &mat, bool diag, bool real) +: +name_(name), +paras_(paras), +positions_(positions), +control_num_(control_num), +targe_num_(positions.size()-control_num), +diag_(diag), +real_(real), +mat_(mat){ } + +QuantumOperator::QuantumOperator(string name, vector paras, vector const &control_qubits, vector const &targe_qubits, RowMatrixXcd const &mat, bool diag, bool real) +: +name_(name), +paras_(paras), +diag_(diag), +real_(real), +mat_(mat){ + positions_ = control_qubits; + positions_.insert(positions_.end(), targe_qubits.begin(), targe_qubits.end()); + control_num_ = control_qubits.size(); + targe_num_ = targe_qubits.size(); +} + diff --git a/src/qfvm/qasm.hpp b/src/qfvm/qasm.hpp new file mode 100644 index 0000000..e0a2285 --- /dev/null +++ b/src/qfvm/qasm.hpp @@ -0,0 +1,42 @@ +#pragma once + +#include +#include "types.hpp" +#include "util.h" + +using Qfutil::split_string; +using Qfutil::find_numbers; +#define Pair(name) {#name, Opname::name} + +enum class Opname{ + creg, x, y, z, h, s, sdg, t, tdg, p, rx, ry, rz, cnot, cx, cz, crx, cp, ccx, toffoli, swap, iswap, rxx, ryy, rzz, measure, reset, cif +}; + +std::unordered_map OPMAP{Pair(creg), Pair(x), Pair(y), Pair(z), Pair(h), Pair(s), Pair(sdg), Pair(t), + Pair(tdg), Pair(p), Pair(rx), Pair(ry), Pair(rz), Pair(cnot), Pair(cx), Pair(cz), + Pair(crx), Pair(cp), Pair(ccx), Pair(swap), Pair(iswap), Pair(rxx), Pair(ryy), + Pair(rzz), Pair(measure), Pair(reset), Pair(cif)}; + +struct Operation{ + string name; + vector positions; + vector params; + + void print_info(){ + std::cout << "name " << name << std::endl; + std::cout << "positions: "; + for (auto pos : positions){ + std::cout << pos << " "; + } + std::cout << std::endl; + + if (params.size() > 0){ + printf("parameters: "); + for (auto para : params){ + printf("%.6f ", para); + } + } + printf("\n"); + printf("-----\n"); + } +} ; diff --git a/src/qfvm/qfvm.cpp b/src/qfvm/qfvm.cpp new file mode 100644 index 0000000..f318da6 --- /dev/null +++ b/src/qfvm/qfvm.cpp @@ -0,0 +1,164 @@ +#include +#include +#include "simulator.hpp" +#include +#include +#ifdef _USE_GPU +#include +#endif + +#ifdef _USE_CUQUANTUM +#include +#endif + +namespace py = pybind11; + +template +py::array_t to_numpy(const std::tuple &src) { + auto src_ptr = std::get<0>(src); + auto src_size = std::get<1>(src); + + auto capsule = py::capsule(src_ptr, [](void* p) { + delete [] reinterpret_cast(p); + }); + return py::array_t( + src_size, + src_ptr, + capsule + ); +} + +std::pair, py::array_t> > simulate_circuit(py::object const&pycircuit, py::array_t> &np_inputstate, const int &shots){ + auto circuit = Circuit(pycircuit); + py::buffer_info buf = np_inputstate.request(); + auto* data_ptr = reinterpret_cast*>(buf.ptr); + size_t data_size = buf.size; + // If measure all at the end, simulate once + uint actual_shots = shots; + if (circuit.final_measure()) actual_shots = 1; + StateVector global_state; + vector>measures = circuit.measure_vec(); + std::mapcbit_measured; + for(auto &pair: measures){ + cbit_measured[pair.second] = true; + } + // Store outcome's count + std::map outcount; + for(uint i =0; i < actual_shots; i++){ + StateVector state; + if(data_size == 0){ + simulate(circuit, state); + }else{ + //deepcopy state + vector> data_copy(data_ptr, data_ptr + data_size); + state = std::move(StateVector(data_copy.data(), data_copy.size())); + simulate(circuit, state); + } + if(!circuit.final_measure()){ + // store reg + vector tmpcreg = state.creg(); + uint outcome = 0; + for(uint j=0;j tmpcount(global_state.size(), 0); + vector probs = global_state.probabilities(); + std::random_device rd; + std::mt19937 global_rng(rd()); + for(uint i = 0; i < shots; i++){ + uint outcome = std::discrete_distribution(probs.begin(), probs.end())(global_rng); + tmpcount[outcome]++; + } + // map to reg + for(uint i = 0; i < global_state.size(); i++){ + if(tmpcount[i] == 0) continue; + vector tmpcreg(global_state.cbit_num(), 0); + vector tmpout = int2vec(i, 2); + if(tmpout.size() < global_state.num()) + tmpout.resize(global_state.num()); + for(auto &pair: measures){ + tmpcreg[pair.second] = tmpout[pair.first]; + } + uint outcome = 0; + for(uint j=0;j> &np_inputstate){ + auto circuit = Circuit(pycircuit); + py::buffer_info buf = np_inputstate.request(); + auto* data_ptr = reinterpret_cast*>(buf.ptr); + size_t data_size = buf.size; + + + if (data_size == 0){ + StateVector state; + simulate_gpu(circuit, state); + return to_numpy(state.move_data_to_python()); + } + else{ + StateVector state(data_ptr, buf.size); + simulate_gpu(circuit, state); + state.move_data_to_python(); + return np_inputstate; + } +} +#endif + +#ifdef _USE_CUQUANTUM +py::object simulate_circuit_custate(py::object const&pycircuit, py::array_t> &np_inputstate){ + auto circuit = Circuit(pycircuit); + py::buffer_info buf = np_inputstate.request(); + auto* data_ptr = reinterpret_cast*>(buf.ptr); + size_t data_size = buf.size; + + + if (data_size == 0){ + StateVector state; + simulate_custate(circuit, state); + return to_numpy(state.move_data_to_python()); + } + else{ + StateVector state(data_ptr, buf.size); + simulate_custate(circuit, state); + state.move_data_to_python(); + return np_inputstate; + } +} +#endif + + + +PYBIND11_MODULE(qfvm, m) { + m.doc() = "Qfvm simulator"; + m.def("simulate_circuit", &simulate_circuit, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t>(0), py::arg("shots")); + + #ifdef _USE_GPU + m.def("simulate_circuit_gpu", &simulate_circuit_gpu, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t>(0)); + #endif + + #ifdef _USE_CUQUANTUM + m.def("simulate_circuit_custate", &simulate_circuit_custate, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t>(0)); + #endif +} + diff --git a/src/qfvm/simulator.hpp b/src/qfvm/simulator.hpp new file mode 100644 index 0000000..1499922 --- /dev/null +++ b/src/qfvm/simulator.hpp @@ -0,0 +1,116 @@ +#pragma once + +#include "statevector.hpp" +#include "circuit.hpp" + +void apply_op(QuantumOperator &op, StateVector &state){ + bool matched = false; + switch (OPMAP[op.name()]){ + //Named gate + case Opname::x: + state.apply_x(op.positions()[0]); + break; + case Opname::y: + state.apply_y(op.positions()[0]); + break; + case Opname::z: + state.apply_z(op.positions()[0]); + break; + case Opname::h: + state.apply_h(op.positions()[0]); + break; + case Opname::s: + state.apply_s(op.positions()[0]); + break; + case Opname::sdg: + state.apply_sdag(op.positions()[0]); + break; + case Opname::t: + state.apply_t(op.positions()[0]); + break; + case Opname::tdg: + state.apply_tdag(op.positions()[0]); + break; + case Opname::p: + state.apply_p(op.positions()[0], op.paras()[0]); + break; + case Opname::rx: + state.apply_rx(op.positions()[0], op.paras()[0]); + break; + case Opname::ry: + state.apply_ry(op.positions()[0], op.paras()[0]); + break; + case Opname::rz: + state.apply_rz(op.positions()[0], op.paras()[0]); + break; + case Opname::cx: + state.apply_cnot(op.positions()[0], op.positions()[1]); + break; + case Opname::cnot: + state.apply_cnot(op.positions()[0], op.positions()[1]); + break; + case Opname::cp: + state.apply_cp(op.positions()[0], op.positions()[1], op.paras()[0]); + break; + case Opname::cz: + state.apply_cz(op.positions()[0], op.positions()[1]); + break; + case Opname::ccx: + state.apply_ccx(op.positions()[0], op.positions()[1], op.positions()[2]); + break; + case Opname::toffoli: + state.apply_ccx(op.positions()[0], op.positions()[1], op.positions()[2]); + case Opname::rzz: + state.apply_cnot(op.positions()[0], op.positions()[1]); + state.apply_rz(op.positions()[1], op.paras()[0]); + state.apply_cnot(op.positions()[0], op.positions()[1]); + break; + case Opname::measure: + state.apply_measure(op.qbits(), op.cbits()); + break; + case Opname::reset: + state.apply_reset(op.qbits()); + break; + case Opname::cif: + // check cbits and condition + matched = state.check_cif(op.cbits(), op.condition()); + // apply op in instructions + if(matched){ + for(auto op_h :op.instructions()){ + apply_op(op_h, state); + } + } + break; + //Other general gate + default: + { + if (op.targe_num() == 1){ + auto mat_temp = op.mat(); + complex *mat = mat_temp.data(); + if (op.control_num() == 0){ + state.apply_one_targe_gate_general<0>(op.positions(), mat); + }else if (op.control_num() == 1){ + state.apply_one_targe_gate_general<1>(op.positions(), mat); + }else{ + state.apply_one_targe_gate_general<2>(op.positions(), mat); + } + }else if(op.targe_num() > 1){ + state.apply_multi_targe_gate_general(op.positions(), op.control_num(), op.mat()); + }else{ + throw "Invalid target number"; + } + } + } +} + +void simulate(Circuit const& circuit, StateVector & state){ + state.set_num(circuit.qubit_num()); + state.set_creg(circuit.cbit_num()); + // skip measure and handle it in qfvm.cpp + bool skip_measure = circuit.final_measure(); + for (auto op : circuit.instructions()){ + if(skip_measure == true && op.name() == "measure") continue; + apply_op(op , state); + } +} + diff --git a/src/qfvm/statevector.hpp b/src/qfvm/statevector.hpp new file mode 100644 index 0000000..99cf5f7 --- /dev/null +++ b/src/qfvm/statevector.hpp @@ -0,0 +1,1355 @@ +#pragma once +#include "types.hpp" +#include "util.h" +#include +#include +#include +#include +#include +#include +#include +#ifdef USE_SIMD +#ifdef _MSC_VER +#include +#else +#include +#endif +#endif + +template +class StateVector{ + private: + uint num_; + // classical bit + uint cbit_num_; + vector creg_; + size_t size_; + std::unique_ptr[]> data_; + //random engine + std::mt19937_64 rng_; + + public: + //construct function + StateVector(); + explicit StateVector(uint num); + explicit StateVector(complex *data, size_t data_size); + //move assign + // StateVector& operator=(StateVector&& other){ + // if(this != &other){ + // data_ = std::move(other.data_); + // creg_ = std::move(other.creg_); + // num_ = other.num_; + // cbit_num_ = other.cbit_num_; + // size_ = other.size_; + + // } + // return *this; + // } + + //Named gate function + void apply_x(pos_t pos); + void apply_y(pos_t pos); + void apply_z(pos_t pos); + void apply_h(pos_t pos); + void apply_s(pos_t pos); + void apply_sdag(pos_t pos); + void apply_t(pos_t pos); + void apply_tdag(pos_t pos); + void apply_p(pos_t pos, real_t phase); + void apply_rx(pos_t pos, real_t theta); + void apply_ry(pos_t pos, real_t theta); + void apply_rz(pos_t pos, real_t theta); + void apply_cnot(pos_t control, pos_t targe); + void apply_cz(pos_t control, pos_t targe); + void apply_cp(pos_t control, pos_t targe, real_t phase); + void apply_crx(pos_t control, pos_t targe, real_t theta); + void apply_cry(pos_t control, pos_t targe, real_t theta); + void apply_ccx(pos_t control1, pos_t control2, pos_t targe); + void apply_swap(pos_t q1, pos_t q2); + + //General implementation + //One-target gate, ctrl_num equal 2 represent multi-controlled gate + template + void apply_one_targe_gate_general(vector const& posv, complex *mat); + template + void apply_one_targe_gate_diag(vector const& posv, complex *mat); + template + void apply_one_targe_gate_real(vector const& posv, complex *mat); + template + void apply_one_targe_gate_x(vector const& posv); + + //Multiple-target gate + void apply_multi_targe_gate_general(vector const& posv, uint control_num, RowMatrixXcd const&mat); + + // Measure and Reset + std::pair sample_measure_probs(vector const& qbits); + vector probabilities() const; + void apply_diagonal_matrix(vector const& qbits, vector > const& mdiag); + void update(vector const& qbits, const uint final_state, const uint meas_state, const double meas_prob); + void apply_measure(vector const& qbits,const vector &cbits); + void apply_reset(vector const& qbits); + + // cif check + bool check_cif(const vector &cbits, const uint condition); + + complex operator[] (size_t j) const ; + void set_num(uint num); + void set_creg(uint num){ + if(num > 0){ + cbit_num_ = num; + creg_.resize(cbit_num_, 0); + }else{ + throw std::logic_error("The number of cbit must be positive."); + } + } + + vector creg(){ + return creg_; + } + + void set_rng(){ + std::random_device rd; + rng_.seed(rd()); + } + + void print_state(); + std::tuple*, size_t> move_data_to_python() { + auto data_ptr = data_.release(); + return std::make_tuple(std::move(data_ptr), size_); + } + + complex* data(){ return data_.get(); } + size_t size(){ return size_; } + uint num(){ return num_; } + uint cbit_num(){ return cbit_num_; } +}; + + +//////// constructors /////// + +template +StateVector::StateVector(uint num) +: num_(num), +size_(1ULL<[]>(size_); + data_[0] = complex(1., 0); +}; + +template +StateVector::StateVector() : StateVector(0){ } + +template +StateVector::StateVector(complex *data, size_t data_size) +: +data_(data), +size_(data_size) +{ + num_ = static_cast(std::log2(size_)); +} + + + +//// useful functions ///// +template +std::complex StateVector::operator[] (size_t j) const{ + return data_[j]; +} + +template +void StateVector::set_num(uint num){ + if (num_ > 0) { + // Initialized from statevector, + // should not resize + return; + } + num_ = num; + + if (size_ != 1ULL << num) { + data_.reset(); + size_ = 1ULL << num; + data_ = std::make_unique[]>(size_); + data_[0] = complex(1, 0); + } +} +template +bool StateVector::check_cif(const vector &cbits, const uint condition){ + uint out = 0; + for(uint i = 0; i < cbits.size(); i++){ + out *= 2; + out += creg_[cbits[i]]; + } + return out == condition; +} + +template +void StateVector::print_state(){ + std::cout << "state_data: "; + for (auto i=0;i +void StateVector::apply_x(pos_t pos){ + const size_t offset = 1<>1; + if (pos == 0){ //single step +#ifdef USE_SIMD +#pragma omp parallel for + for(omp_i j = 0;j < size_;j+=2){ + double* ptr = (double*)(data_.get() + j); + __m256d data = _mm256_loadu_pd(ptr); + data = _mm256_permute4x64_pd(data, 78); + _mm256_storeu_pd(ptr, data); + } +#else +#pragma omp parallel for + for(omp_i j = 0;j < size_;j+=2){ + std::swap(data_[j], data_[j+1]); + } +#endif + } + else{ +#ifdef USE_SIMD +#pragma omp parallel for + for(omp_i j = 0;j < rsize;j += 2){ + size_t i = (j&(offset-1)) | (j>>pos<>pos< +void StateVector::apply_y(pos_t pos){ + const size_t offset = 1<>1; + const complex im = imag_I; + if (pos == 0){ //single step +#ifdef USE_SIMD + __m256d minus_half = _mm256_set_pd(1, -1, -1, 1); +#pragma omp parallel for + for(omp_i j = 0;j < size_;j+=2){ + double* ptr = (double*)(data_.get() + j); + __m256d data = _mm256_loadu_pd(ptr); + data = _mm256_permute4x64_pd(data, 27); + data = _mm256_mul_pd(data, minus_half); + _mm256_storeu_pd(ptr, data); + } +#else +#pragma omp parallel for + for(omp_i j = 0;j < size_;j+=2){ + complex temp = data_[j]; + data_[j] = -im*data_[j+1]; + data_[j+1] = im*temp; + } +#endif + } + else{ +#ifdef USE_SIMD + __m256d minus_even = _mm256_set_pd(1, -1, 1, -1); + __m256d minus_odd = _mm256_set_pd(-1, 1, -1, 1); + +#pragma omp parallel for + for(omp_i j = 0;j < rsize;j += 2){ + size_t i = (j&(offset-1)) | (j>>pos<>pos< temp = data_[i]; + data_[i] = -im*data_[i+offset]; + data_[i+offset] = im*temp; + complex temp1 = data_[i1]; + data_[i1] = -im*data_[i1+offset]; + data_[i1+offset] = im*temp1; + } +#endif + } +} + +template +void StateVector::apply_z(pos_t pos){ + const size_t offset = 1<>1; + if (pos == 0){ //single step +#pragma omp parallel for + for(omp_i j = 1;j < size_;j+=2){ + data_[j] *= -1; + } + } + else{ +#ifdef USE_SIMD + __m256d minus_one = _mm256_set_pd(-1, -1, -1, -1); +#pragma omp parallel for + for(omp_i j = 0;j < rsize;j += 2){ + size_t i = (j&(offset-1)) | (j>>pos<>pos< +void StateVector::apply_h(pos_t pos){ + const double sqrt2inv = 1. / std::sqrt(2.); + complex mat[4] = {sqrt2inv, sqrt2inv, sqrt2inv, -sqrt2inv}; + apply_one_targe_gate_real<0>(vector{pos}, mat); + +} + +template +void StateVector::apply_s(pos_t pos){ + complex mat[2] = {1., imag_I}; + apply_one_targe_gate_diag<0>(vector{pos}, mat); +} + +template +void StateVector::apply_sdag(pos_t pos){ + complex mat[2] = {1., -imag_I}; + apply_one_targe_gate_diag<0>(vector{pos}, mat); + +} + +template +void StateVector::apply_t(pos_t pos){ + complex p = imag_I*PI/4.; + complex mat[2] = {1., std::exp(p)}; + apply_one_targe_gate_diag<0>(vector{pos}, mat); + +} + +template +void StateVector::apply_tdag(pos_t pos){ + complex p = -imag_I*PI/4.; + complex mat[2] = {1., std::exp(p)}; + apply_one_targe_gate_diag<0>(vector{pos}, mat); + +} + +template +void StateVector::apply_p(pos_t pos, real_t phase){ + complex p = imag_I*phase; + complex mat[2] = {1., std::exp(p)}; + apply_one_targe_gate_diag<0>(vector{pos}, mat); +} + + +template +void StateVector::apply_rx(pos_t pos, real_t theta){ + complex mat[4] = {std::cos(theta/2), -imag_I*std::sin(theta/2), -imag_I*std::sin(theta/2), std::cos(theta/2)}; + apply_one_targe_gate_general<0>(vector{pos}, mat); +} + + +template +void StateVector::apply_ry(pos_t pos, real_t theta){ + complex mat[4] = {std::cos(theta/2), -std::sin(theta/2),std::sin(theta/2), std::cos(theta/2)}; + apply_one_targe_gate_real<0>(vector{pos}, mat); +} + +template +void StateVector::apply_rz(pos_t pos, real_t theta){ + complex z0 = -imag_I*theta/2.; + complex z1 = imag_I*theta/2.; + complex mat[2] = {std::exp(z0), std::exp(z1)}; + apply_one_targe_gate_diag<0>(vector{pos}, mat); +} + +template +void StateVector::apply_cnot(pos_t control, pos_t targe){ + apply_one_targe_gate_x<1>(vector{control, targe}); +} + +template +void StateVector::apply_cz(pos_t control, pos_t targe){ + complex mat[2] = {1., -1.}; + apply_one_targe_gate_diag<1>(vector{control, targe}, mat); +} + +template +void StateVector::apply_cp(pos_t control, pos_t targe, real_t phase){ + complex p = imag_I*phase; + complex mat[2] = {1., std::exp(p)}; + apply_one_targe_gate_diag<1>(vector{control, targe}, mat); +} + +template +void StateVector::apply_crx(pos_t control, pos_t targe, real_t theta){ + complex mat[4] = {std::cos(theta/2), -imag_I*std::sin(theta/2), -imag_I*std::sin(theta/2), std::cos(theta/2)}; + + apply_one_targe_gate_general<1>(vector{control, targe}, mat); +} + +template +void StateVector::apply_cry(pos_t control, pos_t targe, real_t theta){ + complex mat[4] = {std::cos(theta/2), -std::sin(theta/2),std::sin(theta/2), std::cos(theta/2)}; + + apply_one_targe_gate_real<1>(vector{control, targe}, mat); +} + +template +void StateVector::apply_ccx(pos_t control1, pos_t control2, pos_t targe){ + apply_one_targe_gate_x<2>(vector{control1, control2, targe}); +} + +/////// General implementation ///////// + +template +template +void StateVector::apply_one_targe_gate_general(vector const& posv, complex *mat) +{ + std::function getind_func_near; + std::function getind_func; + size_t rsize; + size_t offset; + size_t targe; + size_t control = 0; + size_t setbit; + size_t poffset; + bool has_control=false; + vector posv_sorted = posv; + if (ctrl_num == 0){ + targe = posv[0]; + offset = 1ll<>1; + getind_func_near = [&](size_t j)-> size_t { + return 2*j; + }; + + getind_func = [&](size_t j)-> size_t { + return (j&(offset-1)) | (j>>targe<targe) { + control--; + } + poffset=1ll<>2; + getind_func = [&](size_t j) -> size_t { + size_t i = (j>>control<<(control+1))|(j&(poffset-1)); + i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit; + return i; + }; + + getind_func_near = getind_func; + + + } + else if(ctrl_num == 2){ + has_control = true; + control = *min_element(posv.begin(), posv.end()-1); + targe = *(posv.end()-1); + offset = 1ll<>posv.size(); + getind_func = [&](size_t j)-> size_t{ + size_t i = j; + for (size_t k=0;k < posv.size();k++) + { + size_t _pos = posv_sorted[k]; + i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1); + } + for (size_t k=0;k < posv.size()-1;k++){ + i |= 1ll< mat00 = mat[0]; + const complex mat01 = mat[1]; + const complex mat10 = mat[2]; + const complex mat11 = mat[3]; + if (targe == 0){ +#pragma omp parallel for + for(omp_i j = 0;j < rsize;j++){ + size_t i = getind_func_near(j); + complex temp = data_[i]; + data_[i] = mat00*data_[i] + mat01*data_[i+1]; + data_[i+1] = mat10*temp + mat11*data_[i+1]; + } + }else if (has_control && control == 0){ //single step +#pragma omp parallel for + for(omp_i j = 0;j < rsize;j++){ + size_t i = getind_func(j); + complex temp = data_[i]; + data_[i] = mat00*data_[i] + mat01*data_[i+offset]; + data_[i+offset] = mat10*temp + mat11*data_[i+offset]; + } + + }else{//unroll to 2 +#ifdef USE_SIMD + __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(), mat[0].real()); + __m256d m_00im = _mm256_set_pd(mat[0].imag(), -mat[0].imag(), mat[0].imag(), -mat[0].imag()); + __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(), mat[1].real(), mat[1].real()); + __m256d m_01im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(), mat[1].imag(), -mat[1].imag()); + + __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(), mat[2].real()); + __m256d m_10im = _mm256_set_pd(mat[2].imag(), -mat[2].imag(),mat[2].imag(), -mat[2].imag()); + __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(), mat[3].real()); + __m256d m_11im = _mm256_set_pd(mat[3].imag(), -mat[3].imag(), mat[3].imag(), -mat[3].imag()); +#pragma omp parallel for + for(omp_i j = 0;j < rsize; j+= 2){ + size_t i = getind_func(j); + + double* p0 = (double*)(data_.get()+i); + double* p1 = (double*)(data_.get()+i+offset); + //load data + __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0 + __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1 + __m256d data0_p = _mm256_permute_pd(data0, 5); + __m256d data1_p = _mm256_permute_pd(data1, 5); + + //row0 + __m256d temp00re = _mm256_mul_pd(m_00re, data0); + __m256d temp00im = _mm256_mul_pd(m_00im, data0_p); + __m256d temp00 = _mm256_add_pd(temp00re, temp00im); + __m256d temp01re = _mm256_mul_pd(m_01re, data1); + __m256d temp01im = _mm256_mul_pd(m_01im, data1_p); + __m256d temp01 = _mm256_add_pd(temp01re, temp01im); + __m256d temp0 = _mm256_add_pd(temp00, temp01); + + //row1 + __m256d temp10re = _mm256_mul_pd(m_10re, data0); + __m256d temp10im = _mm256_mul_pd(m_10im, data0_p); + __m256d temp10 = _mm256_add_pd(temp10re, temp10im); + __m256d temp11re = _mm256_mul_pd(m_11re, data1); + __m256d temp11im = _mm256_mul_pd(m_11im, data1_p); + __m256d temp11 = _mm256_add_pd(temp11re, temp11im); + __m256d temp1 = _mm256_add_pd(temp10, temp11); + + _mm256_storeu_pd(p0, temp0); + _mm256_storeu_pd(p1, temp1); + } +#else +#pragma omp parallel for + for(omp_i j = 0;j < rsize;j += 2){ + size_t i = getind_func(j); + size_t i1 = i+1; + complex temp = data_[i]; + complex temp1 = data_[i1]; + data_[i] = mat00*data_[i] + mat01*data_[i+offset]; + data_[i+offset] = mat10*temp + mat11*data_[i+offset]; + data_[i1] = mat00*data_[i1] + mat01*data_[i1+offset]; + data_[i1+offset] = mat10*temp1 + mat11*data_[i1+offset]; + } +#endif + } +} + + +template +template +void StateVector::apply_one_targe_gate_x(vector const& posv) +{ + std::function getind_func_near; + std::function getind_func; + size_t rsize; + size_t offset; + size_t targe; + size_t control; + size_t setbit; + size_t poffset; + vector posv_sorted = posv; + bool has_control=false; + if (ctrl_num == 0){ + targe = posv[0]; + offset = 1ll<>1; + getind_func_near = [&](size_t j)-> size_t { + return 2*j; + }; + + getind_func = [&](size_t j)-> size_t { + return (j&(offset-1)) | (j>>targe<targe) { + control--; + } + poffset=1ll<>2; + getind_func = [&](size_t j) -> size_t { + size_t i = (j>>control<<(control+1))|(j&(poffset-1)); + i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit; + return i; + }; + getind_func_near = getind_func; + } + else if(ctrl_num == 2){ + has_control = true; + control = *min_element(posv.begin(), posv.end()-1); + targe = *(posv.end()-1); + offset = 1ll<>posv.size(); + + getind_func = [&](size_t j) -> size_t{ + size_t i = j; + for (size_t k=0;k < posv.size();k++){ + size_t _pos = posv_sorted[k]; + i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1); + } + for (size_t k=0;k < posv.size()-1;k++){ + i |= 1ll< +template +void StateVector::apply_one_targe_gate_real(vector const& posv, complex *mat) +{ + std::function getind_func_near; + std::function getind_func; + size_t rsize; + size_t offset; + size_t targe; + size_t control = 0; + size_t setbit; + size_t poffset; + bool has_control=false; + vector posv_sorted = posv; + if (ctrl_num == 0){ + targe = posv[0]; + offset = 1ll<>1; + getind_func_near = [&](size_t j)-> size_t { + return 2*j; + }; + + getind_func = [&](size_t j)-> size_t { + return (j&(offset-1)) | (j>>targe<targe) { + control--; + } + poffset=1ll<>2; + getind_func = [&](size_t j) -> size_t { + size_t i = (j>>control<<(control+1))|(j&(poffset-1)); + i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit; + return i; + }; + + getind_func_near = getind_func; + } + else if(ctrl_num == 2){ + has_control = true; + control = *min_element(posv.begin(), posv.end()-1); + targe = *(posv.end()-1); + offset = 1ll<>posv.size(); + getind_func = [&](size_t j)-> size_t{ + size_t i = j; + for (size_t k=0;k < posv.size();k++){ + size_t _pos = posv_sorted[k]; + i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1); + } + for (size_t k=0;k < posv.size()-1;k++){ + i |= 1ll< temp = data_[i]; + data_[i] = mat00*data_[i] + mat01*data_[i+1]; + data_[i+1] = mat10*temp + mat11*data_[i+1]; + } + + }else if (has_control && control == 0){ //single step + +#pragma omp parallel for + for(omp_i j = 0;j < rsize;j++){ + size_t i = getind_func(j); + complex temp = data_[i]; + data_[i] = mat00*data_[i] + mat01*data_[i+offset]; + data_[i+offset] = mat10*temp + mat11*data_[i+offset]; + } + }else{//unroll to 2 +#ifdef USE_SIMD + __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(), mat[0].real()); + __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(), mat[1].real(), mat[1].real()); + __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(), mat[2].real()); + __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(), mat[3].real()); +#pragma omp parallel for + for(omp_i j = 0;j < rsize; j+= 2){ + size_t i = getind_func(j); + + double* p0 = (double*)(data_.get()+i); + double* p1 = (double*)(data_.get()+i+offset); + //load data + __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0 + __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1 + __m256d data0_p = _mm256_permute_pd(data0, 5); + __m256d data1_p = _mm256_permute_pd(data1, 5); + + //row0 + __m256d temp00re = _mm256_mul_pd(m_00re, data0); + __m256d temp01re = _mm256_mul_pd(m_01re, data1); + __m256d temp0 = _mm256_add_pd(temp00re, temp01re); + + //row1 + __m256d temp10re = _mm256_mul_pd(m_10re, data0); + __m256d temp11re = _mm256_mul_pd(m_11re, data1); + __m256d temp1 = _mm256_add_pd(temp10re, temp11re); + + _mm256_storeu_pd(p0, temp0); + _mm256_storeu_pd(p1, temp1); + } +#else +#pragma omp parallel for + for(omp_i j = 0;j < rsize;j += 2){ + size_t i = getind_func(j); + size_t i1 = i+1; + complex temp = data_[i]; + complex temp1 = data_[i1]; + data_[i] = mat00*data_[i] + mat01*data_[i+offset]; + data_[i+offset] = mat10*temp + mat11*data_[i+offset]; + data_[i1] = mat00*data_[i1] + mat01*data_[i1+offset]; + data_[i1+offset] = mat10*temp1 + mat11*data_[i1+offset]; + } +#endif + } +} + + +template +template +void StateVector::apply_one_targe_gate_diag(vector const& posv, complex *mat) +{ + std::function getind_func_near; + std::function getind_func; + size_t rsize; + size_t offset; + size_t targe; + size_t control = 0; + size_t setbit; + size_t poffset; + bool has_control=false; + vector posv_sorted = posv; + if (ctrl_num == 0){ + targe = posv[0]; + offset = 1ll<>1; + getind_func_near = [&](size_t j)-> size_t { + return 2*j; + }; + + getind_func = [&](size_t j)-> size_t { + return (j&(offset-1)) | (j>>targe<targe) { + control--; + } + poffset=1ll<>2; + getind_func = [&](size_t j) -> size_t { + size_t i = (j>>control<<(control+1))|(j&(poffset-1)); + i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit; + return i; + }; + + getind_func_near = getind_func; + + } + else if(ctrl_num == 2){ + has_control = true; + control = *min_element(posv.begin(), posv.end()-1); + targe = *(posv.end()-1); + offset = 1ll<>posv.size(); + getind_func = [&](size_t j)-> size_t + { + size_t i = j; + for (size_t k=0;k < posv.size();k++){ + size_t _pos = posv_sorted[k]; + i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1); + } + for (size_t k=0;k < posv.size()-1;k++){ + i |= 1ll< temp = data_[i]; + data_[i] *= mat[0]; + data_[i+offset] *= mat[1]; + } + + }else{//unroll to 2 +#ifdef USE_SIMD + __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(), mat[0].real()); + __m256d m_00im = _mm256_set_pd(mat[0].imag(), -mat[0].imag(), mat[0].imag(), -mat[0].imag()); + __m256d m_11re = _mm256_set_pd(mat[1].real(), mat[1].real(), mat[1].real(), mat[1].real()); + __m256d m_11im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(), mat[1].imag(), -mat[1].imag()); +#pragma omp parallel for + for(omp_i j = 0;j < rsize; j+= 2){ + size_t i = getind_func(j); + + double* p0 = (double*)(data_.get()+i); + double* p1 = (double*)(data_.get()+i+offset); + + //load data + __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0 + __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1 + __m256d data0_p = _mm256_permute_pd(data0, 5); + __m256d data1_p = _mm256_permute_pd(data1, 5); + + //row0 + __m256d temp00re = _mm256_mul_pd(m_00re, data0); + __m256d temp00im = _mm256_mul_pd(m_00im, data0_p); + __m256d temp00 = _mm256_add_pd(temp00re, temp00im); + + //row1 + __m256d temp11re = _mm256_mul_pd(m_11re, data1); + __m256d temp11im = _mm256_mul_pd(m_11im, data1_p); + __m256d temp11 = _mm256_add_pd(temp11re, temp11im); + + _mm256_storeu_pd(p0, temp00); + _mm256_storeu_pd(p1, temp11); + } +#else +#pragma omp parallel for + for(omp_i j = 0;j < rsize;j += 2){ + size_t i = getind_func(j); + size_t i1 = i+1; + data_[i] *= mat[0]; + data_[i+offset] *= mat[1]; + data_[i1] *= mat[0]; + data_[i1+offset] *= mat[1]; + } +#endif + } +} + +template +void StateVector::apply_multi_targe_gate_general(vector const& posv, uint control_num, RowMatrixXcd const& mat) +{ + auto posv_sorted = posv; + auto targs = vector(posv.begin()+control_num, posv.end()); + sort(posv_sorted.begin(), posv_sorted.end()); + size_t rsize = size_ >> posv.size(); + uint targe_num = targs.size(); + size_t matsize= 1<< targe_num; + std::vector targ_mask(matsize); + //create target mask + for (size_t m = 0; m < matsize;m++){ + for (size_t j = 0; j < targe_num; j++){ + if ((m>>j)&1){ + auto mask_pos = targs[j]; + targ_mask[m] |= 1ll<>_pos<<_pos<<1); + } + // Set control + for (size_t k=0; k < control_num;k++){ + i |= 1ll< const& qubits_sorted, const uint k) { + uint lowbits, retval = k; + for (size_t j = 0; j < qubits_sorted.size(); j++) { + lowbits = retval & ((1LL << qubits_sorted[j]) -1); + retval >>= qubits_sorted[j]; + retval <<= qubits_sorted[j] + 1; + retval |= lowbits; + } + return retval; +} + +using indexes_t = vector; +inline indexes_t indexes(vector const& qbits, vector const& qubits_sorted, const uint k) { + const auto N = qubits_sorted.size(); + indexes_t ret(1LL << N, 0); + // Get index0 + ret[0] = index0(qubits_sorted, k); + for (size_t i = 0; i < N; i++) { + const auto n = 1LL << i; + const auto bit = 1ll << qbits[i]; + for (size_t j = 0; j < n; j++) + ret[n + j] = ret[j] | bit; + } + return ret; +} + +template +vector StateVector::probabilities() const { + const int len = 1LL << num_; + vector probs(len, 0.); +#pragma omp parallel for + for (int j = 0; j < len; j++) { + probs[j] = std::real(data_[j] * std::conj(data_[j])); + } + return probs; +} + +vector> convert(const vector> &v){ + vector> ret(v.size(), 0.); + for (size_t i = 0; i< v.size(); ++i) + ret[i] = v[i]; + return ret; +} + +template +void StateVector::apply_diagonal_matrix(vector const&qbits, vector > const&diag){ + // just one qubit + if(qbits.size() == 1){ + const uint qubit = qbits[0]; + vector qbit0{qubit}; + if (diag[0] == 1.0) { // [[1, 0], [0, z]] matrix + if (diag[1] == 1.0) + return; // Identity + if (diag[1] == std::complex(0., -1.)) { // [[1, 0], [0, -i]] + auto func = [&](const indexes_t &inds) -> void { + const auto k = inds[1]; + double cache = data_[k].imag(); + data_[k].imag(data_[k].real() * -1.); + data_[k].real(cache); + }; +#pragma omp parallel for + for (int k = 0; k < (size_ >> 1); k+=1){ + const auto inds = indexes(qbit0, qbit0, k); + func(inds); + } + return; + } + if (diag[1] == std::complex(0., 1.)) { + // [[1, 0], [0, i]] + auto func = [&](const indexes_t &inds) -> void { + const auto k = inds[1]; + double cache = data_[k].imag(); + data_[k].imag(data_[k].real()); + data_[k].real(cache * -1.); + }; +#pragma omp parallel for + for (int k = 0; k < (size_ >> 1); k+=1){ + const auto inds = indexes(qbit0, qbit0, k); + func(inds); + } + return; + } + if (diag[0] == 0.0) { + // [[1, 0], [0, 0]] + auto func = [&](const indexes_t &inds) -> void { + data_[inds[1]] = 0.0; + }; +#pragma omp parallel for + for (int k = 0; k < (size_ >> 1); k+=1){ + const auto inds = indexes(qbit0, qbit0, k); + func(inds); + } + return; + } + // general [[1, 0], [0, z]] + auto func = [&](const indexes_t &inds, const vector > &_mat) -> void { + const auto k = inds[1]; + data_[k] *= _mat[1]; + }; +#pragma omp parallel for + for (int k = 0; k < (size_ >> 1); k+=1){ + const auto inds = indexes(qbit0, qbit0, k); + func(inds, convert(diag)); + } + return; + } else if (diag[1] == 1.0) { + // [[z, 0], [0, 1]] matrix + if (diag[0] == std::complex(0., -1.)) { + // [[-i, 0], [0, 1]] + auto func = [&](const indexes_t &inds) -> void { + const auto k = inds[1]; + double cache = data_[k].imag(); + data_[k].imag(data_[k].real() * -1.); + data_[k].real(cache); + }; +#pragma omp parallel for + for (int k = 0; k < (size_ >> 1); k+=1){ + const auto inds = indexes(qbit0, qbit0, k); + func(inds); + } + return; + } + if (diag[0] == std::complex(0., 1.)) { + // [[i, 0], [0, 1]] + auto func = [&](const indexes_t &inds) -> void { + const auto k = inds[1]; + double cache = data_[k].imag(); + data_[k].imag(data_[k].real()); + data_[k].real(cache * -1.); + }; +#pragma omp parallel for + for (int k = 0; k < (size_ >> 1); k+=1){ + const auto inds = indexes(qbit0, qbit0, k); + func(inds); + } + return; + } + if (diag[0] == 0.0) { + // [[0, 0], [0, 1]] + auto func = [&](const indexes_t &inds) -> void { + data_[inds[0]] = 0.0; + }; +#pragma omp parallel for + for (int k = 0; k < (size_ >> 1); k+=1){ + const auto inds = indexes(qbit0, qbit0, k); + func(inds); + } + return; + } + // general [[z, 0], [0, 1]] + auto func = [&](const indexes_t &inds, const vector > &_mat) -> void { + const auto k = inds[0]; + data_[k] *= _mat[0]; + }; +#pragma omp parallel for + for (int k = 0; k < (size_ >> 1); k+=1){ + const auto inds = indexes(qbit0, qbit0, k); + func(inds, convert(diag)); + } + return; + } else { + // Lambda function for diagonal matrix multiplication + auto func = [&](const indexes_t &inds, const vector > &_mat) -> void { + const auto k0 = inds[0]; + const auto k1 = inds[1]; + data_[k0] *= _mat[0]; + data_[k1] *= _mat[1]; + }; +#pragma omp parallel for + for (int k = 0; k < (size_ >> 1); k+=1){ + const auto inds = indexes(qbit0, qbit0, k); + func(inds, convert(diag)); + } + } + return; + } + const uint N = qbits.size(); + auto func = [&](const indexes_t &inds, const vector> &_diag) -> void { + for(int i=0; i<2; ++i){ + const int k = inds[i]; + int iv = 0; + for(int j=0;j qbit0{qbits[0]}; +#pragma omp parallel for + for (int k = 0; k < (size_ >> 1); k+=1){ + const auto inds = indexes(qbit0, qbit0, k); + func(inds, convert(diag)); + } +} + +template +void StateVector::update(vector const& qbits, const uint final_state, const uint meas_state, const double meas_prob){ + const uint dim = 1ULL << qbits.size(); + vector > matdiag(dim, 0.); + matdiag[meas_state] = 1./ std::sqrt(meas_prob); + apply_diagonal_matrix(qbits, matdiag); + + //TODO: Add reset + // for reset + if(final_state != meas_state){ + if(qbits.size() == 1){ + // apply a x gate + apply_x(qbits[0]); + }else{ + // Diagonal matrix for projecting and renormalizing to measurement outcome + vector > perm(dim*dim, 0.); + perm[final_state * dim + meas_state] = 1.; + perm[meas_state * dim + final_state] = 1.; + for (uint j = 0; j < dim; j++){ + if(j != final_state && j != meas_state) + perm[j * dim + j] = 1.; + } + // apply permutation to swap state + const uint N = qbits.size(); + const uint DIM = 1ULL << N; + auto func = [&](const indexes_t &inds, const vector > &_mat) -> void { + // std::array, 1ULL << N > cache; + vector > cache(1ULL << N, 0.); + for(uint i = 0; i< DIM; i++){ + const auto ii = inds[i]; + cache[i] = data_[ii]; + data_[ii] = 0.; + } + for(uint i=0; i< DIM; i++) + for(uint j=0; j qs(qbits.begin(), qbits.end()); + vector qs_sorted(qs.begin(), qs.end()); + std::sort(qs_sorted.begin(), qs_sorted.end()); + uint END = size_ >> qs.size(); +#pragma omp parallel for + for (int k = 0; k < END; k+=1){ + const auto inds = indexes(qs, qs_sorted, k); + func(inds, convert(perm)); + } + } + } +} + +template +void printVector(const std::vector& vec) { + for (const T& element : vec) { + std::cout << element << " "; + } + std::cout << std::endl; +} + +template +std::pair StateVector::sample_measure_probs(vector const& qbits){ + // 1. caculate actual measurement outcome + const int64_t N = qbits.size(); + const int64_t DIM = 1LL << N; + const int64_t END = 1LL << (num_ - N); + vector probs(DIM, 0.); + vector qubits_sorted(qbits.begin(), qbits.end()); + + std::sort(qubits_sorted.begin(), qubits_sorted.end()); + if ((num_ == N) && ( qubits_sorted == qbits )){ + probs = probabilities(); + }else{ + vector probs_private(DIM, 0.); +#pragma omp parallel for + for (int64_t k = 0; k < END; k++){ + auto idx = indexes(qbits, qubits_sorted, k); + // std::cout<<"indexes"<(probs.begin(), probs.end())(rng_); + return std::make_pair(outcome, probs[outcome]); +} + +// change to bit endian +vector int2vec(uint n, uint base){ + vector ret; + while(n >= base){ + ret.push_back(n%base); + n /= base; + } + ret.push_back(n); + return ret; +} + +template +void StateVector::apply_measure(vector const& qbits, vector const& cbits){ + // 1. caculate actual measurement outcome + const auto meas = sample_measure_probs(qbits); + //2. update statevector + update(qbits, meas.first, meas.first, meas.second); + //3. store measure + vector outcome = int2vec(meas.first, 2); + if(outcome.size() < qbits.size()){ + outcome.resize(qbits.size()); + } + for(uint j=0; j < outcome.size(); j++){ + creg_[cbits[j]] = outcome[j]; + } +} + +template +void StateVector::apply_reset(vector const& qbits){ + const auto meas = sample_measure_probs(qbits); + update(qbits, 0, meas.first, meas.second); +} \ No newline at end of file diff --git a/src/qfvm/types.hpp b/src/qfvm/types.hpp new file mode 100644 index 0000000..5ece0f7 --- /dev/null +++ b/src/qfvm/types.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +using omp_i = signed long long; +#else +using omp_i = size_t; +#endif + +#ifdef _MSC_VER +#else +#ifndef __AVX2__ +#undef USE_SIMD +#endif +#endif + + +typedef unsigned int uint; +using pos_t = uint; +using data_t = double; +using std::complex; +using std::vector; +using std::string; +using RowMatrixXcd = Eigen::Matrix, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>; + +const complex imag_I = complex(0, 1.); +const double PI = 3.14159265358979323846; diff --git a/src/qfvm/util.h b/src/qfvm/util.h new file mode 100644 index 0000000..950b48e --- /dev/null +++ b/src/qfvm/util.h @@ -0,0 +1,99 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace Qfutil{ + +std::vector randomArr(size_t length, size_t max){ + srand((unsigned)time(NULL)); + std::vector arr(length); + for (size_t i=0;i < arr.size();i++){ + arr[i] = rand()%max; + } + return arr; +} + +int randomint(int min, int max){ + srand((unsigned)time(NULL)); + return (rand()%(max - min + 1)) + min; +} + +static uint32_t randomize(uint32_t i){ + i = (i^61)^(i>>16); + i *=9; + i ^= i<<4; + i *= 0x27d4eb2d; + i ^= i >> 15; + return i; +} + +template +void printVector(std::vector const &arr){ + for (auto i : arr){ + std::cout << i << " "; + } + std::cout << std::endl; +} + + +std::vector split_string(const std::string& str, char delim) { + std::vector elems; + auto lastPos = str.find_first_not_of(delim, 0); + auto pos = str.find_first_of(delim, lastPos); + while (pos != std::string::npos || lastPos != std::string::npos) { + elems.push_back(str.substr(lastPos, pos - lastPos)); + lastPos = str.find_first_not_of(delim, pos); + pos = str.find_first_of(delim, lastPos); + } + return elems; +} + +std::vector split_string(const std::string& str, char delim, uint num){ + auto end = str.length(); + std::vector elems; + auto lastPos = str.find_first_not_of(delim, 0); + auto pos = str.find_first_of(delim, lastPos); + while ((pos != std::string::npos || lastPos != std::string::npos) && elems.size() < num) { + elems.push_back(str.substr(lastPos, pos - lastPos)); + lastPos = str.find_first_not_of(delim, pos); + pos = str.find_first_of(delim, lastPos); + } + + if ((pos != std::string::npos || lastPos != std::string::npos)){ + elems.push_back(str.substr(lastPos, end)); + } + return elems; +} + +template +std::vector find_numbers(const std::string &str){ + std::smatch matchs; + std::vector res; + std::regex pattern; + if (std::is_unsigned::value){ + pattern = std::regex("\\d+"); + }else if(std::is_floating_point::value){ + pattern = std::regex("-?(([1-9]\\d*\\.\\d*)|(0\\.\\d*[1-9]\\d*))|\\d+"); + } + + auto begin = std::sregex_iterator(str.begin(), str.end(), pattern); + const std::sregex_iterator end; + + for (std::sregex_iterator i = begin; i != end; ++i) { + std::string match_str = i->str(); + if (std::is_unsigned::value){ + res.push_back(std::stoi(match_str)); + }else if (std::is_floating_point::value){ + res.push_back(std::stod(match_str)); + } + } + + return res; +} + +}//namespace Qfutil \ No newline at end of file diff --git a/src/qfvm_gpu/apply_gate_custate.cuh b/src/qfvm_gpu/apply_gate_custate.cuh new file mode 100644 index 0000000..2d9c2e7 --- /dev/null +++ b/src/qfvm_gpu/apply_gate_custate.cuh @@ -0,0 +1,46 @@ + +#pragma once +#include +#include + + +void apply_gate_custate(cuDoubleComplex *psi_d, QuantumOperator &op, int n) +{ + + //get information form op + auto pos = op.positions(); + const int nTargets = op.targe_num(); + const int nControls = op.control_num(); + const int adjoint = 0; + + vector targets{pos.begin()+nControls, pos.end()}; + vector controls{pos.begin(), pos.begin()+nControls}; + + auto mat_temp = op.mat(); + cuDoubleComplex *mat = + reinterpret_cast(mat_temp.data()); + + // custatevec handle initialization + custatevecHandle_t handle; + custatevecCreate(&handle) ; + void* extraWorkspace = nullptr; + size_t extraWorkspaceSizeInBytes = 0; + + // check the size of external workspace + custatevecApplyMatrixGetWorkspaceSize( + handle, CUDA_C_64F, n, mat, CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW, + adjoint, nTargets, nControls, CUSTATEVEC_COMPUTE_64F, &extraWorkspaceSizeInBytes) ; + + // allocate external workspace if necessary + if (extraWorkspaceSizeInBytes > 0) + cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes); + + custatevecApplyMatrix( + handle, psi_d, CUDA_C_64F, n, mat, CUDA_C_64F, + CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, targets.data(), nTargets, controls.data(), nullptr, + nControls, CUSTATEVEC_COMPUTE_64F, extraWorkspace, extraWorkspaceSizeInBytes); + + // destroy handle + custatevecDestroy(handle); +} + \ No newline at end of file diff --git a/src/qfvm_gpu/apply_gate_gpu.cuh b/src/qfvm_gpu/apply_gate_gpu.cuh new file mode 100644 index 0000000..e2d7be0 --- /dev/null +++ b/src/qfvm_gpu/apply_gate_gpu.cuh @@ -0,0 +1,411 @@ +#pragma once +#include +#include +#include +#include +#include + +struct targeIndex +{ + size_t ind0; + size_t ind1; +}; + + +__constant__ uint posv_d[50]; +__constant__ uint posv_sorted_d[50]; +__constant__ cuDoubleComplex mat_d_const[32*32]; //If target qubit < 5, use const memory; +__constant__ uint mat_mask_d_const[32]; + + +//-------------Single-target gate------------------------------- +template +__global__ void apply_one_targe_gate_kernel(cuDoubleComplex *psi_d, Func get_index, int rsize){ + cuDoubleComplex mat00 = mat_d_const[0]; + cuDoubleComplex mat01 = mat_d_const[1]; + cuDoubleComplex mat10 = mat_d_const[2]; + cuDoubleComplex mat11 = mat_d_const[3]; + + unsigned int gridSize = blockDim.x*gridDim.x; + for (int j = blockDim.x * blockIdx.x + threadIdx.x; j < rsize;j+= gridSize){ + targeIndex ind = get_index(j); + cuDoubleComplex temp = psi_d[ind.ind0]; + psi_d[ind.ind0] = cuCadd(cuCmul(mat00, psi_d[ind.ind0]), cuCmul(mat01, psi_d[ind.ind1])); + psi_d[ind.ind1] = cuCadd(cuCmul(mat10, temp), cuCmul(mat11, psi_d[ind.ind1])); + } +} + +template +void apply_one_targe_gate_gpu(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){ + //copy mat to device + auto mat_temp = op.mat(); + cuDoubleComplex *mat = + reinterpret_cast(mat_temp.data()); + checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, 4*sizeof(cuDoubleComplex))); + size_t rsize; + size_t offset; + size_t targe; + size_t control; + size_t setbit; + size_t poffset; + if (ctrl_num == 0){ + targe = op.positions()[0]; + offset = 1ll<>1; + auto getind_func = [offset, targe] __device__ (size_t j)-> targeIndex { + size_t ind0 = (j&(offset-1)) | (j>>targe<>>(psi_d, getind_func, rsize); + } + else if(ctrl_num == 1){ + control = op.positions()[0]; + targe = op.positions()[1]; + offset = 1ll<targe) { + control--; + } + poffset=1ll<>2; + auto getind_func = [control, targe, poffset, offset, setbit] __device__ (size_t j) -> targeIndex { + size_t ind0 = (j>>control<<(control+1))|(j&(poffset-1)); + ind0 = (ind0 >> targe << (targe+1))|(ind0 &(offset-1))|setbit; + size_t ind1 = ind0 + offset; + return {ind0, ind1}; + }; + + + size_t blockdim = rsize <= 1024 ? rsize : 1024; + size_t griddim = rsize / blockdim; + + apply_one_targe_gate_kernel<<>>(psi_d, getind_func, rsize); + } + else if(ctrl_num == 2){ + targe = op.positions().back(); + offset = 1ll<>psize; + + vector posv_sorted = op.positions(); + std::sort(posv_sorted.begin(), posv_sorted.end()); + //Copy pos to device + checkCudaErrors(cudaMemcpyToSymbol(posv_d, op.positions().data(), psize*sizeof(uint))); + checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint))); + + auto getind_func = [offset, psize] __device__ (size_t j)-> targeIndex{ + size_t ind0 = j; + for (pos_t k=0;k < psize;k++) + { + pos_t _pos = posv_sorted_d[k]; + ind0 = (ind0&((1ll<<_pos)-1)) | (ind0>>_pos<<_pos<<1); + } + for (pos_t k=0;k < psize-1;k++){ + ind0 |= 1ll<>>(psi_d, getind_func, rsize); + } +} + +template +__global__ void apply_2to4_targe_gate_kernel(cuDoubleComplex *psi_d, uint ctrlnum, int psize){ + constexpr uint matlen = 1<>_pos<<_pos<<1); + } + // Set control + for (size_t k=0; k < ctrlnum;k++){ + i |= 1ll< +void apply_2to4_targe_gate_gpu_const(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){ + // uint targe_num = op.targe_num(); + uint matlen = 1<(pos.begin()+op.control_num(), pos.end()); + vector targ_mask(matlen); + //create target mask + for (size_t m = 0; m < matlen;m++){ + for (size_t j = 0; j < targe_num; j++){ + if ((m>>j)&1){ + auto mask_pos = targs[j]; + targ_mask[m] |= 1ll< posv_sorted = op.positions(); + uint psize = pos.size(); + std::sort(posv_sorted.begin(),posv_sorted.end()); + //Copy pos to device + checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint))); + checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint))); + + //copy mat to const memory + auto mat_temp = op.mat(); + cuDoubleComplex *mat = reinterpret_cast(mat_temp.data()); + + checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, matlen*matlen*sizeof(cuDoubleComplex))); + checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(), matlen*sizeof(uint))); + size_t rsize = size>>psize; + + uint max_thread_num = targe_num < 4 ? 1024 : 512; + size_t blockdim = rsize <= max_thread_num ? rsize : max_thread_num; + size_t griddim = rsize / blockdim; + apply_2to4_targe_gate_kernel<<>>(psi_d, op.control_num(), psize); +} + + + +// ------------Large target number gate--------------- + +__global__ void apply_5_targe_gate_kernel_const(cuDoubleComplex *psi_d, uint ctrlnum, int psize, size_t size){ + uint rsize = size>>psize; + uint targnum = psize-ctrlnum; + uint matlen = (1<>_pos<<_pos<<1); + } + // Set control + for (size_t k=0; k < ctrlnum;k++){ + i |= 1ll<>1;offset > 0;offset >>=1){ + v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset); + v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset); + } + __syncthreads(); + if (!idx) psi_d[ i | mat_mask_d_const[idy]] = v; +} + + +void apply_5_targe_gate_gpu_const(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){ + uint targe_num = op.targe_num(); + uint matlen = 1<(pos.begin()+op.control_num(), pos.end()); + vector targ_mask(matlen); + //create target mask + for (size_t m = 0; m < matlen;m++){ + for (size_t j = 0; j < targe_num; j++){ + if ((m>>j)&1){ + auto mask_pos = targs[j]; + targ_mask[m] |= 1ll< posv_sorted = op.positions(); + uint psize = pos.size(); + std::sort(posv_sorted.begin(),posv_sorted.end()); + //Copy pos to device + checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint))); + checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint))); + + //copy mat to const memory + auto mat_temp = op.mat(); + cuDoubleComplex *mat = reinterpret_cast(mat_temp.data()); + + checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, matlen*matlen*sizeof(cuDoubleComplex))); + checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(), matlen*sizeof(uint))); + size_t rsize = size>>psize; + uint thread_num = matlen > 32 ? 32 : matlen; + dim3 blockdim = dim3(thread_num, thread_num); + apply_5_targe_gate_kernel_const<<>>(psi_d, op.control_num(), psize, size); +} + + +//For target number 6-10 +__global__ void apply_multi_targe_gate_kernel_shared(cuDoubleComplex *psi_d, uint ctrlnum, cuDoubleComplex *mat_d, uint *mat_mask_d, int psize, size_t size){ + + uint rsize = size>>psize; + uint targnum = psize-ctrlnum; + uint matlen = (1<>_pos<<_pos<<1); + } + // Set control + for (size_t k=0; k < ctrlnum;k++){ + i |= 1ll<>1;offset > 0;offset >>=1){ + v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset); + v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset); + } + __syncthreads(); + if (!idx) local_sum[y] = cuCadd(local_sum[y], v); + } + } + + for (int y = idy; y < matlen;y+=blockDim.y){ + if (!idx) psi_d[ i | mat_mask_d[y]] = local_sum[y]; + } +} + + +void apply_multi_targe_gate_gpu_shared(cuDoubleComplex *psi_d, QuantumOperator &op, cuDoubleComplex *mat_d, uint *mat_mask_d, size_t size){ + uint targe_num = op.targe_num(); + uint matlen = 1<(pos.begin()+op.control_num(), pos.end()); + vector targ_mask(matlen); + //create target mask + for (size_t m = 0; m < matlen;m++){ + for (size_t j = 0; j < targe_num; j++){ + if ((m>>j)&1){ + auto mask_pos = targs[j]; + targ_mask[m] |= 1ll< posv_sorted = pos; + std::sort(posv_sorted.begin(),posv_sorted.end()); + //Copy pos to device + checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint))); + checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint))); + + + //copy mat to global memory + auto mat_temp = op.mat(); + cuDoubleComplex *mat = reinterpret_cast(mat_temp.data()); + checkCudaErrors(cudaMemcpy(mat_d, mat, matlen*matlen*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(), matlen*sizeof(uint), cudaMemcpyHostToDevice)); + + size_t rsize = size>>psize; + uint thread_num = matlen > 32 ? 32 : matlen; + dim3 blockdim = dim3(thread_num, thread_num); + + apply_multi_targe_gate_kernel_shared<<>>(psi_d, op.control_num(), mat_d, mat_mask_d, psize, size); +} + +//For target number > 10 +__global__ void apply_multi_targe_gate_kernel_global(cuDoubleComplex *psi_d, cuDoubleComplex *psi_d_copy, uint ctrlnum, cuDoubleComplex *mat_d, uint *mat_mask_d, int psize, size_t size){ + uint rsize = size>>psize; + uint targnum = psize-ctrlnum; + uint matlen = (1<>_pos<<_pos<<1); + } + // Set control + for (size_t k=0; k < ctrlnum;k++){ + i |= 1ll<>1;offset > 0;offset >>=1){ + v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset); + v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset); + } + __syncthreads(); + if (!idx) v_sum = cuCadd(v_sum, v); + } + if (!idx) psi_d[ i | mat_mask_d[y]] = v_sum; + } +} + +void apply_multi_targe_gate_gpu_global(cuDoubleComplex *psi_d, cuDoubleComplex *psi_d_copy, QuantumOperator &op, cuDoubleComplex *mat_d, uint *mat_mask_d, size_t size){ + uint targe_num = op.targe_num(); + uint matlen = 1<(pos.begin()+op.control_num(), pos.end()); + vector targ_mask(matlen); + //create target mask + for (size_t m = 0; m < matlen;m++){ + for (size_t j = 0; j < targe_num; j++){ + if ((m>>j)&1){ + auto mask_pos = targs[j]; + targ_mask[m] |= 1ll< posv_sorted = pos; + std::sort(posv_sorted.begin(),posv_sorted.end()); + //Copy pos to device + checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint))); + checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint))); + + + //copy mat to global memory + auto mat_temp = op.mat(); + cuDoubleComplex *mat = reinterpret_cast(mat_temp.data()); + checkCudaErrors(cudaMemcpy(mat_d, mat, matlen*matlen*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(), matlen*sizeof(uint), cudaMemcpyHostToDevice)); + + size_t rsize = size>>psize; + uint thread_num = matlen > 32 ? 32 : matlen; + dim3 blockdim = dim3(thread_num, thread_num); + + apply_multi_targe_gate_kernel_global<<>>(psi_d, psi_d_copy, op.control_num(), mat_d, mat_mask_d, psize, size); + checkCudaErrors(cudaMemcpy(psi_d_copy, psi_d, size*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice)); +} \ No newline at end of file diff --git a/src/qfvm_gpu/cuda_simulator.cuh b/src/qfvm_gpu/cuda_simulator.cuh new file mode 100644 index 0000000..99ee5e4 --- /dev/null +++ b/src/qfvm_gpu/cuda_simulator.cuh @@ -0,0 +1,91 @@ +#pragma once +#include "cuda_statevector.cuh" +#include +#include +#include +#include +#include "apply_gate_gpu.cuh" + +void simulate_gpu(Circuit & circuit, CudaStateVector & psi_d){ + size_t size = psi_d.size(); + //initialize mat + cuDoubleComplex *mat_d; + uint *mat_mask_d; + CudaStateVector psi_d_copy{}; + if (circuit.max_targe_num() > 5) + { + uint max_matlen = 1< 10){ + psi_d_copy = psi_d; + } + } + + //apply_gate + for (auto gate : circuit.gates()){ + uint targnum = gate.targe_num(); + uint ctrlnum = gate.control_num(); + + if (targnum == 1){ + if (ctrlnum == 0){ + apply_one_targe_gate_gpu<0>(psi_d.data(), gate, size); + } + else if (ctrlnum == 1){ + apply_one_targe_gate_gpu<1>(psi_d.data(), gate, size); + } + else{ + apply_one_targe_gate_gpu<2>(psi_d.data(), gate, size); + } + } + else if(targnum > 1){ + if (targnum == 2){ + apply_2to4_targe_gate_gpu_const<2>(psi_d.data(), gate, size); + } + else if(targnum == 3){ + apply_2to4_targe_gate_gpu_const<3>(psi_d.data(), gate, size); + } + else if(targnum == 4){ + apply_2to4_targe_gate_gpu_const<4>(psi_d.data(), gate, size); + } + else if (targnum == 5){ + apply_5_targe_gate_gpu_const(psi_d.data(), gate, size); + }else if (targnum > 5 && targnum <= 10){ + apply_multi_targe_gate_gpu_shared(psi_d.data(), gate, mat_d, mat_mask_d, size); + } + else{ + apply_multi_targe_gate_gpu_global(psi_d.data(), psi_d_copy.data(), gate, mat_d, mat_mask_d, size); + } + } + else{ + throw "Invalid target number"; + } + } + + //free source + if (circuit.max_targe_num() > 5){ + checkCudaErrors(cudaFree(mat_d)); + checkCudaErrors(cudaFree(mat_mask_d)); + } +} + +void simulate_gpu(Circuit & circuit, StateVector & state){ + //initialize psi + state.set_num(circuit.qubit_num()); + size_t size = state.size(); + CudaStateVector psi_d(state); + + simulate_gpu(circuit, psi_d); + cudaDeviceSynchronize(); + + //copy back + complex* psi = reinterpret_cast*>(psi_d.data()); + checkCudaErrors(cudaMemcpy(state.data(), psi, size*sizeof(complex), cudaMemcpyDeviceToHost)); + psi=nullptr; +} + +StateVector simulate_gpu(Circuit & circuit){ + StateVector state(circuit.qubit_num()); + simulate_gpu(circuit, state); + return std::move(state); +} \ No newline at end of file diff --git a/src/qfvm_gpu/cuda_statevector.cuh b/src/qfvm_gpu/cuda_statevector.cuh new file mode 100644 index 0000000..19e8cb4 --- /dev/null +++ b/src/qfvm_gpu/cuda_statevector.cuh @@ -0,0 +1,58 @@ + +#pragma once +#include +#include +#include +#include + +class CudaStateVector +{ + protected: + uint num_; + size_t size_; + cuDoubleComplex *data_; + + public: + //construct function + CudaStateVector(){checkCudaErrors(cudaMalloc(&data_, 0));} + CudaStateVector(CudaStateVector const &other); + + explicit CudaStateVector(StateVector &sv); + ~CudaStateVector() { + checkCudaErrors(cudaFree(data_)); + } + + CudaStateVector &operator=(CudaStateVector const &other) + { + num_ = other.num(); + size_ = other.size(); + checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex))); + checkCudaErrors(cudaMemcpy(data_, other.data(), size_*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice)); + return *this; + } + + cuDoubleComplex* data() const { return data_;} + size_t size() const { return size_; } + uint num() const { return num_; } +}; + + +CudaStateVector::CudaStateVector(StateVector &sv) +: +num_(sv.num()), +size_(sv.size()) +{ + cuDoubleComplex *psi_h = reinterpret_cast(sv.data()); + checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex))); + checkCudaErrors(cudaMemcpy(data_, psi_h, size_*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); +} + +CudaStateVector::CudaStateVector(CudaStateVector const &other) +: +num_(other.num()), +size_(other.size()) +{ + checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex))); + checkCudaErrors(cudaMemcpy(data_, other.data(), size_*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice)); +} + diff --git a/src/qfvm_gpu/cuda_utils/CudaTexture.h b/src/qfvm_gpu/cuda_utils/CudaTexture.h new file mode 100644 index 0000000..0661d0d --- /dev/null +++ b/src/qfvm_gpu/cuda_utils/CudaTexture.h @@ -0,0 +1,39 @@ +#pragma once + + +#include "helper_cuda.h" +#include + + +struct CudaTexture { + cudaTextureObject_t tex; + + CudaTexture(CudaTexture const &) = delete; + CudaTexture(CudaTexture &&) = default; + CudaTexture &operator=(CudaTexture const &) = delete; + CudaTexture &operator=(CudaTexture &&) = default; + + template + CudaTexture(T *dataDev, int width, int height) { + cudaTextureObject_t tex; + cudaResourceDesc resDesc; + memset(&resDesc, 0, sizeof(resDesc)); + resDesc.resType = cudaResourceTypePitch2D; + resDesc.res.pitch2D.devPtr = dataDev; + resDesc.res.pitch2D.width = width; + resDesc.res.pitch2D.height = height; + resDesc.res.pitch2D.desc = cudaCreateChannelDesc(); + resDesc.res.pitch2D.pitchInBytes = width * sizeof(T); + cudaTextureDesc texDesc; + memset(&texDesc, 0, sizeof(texDesc)); + checkCudaErrors(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL)); + } + + ~CudaTexture() { + checkCudaErrors(cudaDestroyTextureObject(tex)); + } + + constexpr operator cudaTextureObject_t() const { + return tex; + } +}; diff --git a/src/qfvm_gpu/cuda_utils/helper_cuda.h b/src/qfvm_gpu/cuda_utils/helper_cuda.h new file mode 100644 index 0000000..516dd57 --- /dev/null +++ b/src/qfvm_gpu/cuda_utils/helper_cuda.h @@ -0,0 +1,953 @@ +/** + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include +#include +#include +#include + +#include "helper_string.h" + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header +// files, please refer the CUDA examples for examples of the needed CUDA +// headers, which may change depending on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DRIVER_TYPES_H__ +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NVJPEGAPI +// nvJPEG API errors +static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { + switch (error) { + case NVJPEG_STATUS_SUCCESS: + return "NVJPEG_STATUS_SUCCESS"; + + case NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; + + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; + + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; + + case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; + + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; + + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char *_cudaGetErrorEnum(NppStatus error) { + switch (error) { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; + + case NPP_ANCHOR_ERROR: + return "NPP_ANCHOR_ERROR"; + + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; + + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; + + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; + + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; + + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; + + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; + + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; + + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; + + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; + + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; + + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; + + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; + + case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; + + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +#endif + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 + /* These are 7.0 or higher */ + case NPP_OVERFLOW_ERROR: + return "NPP_OVERFLOW_ERROR"; + + case NPP_CORRUPTED_DATA_ERROR: + return "NPP_CORRUPTED_DATA_ERROR"; +#endif + } + + return ""; +} +#endif + +template +void check(T result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + exit(EXIT_FAILURE); + } +} + +#ifdef __DRIVER_TYPES_H__ +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. +#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char *errorMessage, const char *file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + } +} +#endif + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +// Float To Int conversion +inline int ftoi(float value) { + return (value >= 0 ? static_cast(value + 0.5) + : static_cast(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." + " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char* _ConvertSMVer2ArchName(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char* name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, + {0x32, "Kepler"}, + {0x35, "Kepler"}, + {0x37, "Kepler"}, + {0x50, "Maxwell"}, + {0x52, "Maxwell"}, + {0x53, "Maxwell"}, + {0x60, "Pascal"}, + {0x61, "Pascal"}, + {0x62, "Pascal"}, + {0x70, "Volta"}, + {0x72, "Xavier"}, + {0x75, "Turing"}, + {0x80, "Ampere"}, + {0x86, "Ampere"}, + {0x87, "Ampere"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { + return nGpuArchNameSM[index].name; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoArchName for SM %d.%d is undefined." + " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} + // end of GPU Architecture definitions + +#ifdef __CUDA_RUNTIME_H__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) { + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, + "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) { + devID = 0; + } + + if (devID > device_count - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", + device_count); + fprintf(stderr, + ">> gpuDeviceInit (-device=%d) is not a valid" + " GPU device. <<\n", + devID); + fprintf(stderr, "\n"); + return -devID; + } + + int computeMode = -1, major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); + if (computeMode == cudaComputeModeProhibited) { + fprintf(stderr, + "Error: device is running in , no threads can use cudaSetDevice().\n"); + return -1; + } + + if (major < 1) { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaSetDevice(devID)); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() { + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + uint64_t max_compute_perf = 0; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + int computeMode = -1, major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + if (computeMode != cudaComputeModeProhibited) { + if (major == 9999 && minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = + _ConvertSMVer2Cores(major, minor); + } + int multiProcessorCount = 0, clockRate = 0; + checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device)); + cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device); + if (result != cudaSuccess) { + // If cudaDevAttrClockRate attribute is not supported we + // set clockRate as 1, to consider GPU with most SMs and CUDA Cores. + if(result == cudaErrorInvalidValue) { + clockRate = 1; + } + else { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__, + static_cast(result), _cudaGetErrorEnum(result)); + exit(EXIT_FAILURE); + } + } + uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; + + if (compute_perf > max_compute_perf) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) { + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } else { + devID = gpuDeviceInit(devID); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } else { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + checkCudaErrors(cudaSetDevice(devID)); + int major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + devID, _ConvertSMVer2ArchName(major, minor), major, minor); + + } + + return devID; +} + +inline int findIntegratedGPU() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1, integrated = -1; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device)); + // If GPU is integrated and is not running on Compute Mode prohibited, + // then cuda can map to GLES resource + if (integrated && (computeMode != cudaComputeModeProhibited)) { + checkCudaErrors(cudaSetDevice(current_device)); + + int major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, _ConvertSMVer2ArchName(major, minor), major, minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) { + int dev; + int major = 0, minor = 0; + + checkCudaErrors(cudaGetDevice(&dev)); + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev)); + + if ((major > major_version) || + (major == major_version && + minor >= minor_version)) { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, + _ConvertSMVer2ArchName(major, minor), major, minor); + return true; + } else { + printf( + " No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + + // end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp b/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp new file mode 100644 index 0000000..59fdbc3 --- /dev/null +++ b/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. + * + * SPDX-License-Identifier: BSD-3-Clause + */ + +#define HANDLE_ERROR(x) \ +{ const auto err = x; \ + if (err != CUSTATEVEC_STATUS_SUCCESS ) { \ + printf("Error: %s in line %d\n", \ + custatevecGetErrorString(err), __LINE__); return err; } \ +}; + +#define HANDLE_CUDA_ERROR(x) \ +{ const auto err = x; \ + if (err != cudaSuccess ) { \ + printf("Error: %s in line %d\n", \ + cudaGetErrorString(err), __LINE__); return err; } \ +}; + +bool almost_equal(cuDoubleComplex x, cuDoubleComplex y) { + const double eps = 1.0e-5; + const cuDoubleComplex diff = cuCsub(x, y); + return (cuCabs(diff) < eps); +} + +bool almost_equal(double x, double y) { + const double eps = 1.0e-5; + const double diff = x - y; + return (abs(diff) < eps); +} diff --git a/src/qfvm_gpu/cuda_utils/helper_string.h b/src/qfvm_gpu/cuda_utils/helper_string.h new file mode 100644 index 0000000..77864b8 --- /dev/null +++ b/src/qfvm_gpu/cuda_utils/helper_string.h @@ -0,0 +1,683 @@ +/** + * Copyright 1993-2013 NVIDIA Corporation. All rights reserved. + * + * Please refer to the NVIDIA end user license agreement (EULA) associated + * with this source code for terms and conditions that govern your use of + * this software. Any use, reproduction, disclosure, or distribution of + * this software and related documentation outside the terms of the EULA + * is strictly prohibited. + * + */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef COMMON_HELPER_STRING_H_ +#define COMMON_HELPER_STRING_H_ + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper Functions +inline int stringRemoveDelimiter(char delimiter, const char *string) { + int string_start = 0; + + while (string[string_start] == delimiter) { + string_start++; + } + + if (string_start >= static_cast(strlen(string) - 1)) { + return 0; + } + + return string_start; +} + +inline int getFileExtension(char *filename, char **extension) { + int string_length = static_cast(strlen(filename)); + + while (filename[string_length--] != '.') { + if (string_length == 0) break; + } + + if (string_length > 0) string_length += 2; + + if (string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + +inline bool checkCmdLineFlag(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + + const char *equal_pos = strchr(string_argv, '='); + int argv_length = static_cast( + equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); + + int length = static_cast(strlen(string_ref)); + + if (length == argv_length && + !STRNCASECMP(string_argv, string_ref, length)) { + bFound = true; + continue; + } + } + } + + return bFound; +} + +// This function wraps the CUDA Driver API into a template function +template +inline bool getCmdLineArgumentValue(const int argc, const char **argv, + const char *string_ref, T *value) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i = argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + int value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } else { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + float value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = static_cast(atof(&string_argv[length + auto_inc])); + } else { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, + char **string_retval) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = const_cast(&argv[i][string_start]); + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + *string_retval = &string_argv[length + 1]; + bFound = true; + continue; + } + } + } + + if (!bFound) { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! @param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, + const char *executable_path) { + // defines a variable that is replaced with the name of the + // executable + + // Typical relative search paths to locate needed companion files (e.g. sample + // input data, or JIT source files) The origin for the relative search may be + // the .exe file, a .bat file launching an .exe, a browser .exe launching the + // .exe or .bat, etc + const char *searchPath[] = { + "./", // same dir + "./_data_files/", + "./common/", // "/common/" subdir + "./common/data/", // "/common/data/" subdir + "./data/", // "/data/" subdir + "./src/", // "/src/" subdir + "./src//data/", // "/src//data/" subdir + "./inc/", // "/inc/" subdir + "./0_Simple/", // "/0_Simple/" subdir + "./1_Utilities/", // "/1_Utilities/" subdir + "./2_Graphics/", // "/2_Graphics/" subdir + "./3_Imaging/", // "/3_Imaging/" subdir + "./4_Finance/", // "/4_Finance/" subdir + "./5_Simulations/", // "/5_Simulations/" subdir + "./6_Advanced/", // "/6_Advanced/" subdir + "./7_CUDALibraries/", // "/7_CUDALibraries/" subdir + "./8_Android/", // "/8_Android/" subdir + "./samples/", // "/samples/" subdir + + "./0_Simple//data/", // "/0_Simple//data/" + // subdir + "./1_Utilities//data/", // "/1_Utilities//data/" + // subdir + "./2_Graphics//data/", // "/2_Graphics//data/" + // subdir + "./3_Imaging//data/", // "/3_Imaging//data/" + // subdir + "./4_Finance//data/", // "/4_Finance//data/" + // subdir + "./5_Simulations//data/", // "/5_Simulations//data/" + // subdir + "./6_Advanced//data/", // "/6_Advanced//data/" + // subdir + "./7_CUDALibraries//", // "/7_CUDALibraries//" + // subdir + "./7_CUDALibraries//data/", // "/7_CUDALibraries//data/" + // subdir + + "../", // up 1 in tree + "../common/", // up 1 in tree, "/common/" subdir + "../common/data/", // up 1 in tree, "/common/data/" subdir + "../data/", // up 1 in tree, "/data/" subdir + "../src/", // up 1 in tree, "/src/" subdir + "../inc/", // up 1 in tree, "/inc/" subdir + + "../0_Simple//data/", // up 1 in tree, + // "/0_Simple//" + // subdir + "../1_Utilities//data/", // up 1 in tree, + // "/1_Utilities//" + // subdir + "../2_Graphics//data/", // up 1 in tree, + // "/2_Graphics//" + // subdir + "../3_Imaging//data/", // up 1 in tree, + // "/3_Imaging//" + // subdir + "../4_Finance//data/", // up 1 in tree, + // "/4_Finance//" + // subdir + "../5_Simulations//data/", // up 1 in tree, + // "/5_Simulations//" + // subdir + "../6_Advanced//data/", // up 1 in tree, + // "/6_Advanced//" + // subdir + "../7_CUDALibraries//data/", // up 1 in tree, + // "/7_CUDALibraries//" + // subdir + "../8_Android//data/", // up 1 in tree, + // "/8_Android//" + // subdir + "../samples//data/", // up 1 in tree, + // "/samples//" + // subdir + "../../", // up 2 in tree + "../../common/", // up 2 in tree, "/common/" subdir + "../../common/data/", // up 2 in tree, "/common/data/" subdir + "../../data/", // up 2 in tree, "/data/" subdir + "../../src/", // up 2 in tree, "/src/" subdir + "../../inc/", // up 2 in tree, "/inc/" subdir + "../../sandbox//data/", // up 2 in tree, + // "/sandbox//" + // subdir + "../../0_Simple//data/", // up 2 in tree, + // "/0_Simple//" + // subdir + "../../1_Utilities//data/", // up 2 in tree, + // "/1_Utilities//" + // subdir + "../../2_Graphics//data/", // up 2 in tree, + // "/2_Graphics//" + // subdir + "../../3_Imaging//data/", // up 2 in tree, + // "/3_Imaging//" + // subdir + "../../4_Finance//data/", // up 2 in tree, + // "/4_Finance//" + // subdir + "../../5_Simulations//data/", // up 2 in tree, + // "/5_Simulations//" + // subdir + "../../6_Advanced//data/", // up 2 in tree, + // "/6_Advanced//" + // subdir + "../../7_CUDALibraries//data/", // up 2 in tree, + // "/7_CUDALibraries//" + // subdir + "../../8_Android//data/", // up 2 in tree, + // "/8_Android//" + // subdir + "../../samples//data/", // up 2 in tree, + // "/samples//" + // subdir + "../../../", // up 3 in tree + "../../../src//", // up 3 in tree, + // "/src//" subdir + "../../../src//data/", // up 3 in tree, + // "/src//data/" + // subdir + "../../../src//src/", // up 3 in tree, + // "/src//src/" + // subdir + "../../../src//inc/", // up 3 in tree, + // "/src//inc/" + // subdir + "../../../sandbox//", // up 3 in tree, + // "/sandbox//" + // subdir + "../../../sandbox//data/", // up 3 in tree, + // "/sandbox//data/" + // subdir + "../../../sandbox//src/", // up 3 in tree, + // "/sandbox//src/" + // subdir + "../../../sandbox//inc/", // up 3 in tree, + // "/sandbox//inc/" + // subdir + "../../../0_Simple//data/", // up 3 in tree, + // "/0_Simple//" + // subdir + "../../../1_Utilities//data/", // up 3 in tree, + // "/1_Utilities//" + // subdir + "../../../2_Graphics//data/", // up 3 in tree, + // "/2_Graphics//" + // subdir + "../../../3_Imaging//data/", // up 3 in tree, + // "/3_Imaging//" + // subdir + "../../../4_Finance//data/", // up 3 in tree, + // "/4_Finance//" + // subdir + "../../../5_Simulations//data/", // up 3 in tree, + // "/5_Simulations//" + // subdir + "../../../6_Advanced//data/", // up 3 in tree, + // "/6_Advanced//" + // subdir + "../../../7_CUDALibraries//data/", // up 3 in tree, + // "/7_CUDALibraries//" + // subdir + "../../../8_Android//data/", // up 3 in tree, + // "/8_Android//" + // subdir + "../../../0_Simple//", // up 3 in tree, + // "/0_Simple//" + // subdir + "../../../1_Utilities//", // up 3 in tree, + // "/1_Utilities//" + // subdir + "../../../2_Graphics//", // up 3 in tree, + // "/2_Graphics//" + // subdir + "../../../3_Imaging//", // up 3 in tree, + // "/3_Imaging//" + // subdir + "../../../4_Finance//", // up 3 in tree, + // "/4_Finance//" + // subdir + "../../../5_Simulations//", // up 3 in tree, + // "/5_Simulations//" + // subdir + "../../../6_Advanced//", // up 3 in tree, + // "/6_Advanced//" + // subdir + "../../../7_CUDALibraries//", // up 3 in tree, + // "/7_CUDALibraries//" + // subdir + "../../../8_Android//", // up 3 in tree, + // "/8_Android//" + // subdir + "../../../samples//data/", // up 3 in tree, + // "/samples//" + // subdir + "../../../common/", // up 3 in tree, "../../../common/" subdir + "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir + "../../../data/", // up 3 in tree, "../../../data/" subdir + "../../../../", // up 4 in tree + "../../../../src//", // up 4 in tree, + // "/src//" subdir + "../../../../src//data/", // up 4 in tree, + // "/src//data/" + // subdir + "../../../../src//src/", // up 4 in tree, + // "/src//src/" + // subdir + "../../../../src//inc/", // up 4 in tree, + // "/src//inc/" + // subdir + "../../../../sandbox//", // up 4 in tree, + // "/sandbox//" + // subdir + "../../../../sandbox//data/", // up 4 in tree, + // "/sandbox//data/" + // subdir + "../../../../sandbox//src/", // up 4 in tree, + // "/sandbox//src/" + // subdir + "../../../../sandbox//inc/", // up 4 in tree, + // "/sandbox//inc/" + // subdir + "../../../../0_Simple//data/", // up 4 in tree, + // "/0_Simple//" + // subdir + "../../../../1_Utilities//data/", // up 4 in tree, + // "/1_Utilities//" + // subdir + "../../../../2_Graphics//data/", // up 4 in tree, + // "/2_Graphics//" + // subdir + "../../../../3_Imaging//data/", // up 4 in tree, + // "/3_Imaging//" + // subdir + "../../../../4_Finance//data/", // up 4 in tree, + // "/4_Finance//" + // subdir + "../../../../5_Simulations//data/", // up 4 in tree, + // "/5_Simulations//" + // subdir + "../../../../6_Advanced//data/", // up 4 in tree, + // "/6_Advanced//" + // subdir + "../../../../7_CUDALibraries//data/", // up 4 in tree, + // "/7_CUDALibraries//" + // subdir + "../../../../8_Android//data/", // up 4 in tree, + // "/8_Android//" + // subdir + "../../../../0_Simple//", // up 4 in tree, + // "/0_Simple//" + // subdir + "../../../../1_Utilities//", // up 4 in tree, + // "/1_Utilities//" + // subdir + "../../../../2_Graphics//", // up 4 in tree, + // "/2_Graphics//" + // subdir + "../../../../3_Imaging//", // up 4 in tree, + // "/3_Imaging//" + // subdir + "../../../../4_Finance//", // up 4 in tree, + // "/4_Finance//" + // subdir + "../../../../5_Simulations//", // up 4 in tree, + // "/5_Simulations//" + // subdir + "../../../../6_Advanced//", // up 4 in tree, + // "/6_Advanced//" + // subdir + "../../../../7_CUDALibraries//", // up 4 in tree, + // "/7_CUDALibraries//" + // subdir + "../../../../8_Android//", // up 4 in tree, + // "/8_Android//" + // subdir + "../../../../samples//data/", // up 4 in tree, + // "/samples//" + // subdir + "../../../../common/", // up 4 in tree, "../../../common/" subdir + "../../../../common/data/", // up 4 in tree, "../../../common/data/" + // subdir + "../../../../data/", // up 4 in tree, "../../../data/" subdir + "../../../../../", // up 5 in tree + "../../../../../src//", // up 5 in tree, + // "/src//" + // subdir + "../../../../../src//data/", // up 5 in tree, + // "/src//data/" + // subdir + "../../../../../src//src/", // up 5 in tree, + // "/src//src/" + // subdir + "../../../../../src//inc/", // up 5 in tree, + // "/src//inc/" + // subdir + "../../../../../sandbox//", // up 5 in tree, + // "/sandbox//" + // subdir + "../../../../../sandbox//data/", // up 5 in tree, + // "/sandbox//data/" + // subdir + "../../../../../sandbox//src/", // up 5 in tree, + // "/sandbox//src/" + // subdir + "../../../../../sandbox//inc/", // up 5 in tree, + // "/sandbox//inc/" + // subdir + "../../../../../0_Simple//data/", // up 5 in tree, + // "/0_Simple//" + // subdir + "../../../../../1_Utilities//data/", // up 5 in tree, + // "/1_Utilities//" + // subdir + "../../../../../2_Graphics//data/", // up 5 in tree, + // "/2_Graphics//" + // subdir + "../../../../../3_Imaging//data/", // up 5 in tree, + // "/3_Imaging//" + // subdir + "../../../../../4_Finance//data/", // up 5 in tree, + // "/4_Finance//" + // subdir + "../../../../../5_Simulations//data/", // up 5 in tree, + // "/5_Simulations//" + // subdir + "../../../../../6_Advanced//data/", // up 5 in tree, + // "/6_Advanced//" + // subdir + "../../../../../7_CUDALibraries//data/", // up 5 in + // tree, + // "/7_CUDALibraries//" + // subdir + "../../../../../8_Android//data/", // up 5 in tree, + // "/8_Android//" + // subdir + "../../../../../samples//data/", // up 5 in tree, + // "/samples//" + // subdir + "../../../../../common/", // up 5 in tree, "../../../common/" subdir + "../../../../../common/data/", // up 5 in tree, "../../../common/data/" + // subdir + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + if (executable_path != 0) { + path.replace(executable_name_pos, strlen(""), + executable_name); + } else { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = reinterpret_cast(malloc(path.length() + 1)); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) { + fclose(fp); + } + } + + // File not found + return 0; +} + +#endif // COMMON_HELPER_STRING_H_ diff --git a/src/qfvm_gpu/cuda_utils/ticktock.h b/src/qfvm_gpu/cuda_utils/ticktock.h new file mode 100644 index 0000000..1adb8a9 --- /dev/null +++ b/src/qfvm_gpu/cuda_utils/ticktock.h @@ -0,0 +1,9 @@ +#pragma once + +//#include +//#define TICK(x) auto bench_##x = std::chrono::steady_clock::now(); +//#define TOCK(x) std::cout << #x ": " << std::chrono::duration_cast>(std::chrono::steady_clock::now() - bench_##x).count() << "s" << std::endl; + +#include +#define TICK(x) auto bench_##x = tbb::tick_count::now(); +#define TOCK(x) std::cout << #x ": " << (tbb::tick_count::now() - bench_##x).seconds() << "s" << std::endl; diff --git a/src/qfvm_gpu/custate_simu.cuh b/src/qfvm_gpu/custate_simu.cuh new file mode 100644 index 0000000..6456c7e --- /dev/null +++ b/src/qfvm_gpu/custate_simu.cuh @@ -0,0 +1,35 @@ +#pragma once +#include "cuda_statevector.cuh" +#include +#include +#include +#include "apply_gate_custate.cuh" + +void simulate_custate(Circuit & circuit, CudaStateVector & psi_d){ + size_t size = psi_d.size(); + int n = psi_d.num(); + for (auto gate : circuit.gates()){ + apply_gate_custate(psi_d.data(), gate, n); + } +} + +void simulate_custate(Circuit & circuit, StateVector & state){ + //initialize psi + state.set_num(circuit.qubit_num()); + size_t size = state.size(); + CudaStateVector psi_d(state); + + simulate_custate(circuit, psi_d); + cudaDeviceSynchronize(); + + //copy back + complex* psi = reinterpret_cast*>(psi_d.data()); + checkCudaErrors(cudaMemcpy(state.data(), psi, size*sizeof(complex), cudaMemcpyDeviceToHost)); + psi=nullptr; +} + +StateVector simulate_custate(Circuit & circuit){ + StateVector state(circuit.qubit_num()); + simulate_custate(circuit, state); + return std::move(state); +} \ No newline at end of file From 3baa9683899965df1bb95f3e1c5cb8bd8351b220 Mon Sep 17 00:00:00 2001 From: chensgit169 Date: Thu, 30 Nov 2023 16:11:43 +0800 Subject: [PATCH 4/5] fix: use lower case of instruction name --- quafu/visualisation/circuitPlot.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/quafu/visualisation/circuitPlot.py b/quafu/visualisation/circuitPlot.py index 4866ce7..dcc5eac 100644 --- a/quafu/visualisation/circuitPlot.py +++ b/quafu/visualisation/circuitPlot.py @@ -187,10 +187,9 @@ def __call__(self, plt.show() def _process_ins(self, ins: Instruction, append: bool = True): - name = ins.name - assert name in Instruction.ins_classes, 'If this should occur, please report a bug.' + name = ins.name.lower() + assert name in Instruction.ins_classes, 'Name: %s not registered, if this should occur, please report a bug.' % name - name = name.lower() _which = slice(np.min(ins.pos), np.max(ins.pos) + 1) depth = np.max(self.dorders[_which]) paras = ins.paras From bc7ea5f8550999d1529ff0b019bf7b8276d0ca9d Mon Sep 17 00:00:00 2001 From: chensgit169 Date: Thu, 30 Nov 2023 16:21:32 +0800 Subject: [PATCH 5/5] chore: add licence --- quafu/elements/oracle.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/quafu/elements/oracle.py b/quafu/elements/oracle.py index 2ce6e47..b49ebdf 100644 --- a/quafu/elements/oracle.py +++ b/quafu/elements/oracle.py @@ -1,3 +1,17 @@ +# (C) Copyright 2023 Beijing Academy of Quantum Information Sciences +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from abc import ABCMeta from quafu.elements import QuantumGate, Instruction from typing import Dict, Iterable, List