From 897e4b7fc065db7d906fddd6f5fdfe566a204854 Mon Sep 17 00:00:00 2001
From: chensgit169 <weichen191@mails.ucas.ac.cn>
Date: Thu, 30 Nov 2023 15:36:53 +0800
Subject: [PATCH 1/5] fet: use oracle in circuit

---
 quafu/circuits/quantum_circuit.py | 25 +++++--------------------
 1 file changed, 5 insertions(+), 20 deletions(-)

diff --git a/quafu/circuits/quantum_circuit.py b/quafu/circuits/quantum_circuit.py
index a73837f..7ec4f3b 100644
--- a/quafu/circuits/quantum_circuit.py
+++ b/quafu/circuits/quantum_circuit.py
@@ -381,27 +381,12 @@ def wrap_to_gate(self, name: str):
         """
         Wrap the circuit to a subclass of QuantumGate, create by metaclass.
         """
-        from quafu.elements.quantum_gate import customize_gate
+        from quafu.elements.oracle import customize_gate
+        from copy import deepcopy
 
-        gate_structure = []
-        qubit_mapping = {q: i for i, q in enumerate(self.used_qubits)}
-        for gate in self.gates:
-            if isinstance(gate, QuantumGate):
-                gate = copy.deepcopy(gate)
-                # TODO: handel control pos
-                if isinstance(gate.pos, int):
-                    gate.pos = qubit_mapping[gate.pos]
-                else:
-                    gate.pos = [qubit_mapping[p] for p in gate.pos]
-                gate_structure.append(gate)
-            else:
-                raise ValueError()
-
-        # TODO: error check
-
-        customized = customize_gate(cls_name=name.lower(),
-                                    qubit_num=len(self.used_qubits),
-                                    gate_structure=gate_structure)
+        # TODO: check validity of instructions
+        gate_structure = [deepcopy(ins) for ins in self.instructions]
+        customized = customize_gate(name, gate_structure, self.num)
         return customized
 
     def id(self, pos: int) -> "QuantumCircuit":

From 4aba7aa34844a6fd9a4adcd1d15c3caa17110a74 Mon Sep 17 00:00:00 2001
From: chensgit169 <weichen191@mails.ucas.ac.cn>
Date: Thu, 30 Nov 2023 15:37:21 +0800
Subject: [PATCH 2/5] chore: (mistake) remove redundant files

---
 src/qfvm/circuit.hpp                          |  241 ---
 src/qfvm/operators.hpp                        |   97 --
 src/qfvm/qasm.hpp                             |   42 -
 src/qfvm/qfvm.cpp                             |  164 --
 src/qfvm/simulator.hpp                        |  116 --
 src/qfvm/statevector.hpp                      | 1355 -----------------
 src/qfvm/types.hpp                            |   33 -
 src/qfvm/util.h                               |   99 --
 src/qfvm_gpu/apply_gate_custate.cuh           |   46 -
 src/qfvm_gpu/apply_gate_gpu.cuh               |  411 -----
 src/qfvm_gpu/cuda_simulator.cuh               |   91 --
 src/qfvm_gpu/cuda_statevector.cuh             |   58 -
 src/qfvm_gpu/cuda_utils/CudaTexture.h         |   39 -
 src/qfvm_gpu/cuda_utils/helper_cuda.h         |  953 ------------
 src/qfvm_gpu/cuda_utils/helper_custatevec.hpp |   31 -
 src/qfvm_gpu/cuda_utils/helper_string.h       |  683 ---------
 src/qfvm_gpu/cuda_utils/ticktock.h            |    9 -
 src/qfvm_gpu/custate_simu.cuh                 |   35 -
 18 files changed, 4503 deletions(-)
 delete mode 100644 src/qfvm/circuit.hpp
 delete mode 100644 src/qfvm/operators.hpp
 delete mode 100644 src/qfvm/qasm.hpp
 delete mode 100644 src/qfvm/qfvm.cpp
 delete mode 100644 src/qfvm/simulator.hpp
 delete mode 100644 src/qfvm/statevector.hpp
 delete mode 100644 src/qfvm/types.hpp
 delete mode 100644 src/qfvm/util.h
 delete mode 100644 src/qfvm_gpu/apply_gate_custate.cuh
 delete mode 100644 src/qfvm_gpu/apply_gate_gpu.cuh
 delete mode 100644 src/qfvm_gpu/cuda_simulator.cuh
 delete mode 100644 src/qfvm_gpu/cuda_statevector.cuh
 delete mode 100644 src/qfvm_gpu/cuda_utils/CudaTexture.h
 delete mode 100644 src/qfvm_gpu/cuda_utils/helper_cuda.h
 delete mode 100644 src/qfvm_gpu/cuda_utils/helper_custatevec.hpp
 delete mode 100644 src/qfvm_gpu/cuda_utils/helper_string.h
 delete mode 100644 src/qfvm_gpu/cuda_utils/ticktock.h
 delete mode 100644 src/qfvm_gpu/custate_simu.cuh

diff --git a/src/qfvm/circuit.hpp b/src/qfvm/circuit.hpp
deleted file mode 100644
index cd08bea..0000000
--- a/src/qfvm/circuit.hpp
+++ /dev/null
@@ -1,241 +0,0 @@
-#pragma once
-#include "operators.hpp"
-#include "qasm.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-#include <pybind11/eigen.h>
-#include "util.h"
-#include <Eigen/Core>
-#include <algorithm>
-#include <iostream>
-
-namespace py = pybind11;
-using namespace pybind11::literals;
-
-
-void check_operator(QuantumOperator &op){
-    std::cout << "-------------" << std::endl;
-
-    std::cout << "name: " << op.name() << std::endl;
-    std::cout << "pos: ";
-    Qfutil::printVector(op.positions());
-
-    std::cout << "paras: ";
-    Qfutil::printVector(op.paras());
-
-    std::cout << "control number: ";
-    std::cout << op.control_num() << std::endl;
-
-    std::cout << "matrix: " << std::endl;
-    std::cout << op.mat() << std::endl;
-
-    std::cout << "flatten matrix: " << std::endl;
-    auto mat = op.mat();
-    // Eigen::Map<Eigen::RowVectorXcd> v1(mat.data(), mat.size());
-    // std::cout << "v1: " << v1 << std::endl;
-    auto matv = mat.data();
-    for (auto i = 0;i < mat.size();i++){
-        std::cout << matv[i] << " ";
-    }
-    std::cout << std::endl;
-    std::cout << "-------------" << std::endl;
-}
-
-
-class Circuit{
-    private:
-        uint qubit_num_;  
-        vector<QuantumOperator> instructions_;
-        uint max_targe_num_;
-        uint cbit_num_;
-        // to sample count
-        vector<std::pair<uint,uint>> measure_vec_;
-        bool final_measure_ = true;
-
-    public:
-    Circuit();
-    explicit Circuit(uint qubit_num);
-    explicit Circuit(vector<QuantumOperator> &ops);
-    explicit Circuit(py::object const&pycircuit); 
-
-    void add_op(QuantumOperator &op);
-    void compress_instructions();
-    uint qubit_num() const { return qubit_num_; }
-    uint cbit_num() const { return cbit_num_; }
-    uint max_targe_num() const { return max_targe_num_; }
-    bool final_measure() const { return final_measure_; }
-    vector<QuantumOperator> gates();
-    vector<std::pair<uint,uint>> measure_vec() { return measure_vec_; }
-    vector<QuantumOperator> instructions() const { return instructions_; }
-    QuantumOperator from_pyops(py::object const &obj);
-};
-
-void Circuit::add_op(QuantumOperator &op){
-    for (pos_t pos : op.positions()){
-        if (pos > qubit_num_) {
-            throw "invalid position on quantum registers";
-        }
-        else{
-            instructions_.push_back(op);
-        }
-    }
-}
-
- Circuit::Circuit(){};
- Circuit::Circuit(uint qubit_num)
- :
- qubit_num_(qubit_num){ }
-
- Circuit::Circuit(vector<QuantumOperator> &ops)
- :
- instructions_(ops),
- max_targe_num_(0){
-    qubit_num_ = 0;
-    for (auto op : ops){
-        for (pos_t pos : op.positions()){
-            if (op.targe_num() > max_targe_num_)
-                max_targe_num_ = op.targe_num();
-            if (pos+1 > qubit_num_){ qubit_num_ = pos+1; }
-        }
-    }
-}
-
-vector<QuantumOperator> Circuit::gates(){
-    // provide gates for gpu and custate
-    std::vector<std::string> classics = {"measure", "cif", "reset"};
-    vector<QuantumOperator> gates;
-    for(auto op : instructions_){
-        if(std::find(classics.begin(), classics.end(), op.name()) == classics.end()){
-            gates.push_back(op);
-        }
-    }
-    return gates;
-}
-
-// Construct C++ operators from pygates
-QuantumOperator Circuit::from_pyops(py::object const &obj){
-    string name;
-    vector<pos_t> positions;
-    vector<pos_t> qbits;
-    vector<pos_t> cbits;
-    vector<double> paras;
-    uint control_num = 0;
-    RowMatrixXcd mat;
-    
-    name = obj.attr("name").attr("lower")().cast<string>();
-    if (!(name == "barrier" || name == "delay" || name == "id" || name == "measure" || name == "reset" || name == "cif"))
-    {
-        if (py::isinstance<py::list>(obj.attr("pos"))){
-            positions = obj.attr("pos").cast<vector<pos_t>>();
-        }
-        else if(py::isinstance<py::int_>(obj.attr("pos"))){
-            positions = vector<pos_t>{obj.attr("pos").cast<pos_t>()};
-        }
-
-        if (py::isinstance<py::list>(obj.attr("paras"))){
-            paras = obj.attr("paras").cast<vector<double>>();
-        }
-        else if(py::isinstance<py::float_>(obj.attr("paras")) || py::isinstance<py::int_>(obj.attr("paras"))){
-            paras = vector<double>{obj.attr("paras").cast<double>()};
-        }
-
-        if (py::hasattr(obj, "ctrls")){
-                control_num = py::len(obj.attr("ctrls"));
-        }
-        
-        //Reverse order for multi-target gate
-        if (py::hasattr(obj, "_targ_matrix")){
-                mat = obj.attr("get_targ_matrix")("reverse_order"_a=true).cast<RowMatrixXcd>();
-        }
-        else{ //Single target gate
-                mat = obj.attr("matrix").cast<RowMatrixXcd>();
-        }
-        return QuantumOperator(name, paras, positions, control_num, mat);
-
-    }else if(name == "measure"){
-        if (py::isinstance<py::list>(obj.attr("qbits"))){
-            qbits = obj.attr("qbits").cast<vector<pos_t>>();
-        }else if(py::isinstance<py::int_>(obj.attr("qbits"))){
-            qbits = vector<pos_t>{obj.attr("qbits").cast<pos_t>()};
-        }
-
-        if (py::isinstance<py::list>(obj.attr("cbits"))){
-            cbits = obj.attr("cbits").cast<vector<pos_t>>();
-        }else if(py::isinstance<py::int_>(obj.attr("cbits"))){
-            cbits = vector<pos_t>{obj.attr("cbits").cast<pos_t>()};
-        }
-        //record qbit-cbit measure map
-        for(uint i = 0; i < qbits.size(); i++){
-            measure_vec_.push_back(std::make_pair(qbits[i], cbits[i]));
-        }
-        return QuantumOperator(name, qbits, cbits);
-
-    }else if(name == "reset"){
-        if (py::isinstance<py::list>(obj.attr("pos"))){
-            positions = obj.attr("pos").cast<vector<pos_t>>();
-        }
-        else if(py::isinstance<py::int_>(obj.attr("pos"))){
-            positions = vector<pos_t>{obj.attr("pos").cast<pos_t>()};
-        }
-        return QuantumOperator(name, positions);
-
-    }else if(name == "cif"){
-        uint condition = 0;
-        vector<QuantumOperator> instructions;
-        if (py::isinstance<py::list>(obj.attr("cbits"))){
-            cbits = obj.attr("cbits").cast<vector<pos_t>>();
-        }else if(py::isinstance<py::int_>(obj.attr("cbits"))){
-            cbits = vector<pos_t>{obj.attr("cbits").cast<pos_t>()};
-        }
-
-        if(py::isinstance<py::int_>(obj.attr("condition"))){
-            condition = obj.attr("condition").cast<pos_t>();
-        }
-
-        // Recursively handdle instruction
-        if (py::isinstance<py::list>(obj.attr("instructions"))){
-            auto pyops = obj.attr("instructions");
-            for(auto pyop_h : pyops){
-                py::object pyop = py::reinterpret_borrow<py::object>(pyop_h);
-                QuantumOperator op = from_pyops(pyop);
-                if (op){
-                    if (op.targe_num() > max_targe_num_)
-                        max_targe_num_ = op.targe_num();
-                    instructions.push_back(std::move(op));
-                }        
-            }
-        }
-        return QuantumOperator(name, cbits, condition, instructions); 
-
-    }else{
-        return QuantumOperator();
-    }
-   
-}
-
-Circuit::Circuit(py::object const&pycircuit)
-:
-max_targe_num_(0)
-{
-    // auto pygates = pycircuit.attr("gates");
-    auto pyops = pycircuit.attr("instructions");
-    auto used_qubits = pycircuit.attr("used_qubits").cast<vector<pos_t>>();
-    cbit_num_ =  pycircuit.attr("cbits_num").cast<uint>();
-    qubit_num_ = *std::max_element(used_qubits.begin(), used_qubits.end())+1;
-    // judge wheather op qubit after measure
-    bool measured = false;
-    for (auto pyop_h : pyops){
-        py::object pyop = py::reinterpret_borrow<py::object>(pyop_h);
-        QuantumOperator op = from_pyops(pyop);
-        if (op){
-            if (op.targe_num() > max_targe_num_)
-                max_targe_num_ = op.targe_num();
-            if(op.name() == "measure") {measured = true;}
-            else if(measured == true) {final_measure_ = false; } 
-            instructions_.push_back(std::move(op));
-        }        
-    }
-} 
-
-void Circuit::compress_instructions(){}
diff --git a/src/qfvm/operators.hpp b/src/qfvm/operators.hpp
deleted file mode 100644
index cb9fd66..0000000
--- a/src/qfvm/operators.hpp
+++ /dev/null
@@ -1,97 +0,0 @@
-#pragma once
-
-#include <iostream>
-#include "statevector.hpp"
-
-class QuantumOperator{
-    protected:
-        string name_;
-        vector<pos_t> positions_;
-        vector<double> paras_;
-        uint control_num_;
-        uint targe_num_;  
-        bool diag_;
-        bool real_;
-        RowMatrixXcd mat_;
-        vector<pos_t> qbits_;
-        vector<pos_t> cbits_;
-        vector<QuantumOperator> instructions_;
-        uint condition_;
-    public:
-        //Constructor
-        QuantumOperator();
-        QuantumOperator(string name, vector<pos_t> const &qbits);
-        QuantumOperator(string name, vector<pos_t> const &qbits, vector<pos_t> const &cbits);
-        QuantumOperator(string name, vector<pos_t> const &cbits, const uint condition, vector<QuantumOperator> const &ins);
-        QuantumOperator(string name, vector<double> paras, vector<pos_t> const &control_qubits, vector<pos_t> const &targe_qubits, RowMatrixXcd const &mat, bool diag=false, bool real=false);
-        QuantumOperator(string name,vector<double> paras, vector<pos_t> const &positions, uint control_num, RowMatrixXcd const &mat, bool diag=false, bool real=false);
-
-        //data accessor
-        string name() const {return name_;}
-        vector<double> paras() const {return paras_;}
-        bool has_control() const{return control_num_ == 0 ? false : true;}
-        bool is_real() const{ return real_; }
-        bool is_diag() const{ return diag_; }
-        RowMatrixXcd mat() const { return mat_;}
-        uint control_num() const { return control_num_; } 
-        uint targe_num() const { return targe_num_; }
-        uint condition() const { return condition_; }
-        vector<pos_t> positions(){ return positions_; }
-        explicit operator bool() const {
-            return !(name_ == "empty");
-        }
-        vector<pos_t> qbits(){ return qbits_; }
-        vector<pos_t> cbits(){ return cbits_; }
-        vector<QuantumOperator> instructions(){ return instructions_; }
-        //Apply method
-        virtual void apply_to_state(StateVector<double> & state){ };
-};
-
-
-QuantumOperator::QuantumOperator() : name_("empty"){ };
-
-QuantumOperator::QuantumOperator(string name, vector<pos_t> const &qbits)
-:
-name_(name),
-targe_num_(0),
-qbits_(qbits){}
-
-QuantumOperator::QuantumOperator(string name, vector<pos_t> const &qbits, vector<pos_t> const &cbits)
-:
-name_(name),
-targe_num_(0),
-qbits_(qbits),
-cbits_(cbits){}
-
-QuantumOperator::QuantumOperator(string name, vector<pos_t> const &cbits, const uint condition, vector<QuantumOperator> const &ins)
-:
-name_(name),
-targe_num_(0),
-cbits_(cbits),
-instructions_(ins),
-condition_(condition){}
-
-QuantumOperator::QuantumOperator(string name, vector<double> paras, vector<pos_t> const &positions, uint control_num, RowMatrixXcd const &mat, bool diag, bool real)
-:
-name_(name),
-paras_(paras),
-positions_(positions),
-control_num_(control_num),
-targe_num_(positions.size()-control_num),
-diag_(diag),
-real_(real),
-mat_(mat){ }
-
-QuantumOperator::QuantumOperator(string name, vector<double> paras, vector<pos_t> const &control_qubits, vector<pos_t> const &targe_qubits, RowMatrixXcd const &mat, bool diag, bool real)
-:
-name_(name),
-paras_(paras),
-diag_(diag),
-real_(real),
-mat_(mat){
-    positions_ = control_qubits;
-    positions_.insert(positions_.end(), targe_qubits.begin(), targe_qubits.end());
-    control_num_ = control_qubits.size();
-    targe_num_ = targe_qubits.size();
-}
-
diff --git a/src/qfvm/qasm.hpp b/src/qfvm/qasm.hpp
deleted file mode 100644
index e0a2285..0000000
--- a/src/qfvm/qasm.hpp
+++ /dev/null
@@ -1,42 +0,0 @@
-#pragma once
-
-#include <unordered_map>
-#include "types.hpp"
-#include "util.h"
-
-using Qfutil::split_string;
-using Qfutil::find_numbers;
-#define Pair(name) {#name, Opname::name}
-
-enum class Opname{
-    creg, x, y, z, h, s, sdg, t, tdg, p, rx, ry, rz, cnot, cx, cz, crx, cp, ccx, toffoli, swap, iswap, rxx, ryy, rzz, measure, reset, cif
-};
-
-std::unordered_map<string, Opname> OPMAP{Pair(creg), Pair(x), Pair(y), Pair(z), Pair(h), Pair(s), Pair(sdg), Pair(t),
-                            Pair(tdg), Pair(p), Pair(rx), Pair(ry), Pair(rz), Pair(cnot), Pair(cx), Pair(cz), 
-                            Pair(crx), Pair(cp), Pair(ccx), Pair(swap), Pair(iswap), Pair(rxx), Pair(ryy), 
-                            Pair(rzz), Pair(measure), Pair(reset), Pair(cif)};
-
-struct Operation{
-    string name;
-    vector<pos_t> positions;
-    vector<double> params; 
-
-    void print_info(){
-        std::cout << "name " << name << std::endl;
-        std::cout << "positions: ";
-        for (auto pos : positions){
-            std::cout << pos << " ";
-        }
-        std::cout << std::endl;
-
-        if (params.size() > 0){
-        printf("parameters: ");
-            for (auto para : params){
-                printf("%.6f ", para);
-            }
-        }
-        printf("\n");
-        printf("-----\n");
-    }
-} ;
diff --git a/src/qfvm/qfvm.cpp b/src/qfvm/qfvm.cpp
deleted file mode 100644
index f318da6..0000000
--- a/src/qfvm/qfvm.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-#include "simulator.hpp"
-#include <iostream>
-#include <random>
-#ifdef _USE_GPU
-#include <cuda_simulator.cuh>
-#endif
-
-#ifdef _USE_CUQUANTUM
-#include <custate_simu.cuh>
-#endif
-
-namespace py = pybind11;
-
-template <typename T>
-py::array_t<T> to_numpy(const std::tuple<T*, size_t> &src) {
-    auto src_ptr = std::get<0>(src);
-    auto src_size = std::get<1>(src);
-
-    auto capsule = py::capsule(src_ptr, [](void* p) {
-        delete [] reinterpret_cast<T*>(p);
-    });
-    return py::array_t<T>(
-        src_size,
-        src_ptr,
-        capsule
-    );
-}
-
-std::pair<std::map<uint, uint>, py::array_t<complex<double>> > simulate_circuit(py::object const&pycircuit, py::array_t<complex<double>> &np_inputstate, const int &shots){
-    auto circuit = Circuit(pycircuit);
-    py::buffer_info buf = np_inputstate.request();
-    auto* data_ptr = reinterpret_cast<std::complex<double>*>(buf.ptr);
-    size_t data_size = buf.size;
-    // If measure all at the end, simulate once
-    uint actual_shots = shots;
-    if (circuit.final_measure()) actual_shots = 1;
-    StateVector<double> global_state;
-    vector<std::pair<uint,uint>>measures = circuit.measure_vec();
-    std::map<uint,bool>cbit_measured;
-    for(auto &pair: measures){
-        cbit_measured[pair.second] = true;        
-    }
-    // Store outcome's count
-    std::map<uint, uint> outcount;
-    for(uint i =0; i < actual_shots; i++){
-        StateVector<double> state;
-        if(data_size == 0){
-            simulate(circuit, state);
-        }else{
-            //deepcopy state
-            vector<std::complex<double>> data_copy(data_ptr, data_ptr + data_size);
-            state = std::move(StateVector<double>(data_copy.data(), data_copy.size()));
-            simulate(circuit, state);
-        }
-        if(!circuit.final_measure()){
-            // store reg
-            vector<uint> tmpcreg = state.creg();
-            uint outcome = 0;
-            for(uint j=0;j<tmpcreg.size();j++){
-                if(cbit_measured.find(j) == cbit_measured.end()) continue; 
-                outcome *= 2;
-                outcome += tmpcreg[j];
-            }
-            if(outcount.find(outcome) != outcount.end()) outcount[outcome]++;
-            else outcount[outcome] = 1;
-        }
-        if (circuit.final_measure() || i == actual_shots-1) global_state = std::move(state);
-    }
-    // sample outcome if final_measure is true
-    if(circuit.final_measure() && !measures.empty()){
-        vector<uint> tmpcount(global_state.size(), 0);
-        vector<double> probs = global_state.probabilities();
-        std::random_device rd;
-        std::mt19937 global_rng(rd());
-        for(uint i = 0; i < shots; i++){
-            uint outcome = std::discrete_distribution<uint>(probs.begin(), probs.end())(global_rng);
-            tmpcount[outcome]++;
-        }
-        // map to reg
-        for(uint i = 0; i < global_state.size(); i++){
-            if(tmpcount[i] == 0) continue;
-            vector<uint> tmpcreg(global_state.cbit_num(), 0); 
-            vector<uint> tmpout = int2vec(i, 2);
-            if(tmpout.size() < global_state.num())
-                tmpout.resize(global_state.num());
-            for(auto &pair: measures){
-                tmpcreg[pair.second] = tmpout[pair.first];
-            }
-            uint outcome = 0;
-            for(uint j=0;j<tmpcreg.size();j++){
-                if(cbit_measured.find(j) == cbit_measured.end()) continue; 
-                outcome *= 2;
-                outcome += tmpcreg[j];
-            }
-            if(outcount.find(outcome) != outcount.end()) outcount[outcome] += tmpcount[i];
-            else outcount[outcome] = tmpcount[i];
-        }
-    } 
-    // return 
-    if(data_size == 0) return std::make_pair(outcount, to_numpy(global_state.move_data_to_python()));
-    else return std::make_pair(outcount, np_inputstate);  
-}
-
-#ifdef _USE_GPU
-py::object simulate_circuit_gpu(py::object const&pycircuit, py::array_t<complex<double>> &np_inputstate){
-    auto circuit = Circuit(pycircuit);
-    py::buffer_info buf = np_inputstate.request();
-    auto* data_ptr = reinterpret_cast<std::complex<double>*>(buf.ptr);
-    size_t data_size = buf.size;
-
-
-    if (data_size == 0){
-        StateVector<double> state;
-        simulate_gpu(circuit, state);
-        return to_numpy(state.move_data_to_python());
-    }
-    else{
-      StateVector<double> state(data_ptr, buf.size);
-      simulate_gpu(circuit, state);
-      state.move_data_to_python();
-      return np_inputstate;
-    }
-}
-#endif
-
-#ifdef _USE_CUQUANTUM
-py::object simulate_circuit_custate(py::object const&pycircuit, py::array_t<complex<double>> &np_inputstate){
-    auto circuit = Circuit(pycircuit);
-    py::buffer_info buf = np_inputstate.request();
-    auto* data_ptr = reinterpret_cast<std::complex<double>*>(buf.ptr);
-    size_t data_size = buf.size;
-
-
-    if (data_size == 0){
-        StateVector<double> state;
-        simulate_custate(circuit, state);
-        return to_numpy(state.move_data_to_python());
-    }
-    else{
-      StateVector<double> state(data_ptr, buf.size);
-      simulate_custate(circuit, state);
-      state.move_data_to_python();
-      return np_inputstate;
-    }
-}
-#endif
-
-
-
-PYBIND11_MODULE(qfvm, m) {
-    m.doc() = "Qfvm simulator";
-    m.def("simulate_circuit", &simulate_circuit, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t<complex<double>>(0), py::arg("shots"));
-
-    #ifdef _USE_GPU
-     m.def("simulate_circuit_gpu", &simulate_circuit_gpu, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t<complex<double>>(0));
-    #endif
-
-    #ifdef _USE_CUQUANTUM
-    m.def("simulate_circuit_custate", &simulate_circuit_custate, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t<complex<double>>(0));
-    #endif
-}
-
diff --git a/src/qfvm/simulator.hpp b/src/qfvm/simulator.hpp
deleted file mode 100644
index 1499922..0000000
--- a/src/qfvm/simulator.hpp
+++ /dev/null
@@ -1,116 +0,0 @@
-#pragma once
-
-#include "statevector.hpp"
-#include "circuit.hpp"
-
-void apply_op(QuantumOperator &op, StateVector<data_t> &state){
-    bool matched = false; 
-    switch (OPMAP[op.name()]){
-            //Named gate
-        case Opname::x:
-            state.apply_x(op.positions()[0]);
-            break;
-        case Opname::y:
-            state.apply_y(op.positions()[0]);
-            break;
-        case Opname::z:
-            state.apply_z(op.positions()[0]);
-            break;
-        case Opname::h:
-            state.apply_h(op.positions()[0]);
-            break;
-        case Opname::s:
-            state.apply_s(op.positions()[0]);
-            break;
-        case Opname::sdg:
-            state.apply_sdag(op.positions()[0]);
-            break;
-        case Opname::t:
-            state.apply_t(op.positions()[0]);
-            break;
-        case Opname::tdg:
-            state.apply_tdag(op.positions()[0]);
-            break;
-        case Opname::p:
-            state.apply_p(op.positions()[0], op.paras()[0]);
-            break;
-        case Opname::rx:
-            state.apply_rx(op.positions()[0], op.paras()[0]);
-            break;
-        case Opname::ry:
-            state.apply_ry(op.positions()[0], op.paras()[0]);
-            break;
-        case Opname::rz:
-            state.apply_rz(op.positions()[0], op.paras()[0]);
-            break;
-        case Opname::cx:
-            state.apply_cnot(op.positions()[0], op.positions()[1]);
-            break;
-        case Opname::cnot:
-            state.apply_cnot(op.positions()[0], op.positions()[1]);
-            break;
-        case Opname::cp:
-            state.apply_cp(op.positions()[0], op.positions()[1], op.paras()[0]);
-            break;
-        case Opname::cz:
-            state.apply_cz(op.positions()[0], op.positions()[1]);
-            break;
-        case Opname::ccx:
-            state.apply_ccx(op.positions()[0], op.positions()[1],  op.positions()[2]);
-            break;
-        case Opname::toffoli:
-            state.apply_ccx(op.positions()[0], op.positions()[1],  op.positions()[2]);
-        case Opname::rzz:
-            state.apply_cnot(op.positions()[0], op.positions()[1]);
-            state.apply_rz(op.positions()[1], op.paras()[0]);
-            state.apply_cnot(op.positions()[0], op.positions()[1]);
-            break;
-        case Opname::measure:
-            state.apply_measure(op.qbits(), op.cbits());
-            break;
-        case Opname::reset:
-            state.apply_reset(op.qbits());
-            break;
-        case Opname::cif:
-            // check cbits and condition
-            matched = state.check_cif(op.cbits(), op.condition());
-            // apply op in instructions
-            if(matched){
-                for(auto op_h :op.instructions()){
-                    apply_op(op_h, state);
-                }
-            }
-            break;
-        //Other general gate
-        default:
-        {
-            if (op.targe_num() == 1){
-                auto mat_temp = op.mat();
-                complex<double> *mat = mat_temp.data();
-                if (op.control_num() == 0){
-                    state.apply_one_targe_gate_general<0>(op.positions(), mat);
-                }else if (op.control_num() == 1){
-                    state.apply_one_targe_gate_general<1>(op.positions(), mat);
-                }else{
-                    state.apply_one_targe_gate_general<2>(op.positions(), mat);
-                }
-            }else if(op.targe_num() > 1){
-                state.apply_multi_targe_gate_general(op.positions(), op.control_num(), op.mat());
-            }else{
-                throw "Invalid target number";
-            }
-        }
-    }
-}
-
-void simulate(Circuit const& circuit, StateVector<data_t> & state){
-    state.set_num(circuit.qubit_num());
-    state.set_creg(circuit.cbit_num());
-    // skip measure and handle it in qfvm.cpp 
-    bool skip_measure = circuit.final_measure();
-    for (auto op : circuit.instructions()){
-        if(skip_measure == true && op.name() == "measure") continue;
-        apply_op(op , state);
-    }
-}
-
diff --git a/src/qfvm/statevector.hpp b/src/qfvm/statevector.hpp
deleted file mode 100644
index 99cf5f7..0000000
--- a/src/qfvm/statevector.hpp
+++ /dev/null
@@ -1,1355 +0,0 @@
-#pragma once
-#include "types.hpp"
-#include "util.h"
-#include <stdlib.h>
-#include <iostream>
-#include <cmath>
-#include <omp.h>
-#include <functional>
-#include <algorithm>
-#include <random>
-#ifdef USE_SIMD
-#ifdef _MSC_VER
-#include <intrin.h>
-#else
-#include <x86intrin.h>
-#endif
-#endif
-
-template <class real_t = double>
-class StateVector{
-    private:
-        uint num_;
-         // classical bit
-        uint cbit_num_;  
-        vector<uint> creg_;
-        size_t size_;
-        std::unique_ptr<complex<real_t>[]> data_;
-        //random engine
-        std::mt19937_64 rng_;
-
-    public:
-        //construct function
-        StateVector();
-        explicit StateVector(uint num);
-        explicit StateVector(complex<real_t> *data, size_t data_size);
-        //move assign
-        // StateVector& operator=(StateVector&& other){
-        //     if(this != &other){
-        //         data_ = std::move(other.data_);
-        //         creg_ = std::move(other.creg_);
-        //         num_ = other.num_;
-        //         cbit_num_ = other.cbit_num_;
-        //         size_ = other.size_;
-
-        //     }
-        //     return *this;
-        // }
-
-        //Named gate function
-        void apply_x(pos_t pos);
-        void apply_y(pos_t pos);
-        void apply_z(pos_t pos);
-        void apply_h(pos_t pos);
-        void apply_s(pos_t pos);
-        void apply_sdag(pos_t pos);
-        void apply_t(pos_t pos);
-        void apply_tdag(pos_t pos);
-        void apply_p(pos_t pos, real_t phase);
-        void apply_rx(pos_t pos, real_t theta);
-        void apply_ry(pos_t pos, real_t theta);
-        void apply_rz(pos_t pos, real_t theta);
-        void apply_cnot(pos_t control, pos_t targe);
-        void apply_cz(pos_t control, pos_t targe);
-        void apply_cp(pos_t control, pos_t targe, real_t phase);
-        void apply_crx(pos_t control, pos_t targe,  real_t theta);
-        void apply_cry(pos_t control, pos_t targe,  real_t theta);
-        void apply_ccx(pos_t control1, pos_t control2, pos_t targe);
-        void apply_swap(pos_t q1, pos_t q2);
-
-        //General implementation
-        //One-target gate, ctrl_num equal 2 represent multi-controlled gate
-        template<int ctrl_num>
-        void apply_one_targe_gate_general(vector<pos_t> const& posv, complex<double> *mat);
-        template<int ctrl_num>
-        void apply_one_targe_gate_diag(vector<pos_t> const& posv, complex<double> *mat);
-        template<int ctrl_num>
-        void apply_one_targe_gate_real(vector<pos_t> const& posv, complex<double> *mat);
-        template<int ctrl_num>
-        void apply_one_targe_gate_x(vector<pos_t> const& posv);
-
-        //Multiple-target gate
-        void apply_multi_targe_gate_general(vector<pos_t> const& posv, uint control_num, RowMatrixXcd const&mat);
-
-        // Measure and Reset
-        std::pair<uint, double> sample_measure_probs(vector<pos_t> const& qbits);
-        vector<double> probabilities() const;
-        void apply_diagonal_matrix(vector<pos_t> const& qbits, vector<std::complex<double> > const& mdiag);
-        void update(vector<pos_t> const& qbits, const uint final_state, const uint meas_state, const double meas_prob);
-        void apply_measure(vector<pos_t> const& qbits,const vector<pos_t> &cbits);
-        void apply_reset(vector<pos_t> const& qbits);
-
-        // cif check
-        bool check_cif(const vector<pos_t> &cbits, const uint condition);
-
-        complex<real_t> operator[] (size_t j) const ;
-        void set_num(uint num);
-        void set_creg(uint num){
-            if(num > 0){
-                cbit_num_ = num;
-                creg_.resize(cbit_num_, 0);
-            }else{
-                throw std::logic_error("The number of cbit must be positive.");
-            }
-        }
-
-        vector<uint> creg(){
-            return creg_;
-        }
-
-        void set_rng(){
-            std::random_device rd;
-            rng_.seed(rd());
-        }
-
-        void print_state();
-        std::tuple<std::complex<real_t>*, size_t> move_data_to_python() {
-            auto data_ptr = data_.release();
-            return std::make_tuple(std::move(data_ptr), size_);
-        }
-        
-        complex<real_t>* data(){ return data_.get(); }
-        size_t size(){ return size_; }
-        uint num(){ return num_; }
-        uint cbit_num(){ return cbit_num_; }
-};
-
-
-//////// constructors ///////
-
-template <class real_t>
-StateVector<real_t>::StateVector(uint num)
-: num_(num),
-size_(1ULL<<num)
-{   data_ = std::make_unique<complex<real_t>[]>(size_);
-    data_[0] = complex<real_t>(1., 0);
-};
-
-template <class real_t>
-StateVector<real_t>::StateVector() : StateVector(0){ }
-
-template <class real_t>
-StateVector<real_t>::StateVector(complex<real_t> *data, size_t data_size)
-:
-data_(data),
-size_(data_size)
-{   
-    num_ = static_cast<int>(std::log2(size_));
-}
-
-
-
-//// useful functions /////
-template <class real_t>
-std::complex<real_t> StateVector<real_t>::operator[] (size_t j) const{
-    return data_[j];
-}
-
-template <class real_t>
-void StateVector<real_t>::set_num(uint num){
-    if (num_ > 0) {
-        // Initialized from statevector,
-        // should not resize
-        return;
-    }
-    num_ = num;
-
-    if (size_ != 1ULL << num) {
-        data_.reset();
-        size_ = 1ULL << num;
-        data_ = std::make_unique<complex<real_t>[]>(size_);
-        data_[0] = complex<real_t>(1, 0);
-    }
-}
-template <class real_t>
-bool StateVector<real_t>::check_cif(const vector<pos_t> &cbits, const uint condition){
-    uint out = 0;
-    for(uint i = 0; i < cbits.size(); i++){
-        out *= 2;
-        out += creg_[cbits[i]];
-    }
-    return out == condition; 
-}
-
-template <class real_t>
-void StateVector<real_t>::print_state(){
-    std::cout << "state_data: ";
-    for (auto i=0;i<size_;i++){
-        std::cout << data_[i] << " ";
-    }
-    std::cout << std::endl;
-}
-
-
-////// apply gate ////////
-
-template <class real_t>
-void StateVector<real_t>::apply_x(pos_t pos){
-    const size_t offset = 1<<pos;
-    const size_t rsize = size_>>1;
-     if (pos == 0){ //single step
-#ifdef USE_SIMD
-#pragma omp parallel for
-         for(omp_i j = 0;j < size_;j+=2){
-             double* ptr = (double*)(data_.get() + j);
-            __m256d data = _mm256_loadu_pd(ptr);
-            data = _mm256_permute4x64_pd(data, 78);
-            _mm256_storeu_pd(ptr, data);
-         }
-#else
-#pragma omp parallel for
-            for(omp_i j = 0;j < size_;j+=2){
-                std::swap(data_[j], data_[j+1]);
-            }
-#endif
-     }
-     else{
-#ifdef USE_SIMD
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-            double* ptr0 = (double*)(data_.get()+ i);
-            double* ptr1 = (double*)(data_.get() + i + offset);
-            __m256d data0 = _mm256_loadu_pd(ptr0);
-            __m256d data1 = _mm256_loadu_pd(ptr1);
-            _mm256_storeu_pd(ptr1, data0);
-            _mm256_storeu_pd(ptr0, data1);
-        }
-#else
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-            size_t i1 = i+1;
-            std::swap(data_[i], data_[i+offset]);
-            std::swap(data_[i1], data_[i1+offset]);
-        }
-#endif
-     }
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_y(pos_t pos){
-    const size_t offset = 1<<pos;
-    const size_t rsize = size_>>1;
-    const complex<real_t> im = imag_I;
-     if (pos == 0){ //single step
-#ifdef USE_SIMD
-        __m256d minus_half = _mm256_set_pd(1, -1, -1, 1);
-#pragma omp parallel for
-         for(omp_i j = 0;j < size_;j+=2){
-             double* ptr = (double*)(data_.get() + j);
-            __m256d data = _mm256_loadu_pd(ptr);
-            data = _mm256_permute4x64_pd(data, 27);
-            data = _mm256_mul_pd(data, minus_half);
-            _mm256_storeu_pd(ptr, data);
-         }
-#else
-#pragma omp parallel for
-        for(omp_i j = 0;j < size_;j+=2){
-            complex<real_t> temp = data_[j];
-            data_[j] = -im*data_[j+1];
-            data_[j+1] = im*temp;
-        }
-#endif
-     }
-     else{
-#ifdef USE_SIMD
-        __m256d minus_even = _mm256_set_pd(1, -1, 1, -1);
-        __m256d minus_odd = _mm256_set_pd(-1, 1, -1, 1);
-
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-
-            double* ptr0 = (double*)(data_.get() + i);
-            double* ptr1 = (double*)(data_.get() + i + offset);
-            __m256d data0 = _mm256_loadu_pd(ptr0);
-            __m256d data1 = _mm256_loadu_pd(ptr1);
-            data0 = _mm256_permute_pd(data0, 5);
-            data1 = _mm256_permute_pd(data1, 5);
-            data0 = _mm256_mul_pd(data0, minus_even);
-            data1 = _mm256_mul_pd(data1, minus_odd);
-            _mm256_storeu_pd(ptr1, data0);
-            _mm256_storeu_pd(ptr0, data1);
-        }
-#else
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-            size_t i1 = i+1;
-            complex<real_t> temp = data_[i];
-            data_[i] = -im*data_[i+offset];
-            data_[i+offset] = im*temp;
-            complex<real_t> temp1 = data_[i1];
-            data_[i1] = -im*data_[i1+offset];
-            data_[i1+offset] = im*temp1;
-        }
-#endif
-     }
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_z(pos_t pos){
-    const size_t offset = 1<<pos;
-    const size_t rsize = size_>>1;
-    if (pos == 0){ //single step
-#pragma omp parallel for
-        for(omp_i j = 1;j < size_;j+=2){
-            data_[j] *= -1;
-        }
-     }
-     else{
-#ifdef USE_SIMD
-        __m256d minus_one = _mm256_set_pd(-1, -1, -1, -1);
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-            double* ptr1 = (double*)(data_.get() + i + offset);
-            __m256d data1 = _mm256_loadu_pd(ptr1);
-            data1 = _mm256_mul_pd(data1, minus_one);
-            _mm256_storeu_pd(ptr1, data1);
-        }
-#else
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-            data_[i+offset] *= -1;
-            data_[i+offset+1] *= -1;
-        }
-#endif
-     }
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_h(pos_t pos){
-    const double sqrt2inv = 1. / std::sqrt(2.);
-    complex<double> mat[4] = {sqrt2inv, sqrt2inv, sqrt2inv, -sqrt2inv};
-    apply_one_targe_gate_real<0>(vector<pos_t>{pos}, mat);
-
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_s(pos_t pos){
-    complex<double> mat[2] = {1., imag_I};
-    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_sdag(pos_t pos){
-    complex<double> mat[2] = {1., -imag_I};
-    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
-
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_t(pos_t pos){
-    complex<double> p = imag_I*PI/4.;
-    complex<double> mat[2] = {1., std::exp(p)};
-   apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
-
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_tdag(pos_t pos){
-    complex<double> p = -imag_I*PI/4.;
-    complex<double> mat[2] = {1., std::exp(p)};
-    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
-
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_p(pos_t pos, real_t phase){
-    complex<double> p = imag_I*phase;
-    complex<double> mat[2] = {1., std::exp(p)};
-    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
-}
-
-
-template <class real_t>
-void StateVector<real_t>::apply_rx(pos_t pos, real_t theta){
-    complex<double> mat[4] = {std::cos(theta/2), -imag_I*std::sin(theta/2), -imag_I*std::sin(theta/2), std::cos(theta/2)};
-    apply_one_targe_gate_general<0>(vector<pos_t>{pos}, mat);
-}
-
-
-template <class real_t>
-void StateVector<real_t>::apply_ry(pos_t pos, real_t theta){
-    complex<double> mat[4] = {std::cos(theta/2), -std::sin(theta/2),std::sin(theta/2), std::cos(theta/2)};
-    apply_one_targe_gate_real<0>(vector<pos_t>{pos}, mat);
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_rz(pos_t pos, real_t theta){
-    complex<double> z0 = -imag_I*theta/2.;
-    complex<double> z1 = imag_I*theta/2.;
-    complex<double> mat[2] = {std::exp(z0), std::exp(z1)};
-    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_cnot(pos_t control, pos_t targe){
-    apply_one_targe_gate_x<1>(vector<pos_t>{control, targe});
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_cz(pos_t control, pos_t targe){
-    complex<double> mat[2] = {1., -1.};
-    apply_one_targe_gate_diag<1>(vector<pos_t>{control, targe}, mat);
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_cp(pos_t control, pos_t targe, real_t phase){
-    complex<double> p = imag_I*phase;
-    complex<double> mat[2] = {1., std::exp(p)};
-    apply_one_targe_gate_diag<1>(vector<pos_t>{control, targe}, mat);
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_crx(pos_t control, pos_t targe,  real_t theta){
-    complex<double> mat[4] = {std::cos(theta/2), -imag_I*std::sin(theta/2), -imag_I*std::sin(theta/2), std::cos(theta/2)};
-
-    apply_one_targe_gate_general<1>(vector<pos_t>{control, targe}, mat);
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_cry(pos_t control, pos_t targe,  real_t theta){
-     complex<double> mat[4] = {std::cos(theta/2), -std::sin(theta/2),std::sin(theta/2), std::cos(theta/2)};
-
-    apply_one_targe_gate_real<1>(vector<pos_t>{control, targe}, mat);
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_ccx(pos_t control1, pos_t control2, pos_t targe){
-    apply_one_targe_gate_x<2>(vector<pos_t>{control1, control2, targe});
-}
-
-/////// General implementation /////////
-
-template <class real_t>
-template <int ctrl_num>
-void StateVector<real_t>::apply_one_targe_gate_general(vector<pos_t> const& posv, complex<double> *mat)
-{
-    std::function<size_t(size_t)> getind_func_near;
-    std::function<size_t(size_t)> getind_func;
-    size_t rsize;
-    size_t offset;
-    size_t targe;
-    size_t control = 0;
-    size_t setbit;
-    size_t poffset;
-    bool has_control=false;
-    vector<pos_t> posv_sorted = posv;
-    if (ctrl_num == 0){
-        targe = posv[0];
-        offset = 1ll<<targe;
-        rsize = size_>>1;
-        getind_func_near = [&](size_t j)-> size_t {
-            return 2*j;
-        };
-
-        getind_func = [&](size_t j)-> size_t {
-            return (j&(offset-1)) | (j>>targe<<targe<<1);
-        };
-
-    }
-    else if(ctrl_num == 1){
-        has_control = true;
-        control = posv[0];
-        targe = posv[1];
-        offset = 1ll<<targe;
-        setbit = 1ll<<control;
-        if (control>targe) {
-            control--;
-        }
-        poffset=1ll<<control;
-        rsize = size_>>2;
-        getind_func = [&](size_t j) -> size_t {
-            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
-            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
-            return i;
-        };
-
-        getind_func_near = getind_func;
-
-
-    }
-    else if(ctrl_num == 2){
-        has_control = true;
-        control = *min_element(posv.begin(), posv.end()-1);
-        targe = *(posv.end()-1);
-        offset = 1ll<<targe;
-        sort(posv_sorted.begin(), posv_sorted.end());
-        rsize = size_>>posv.size();
-        getind_func = [&](size_t j)-> size_t{
-            size_t i = j;
-            for (size_t k=0;k < posv.size();k++)
-            {
-                size_t _pos = posv_sorted[k];
-                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-            }
-            for (size_t k=0;k < posv.size()-1;k++){
-                i |= 1ll<<posv[k];
-            }
-            return i;
-        };
-        getind_func_near = getind_func;
-    }
-
-    const complex<real_t> mat00 = mat[0];
-    const complex<real_t> mat01 = mat[1];
-    const complex<real_t> mat10 = mat[2];
-    const complex<real_t> mat11 = mat[3];
-    if (targe == 0){
-#pragma omp parallel for
-            for(omp_i j = 0;j < rsize;j++){
-                size_t i = getind_func_near(j);
-                complex<real_t> temp = data_[i];
-                data_[i] = mat00*data_[i] + mat01*data_[i+1];
-                data_[i+1] = mat10*temp + mat11*data_[i+1];
-            }
-    }else if (has_control && control == 0){ //single step
-#pragma omp parallel for
-            for(omp_i j = 0;j < rsize;j++){
-                size_t i = getind_func(j);
-                complex<real_t> temp = data_[i];
-                data_[i] = mat00*data_[i] + mat01*data_[i+offset];
-                data_[i+offset] = mat10*temp + mat11*data_[i+offset];
-            }
-
-    }else{//unroll to 2
-#ifdef USE_SIMD
-    __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(),  mat[0].real());
-    __m256d m_00im = _mm256_set_pd(mat[0].imag(),  -mat[0].imag(),  mat[0].imag(),  -mat[0].imag());
-    __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(),  mat[1].real(), mat[1].real());
-    __m256d m_01im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(),  mat[1].imag(), -mat[1].imag());
-
-    __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(), mat[2].real());
-    __m256d m_10im = _mm256_set_pd(mat[2].imag(),  -mat[2].imag(),mat[2].imag(), -mat[2].imag());
-    __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(), mat[3].real());
-    __m256d m_11im = _mm256_set_pd(mat[3].imag(), -mat[3].imag(), mat[3].imag(),  -mat[3].imag());
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize; j+= 2){
-            size_t i = getind_func(j);
-
-            double* p0 = (double*)(data_.get()+i);
-            double* p1 = (double*)(data_.get()+i+offset);
-            //load data
-            __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0
-            __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1
-            __m256d data0_p = _mm256_permute_pd(data0, 5);
-            __m256d data1_p = _mm256_permute_pd(data1, 5);
-
-             //row0
-            __m256d temp00re = _mm256_mul_pd(m_00re, data0);
-            __m256d temp00im = _mm256_mul_pd(m_00im, data0_p);
-            __m256d temp00 = _mm256_add_pd(temp00re, temp00im);
-            __m256d temp01re = _mm256_mul_pd(m_01re, data1);
-            __m256d temp01im = _mm256_mul_pd(m_01im, data1_p);
-            __m256d temp01 = _mm256_add_pd(temp01re, temp01im);
-            __m256d temp0 = _mm256_add_pd(temp00, temp01);
-
-            //row1
-            __m256d temp10re = _mm256_mul_pd(m_10re, data0);
-            __m256d temp10im = _mm256_mul_pd(m_10im, data0_p);
-            __m256d temp10 = _mm256_add_pd(temp10re, temp10im);
-            __m256d temp11re = _mm256_mul_pd(m_11re, data1);
-            __m256d temp11im = _mm256_mul_pd(m_11im, data1_p);
-            __m256d temp11 = _mm256_add_pd(temp11re, temp11im);
-            __m256d temp1 = _mm256_add_pd(temp10, temp11);
-
-            _mm256_storeu_pd(p0, temp0);
-            _mm256_storeu_pd(p1, temp1);
-        }
-#else
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = getind_func(j);
-            size_t i1 = i+1;
-            complex<real_t> temp = data_[i];
-            complex<real_t> temp1 = data_[i1];
-            data_[i] = mat00*data_[i] + mat01*data_[i+offset];
-            data_[i+offset] = mat10*temp + mat11*data_[i+offset];
-            data_[i1] = mat00*data_[i1] + mat01*data_[i1+offset];
-            data_[i1+offset] = mat10*temp1 + mat11*data_[i1+offset];
-        }
-#endif
-    }
-}
-
-
-template <class real_t>
-template <int ctrl_num>
-void StateVector<real_t>::apply_one_targe_gate_x(vector<pos_t> const& posv)
-{
-    std::function<size_t(size_t)> getind_func_near;
-    std::function<size_t(size_t)> getind_func;
-    size_t rsize;
-    size_t offset;
-    size_t targe;
-    size_t control;
-    size_t setbit;
-    size_t poffset;
-    vector<pos_t> posv_sorted = posv;
-    bool has_control=false;
-    if (ctrl_num == 0){
-        targe = posv[0];
-        offset = 1ll<<targe;
-        rsize = size_>>1;
-        getind_func_near = [&](size_t j)-> size_t {
-            return 2*j;
-        };
-
-        getind_func = [&](size_t j)-> size_t {
-            return (j&(offset-1)) | (j>>targe<<targe<<1);
-        };
-
-    }
-    else if(ctrl_num == 1){
-        has_control = true;
-        control = posv[0];
-        targe = posv[1];
-        offset = 1ll<<targe;
-        setbit = 1ll<<control;
-        if (control>targe) {
-            control--;
-        }
-        poffset=1ll<<control;
-        rsize = size_>>2;
-        getind_func = [&](size_t j) -> size_t {
-            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
-            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
-            return i;
-        };
-        getind_func_near = getind_func;
-    }
-    else if(ctrl_num == 2){
-        has_control = true;
-        control = *min_element(posv.begin(), posv.end()-1);
-        targe = *(posv.end()-1);
-        offset = 1ll<<targe;
-        sort(posv_sorted.begin(), posv_sorted.end());
-        rsize = size_>>posv.size();
-
-        getind_func = [&](size_t j) -> size_t{
-            size_t i = j;
-            for (size_t k=0;k < posv.size();k++){
-                size_t _pos = posv_sorted[k];
-                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-            }
-            for (size_t k=0;k < posv.size()-1;k++){
-                i |= 1ll<<posv[k];
-            }
-            return i;
-        };
-        getind_func_near = getind_func;
-    }
-
-    if (targe == 0){
-#ifdef USE_SIMD
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j++){
-            size_t i = getind_func_near(j);
-            double* ptr = (double*)(data_.get() + i);
-            __m256d data = _mm256_loadu_pd(ptr);
-            data = _mm256_permute4x64_pd(data, 78);
-            _mm256_storeu_pd(ptr, data);
-        }
-#else
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j++){
-            size_t i = getind_func(j);
-            std::swap(data_[i], data_[i+1]);
-        }
-#endif
-    }else if (has_control && control == 0){ //single step
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j++){
-            size_t i = getind_func(j);
-            std::swap(data_[i], data_[i+offset]);
-        }
-
-    }else{//unroll to 2
-#ifdef USE_SIMD
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize; j+= 2){
-            size_t i = getind_func(j);
-            double* ptr0 = (double*)(data_.get() + i);
-            double* ptr1 = (double*)(data_.get() + i + offset);
-            __m256d data0 = _mm256_loadu_pd(ptr0);
-            __m256d data1 = _mm256_loadu_pd(ptr1);
-            _mm256_storeu_pd(ptr1, data0);
-            _mm256_storeu_pd(ptr0, data1);
-        }
-#else
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = getind_func(j);
-            size_t i1 = i+1;
-            std::swap(data_[i], data_[i+offset]);
-            std::swap(data_[i1], data_[i1+offset]);
-        }
-#endif
-    }
-}
-
-template <class real_t>
-template <int ctrl_num>
-void StateVector<real_t>::apply_one_targe_gate_real(vector<pos_t> const& posv, complex<double> *mat)
-{
-    std::function<size_t(size_t)> getind_func_near;
-    std::function<size_t(size_t)> getind_func;
-    size_t rsize;
-    size_t offset;
-    size_t targe;
-    size_t control = 0;
-    size_t setbit;
-    size_t poffset;
-    bool has_control=false;
-    vector<pos_t> posv_sorted = posv;
-    if (ctrl_num == 0){
-        targe = posv[0];
-        offset = 1ll<<targe;
-        rsize = size_>>1;
-        getind_func_near = [&](size_t j)-> size_t {
-            return 2*j;
-        };
-
-        getind_func = [&](size_t j)-> size_t {
-            return (j&(offset-1)) | (j>>targe<<targe<<1);
-        };
-
-    }
-    else if(ctrl_num == 1){
-        has_control = true;
-        control = posv[0];
-        targe = posv[1];
-        offset = 1ll<<targe;
-        setbit = 1ll<<control;
-        if (control>targe) {
-            control--;
-        }
-        poffset=1ll<<control;
-        rsize = size_>>2;
-        getind_func = [&](size_t j) -> size_t {
-            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
-            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
-            return i;
-        };
-
-        getind_func_near = getind_func;
-    }
-    else if(ctrl_num == 2){
-        has_control = true;
-        control = *min_element(posv.begin(), posv.end()-1);
-        targe = *(posv.end()-1);
-        offset = 1ll<<targe;
-        sort(posv_sorted.begin(), posv_sorted.end());
-        rsize = size_>>posv.size();
-        getind_func = [&](size_t j)-> size_t{
-            size_t i = j;
-            for (size_t k=0;k < posv.size();k++){
-                size_t _pos = posv_sorted[k];
-                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-            }
-            for (size_t k=0;k < posv.size()-1;k++){
-                i |= 1ll<<posv[k];
-            }
-            return i;
-        };
-        getind_func_near = getind_func;
-    }
-
-    const double mat00 = mat[0].real();
-    const double mat01 = mat[1].real();
-    const double mat10 = mat[2].real();
-    const double mat11 = mat[3].real();
-    if (targe == 0){
-#pragma omp parallel for
-            for(omp_i j = 0;j < rsize;j++){
-                size_t i = getind_func_near(j);
-                complex<real_t> temp = data_[i];
-                data_[i] = mat00*data_[i] + mat01*data_[i+1];
-                data_[i+1] = mat10*temp + mat11*data_[i+1];
-            }
-
-    }else if (has_control && control == 0){ //single step
-
-#pragma omp parallel for
-            for(omp_i j = 0;j < rsize;j++){
-                size_t i = getind_func(j);
-                complex<real_t> temp = data_[i];
-                data_[i] = mat00*data_[i] + mat01*data_[i+offset];
-                data_[i+offset] = mat10*temp + mat11*data_[i+offset];
-            }
-    }else{//unroll to 2
-#ifdef USE_SIMD
-    __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(),  mat[0].real());
-    __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(),  mat[1].real(), mat[1].real());
-    __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(), mat[2].real());
-    __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(), mat[3].real());
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize; j+= 2){
-            size_t i = getind_func(j);
-
-            double* p0 = (double*)(data_.get()+i);
-            double* p1 = (double*)(data_.get()+i+offset);
-             //load data
-            __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0
-            __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1
-            __m256d data0_p = _mm256_permute_pd(data0, 5);
-            __m256d data1_p = _mm256_permute_pd(data1, 5);
-
-                //row0
-            __m256d temp00re = _mm256_mul_pd(m_00re, data0);
-            __m256d temp01re = _mm256_mul_pd(m_01re, data1);
-            __m256d temp0 = _mm256_add_pd(temp00re, temp01re);
-
-            //row1
-            __m256d temp10re = _mm256_mul_pd(m_10re, data0);
-            __m256d temp11re = _mm256_mul_pd(m_11re, data1);
-            __m256d temp1 = _mm256_add_pd(temp10re, temp11re);
-
-            _mm256_storeu_pd(p0, temp0);
-            _mm256_storeu_pd(p1, temp1);
-        }
-#else
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = getind_func(j);
-            size_t i1 = i+1;
-            complex<real_t> temp = data_[i];
-            complex<real_t> temp1 = data_[i1];
-            data_[i] = mat00*data_[i] + mat01*data_[i+offset];
-            data_[i+offset] = mat10*temp + mat11*data_[i+offset];
-            data_[i1] = mat00*data_[i1] + mat01*data_[i1+offset];
-            data_[i1+offset] = mat10*temp1 + mat11*data_[i1+offset];
-        }
-#endif
-    }
-}
-
-
-template <class real_t>
-template <int ctrl_num>
-void StateVector<real_t>::apply_one_targe_gate_diag(vector<pos_t> const& posv, complex<double> *mat)
-{
-    std::function<size_t(size_t)> getind_func_near;
-    std::function<size_t(size_t)> getind_func;
-    size_t rsize;
-    size_t offset;
-    size_t targe;
-    size_t control = 0;
-    size_t setbit;
-    size_t poffset;
-    bool has_control=false;
-    vector<pos_t> posv_sorted = posv;
-    if (ctrl_num == 0){
-        targe = posv[0];
-        offset = 1ll<<targe;
-        rsize = size_>>1;
-        getind_func_near = [&](size_t j)-> size_t {
-            return 2*j;
-        };
-
-        getind_func = [&](size_t j)-> size_t {
-            return (j&(offset-1)) | (j>>targe<<targe<<1);
-        };
-
-    }
-    else if(ctrl_num == 1){
-
-        has_control = true;
-        control = posv[0];
-        targe = posv[1];
-        offset = 1ll<<targe;
-        setbit = 1ll<<control;
-        if (control>targe) {
-            control--;
-        }
-        poffset=1ll<<control;
-        rsize = size_>>2;
-        getind_func = [&](size_t j) -> size_t {
-            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
-            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
-            return i;
-        };
-
-        getind_func_near = getind_func;
-
-    }
-    else if(ctrl_num == 2){
-        has_control = true;
-        control = *min_element(posv.begin(), posv.end()-1);
-        targe = *(posv.end()-1);
-        offset = 1ll<<targe;
-        sort(posv_sorted.begin(), posv_sorted.end());
-        rsize = size_>>posv.size();
-        getind_func = [&](size_t j)-> size_t
-        {
-            size_t i = j;
-            for (size_t k=0;k < posv.size();k++){
-                size_t _pos = posv_sorted[k];
-                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-            }
-            for (size_t k=0;k < posv.size()-1;k++){
-                i |= 1ll<<posv[k];
-            }
-            return i;
-        };
-        getind_func_near = getind_func;
-    }
-
-    if (targe == 0){
-#pragma omp parallel for
-            for(omp_i j = 0;j < rsize;j++){
-                size_t i = getind_func_near(j);
-                data_[i] *= mat[0];
-                data_[i+1] *= mat[1];
-            }
-
-    }else if (has_control && control == 0){ //single step
-
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j++){
-            size_t i = getind_func(j);
-            complex<real_t> temp = data_[i];
-            data_[i] *= mat[0];
-            data_[i+offset] *= mat[1];
-        }
-
-    }else{//unroll to 2
-#ifdef USE_SIMD
-     __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(),  mat[0].real());
-    __m256d m_00im = _mm256_set_pd(mat[0].imag(),  -mat[0].imag(),  mat[0].imag(),  -mat[0].imag());
-    __m256d m_11re = _mm256_set_pd(mat[1].real(), mat[1].real(),  mat[1].real(), mat[1].real());
-    __m256d m_11im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(),  mat[1].imag(), -mat[1].imag());
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize; j+= 2){
-            size_t i = getind_func(j);
-
-            double* p0 = (double*)(data_.get()+i);
-            double* p1 = (double*)(data_.get()+i+offset);
-
-            //load data
-            __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0
-            __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1
-            __m256d data0_p = _mm256_permute_pd(data0, 5);
-            __m256d data1_p = _mm256_permute_pd(data1, 5);
-
-             //row0
-            __m256d temp00re = _mm256_mul_pd(m_00re, data0);
-            __m256d temp00im = _mm256_mul_pd(m_00im, data0_p);
-            __m256d temp00 = _mm256_add_pd(temp00re, temp00im);
-
-            //row1
-            __m256d temp11re = _mm256_mul_pd(m_11re, data1);
-            __m256d temp11im = _mm256_mul_pd(m_11im, data1_p);
-            __m256d temp11 = _mm256_add_pd(temp11re, temp11im);
-
-            _mm256_storeu_pd(p0, temp00);
-            _mm256_storeu_pd(p1, temp11);
-        }
-#else
-#pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = getind_func(j);
-            size_t i1 = i+1;
-            data_[i] *= mat[0];
-            data_[i+offset] *= mat[1];
-            data_[i1] *= mat[0];
-            data_[i1+offset] *= mat[1];
-        }
-#endif
-    }
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_multi_targe_gate_general(vector<pos_t> const& posv, uint control_num, RowMatrixXcd const& mat)
-{
-    auto posv_sorted = posv;
-    auto targs = vector<pos_t>(posv.begin()+control_num, posv.end());
-    sort(posv_sorted.begin(), posv_sorted.end());
-    size_t rsize = size_ >> posv.size();
-    uint targe_num = targs.size();
-    size_t matsize= 1<< targe_num;
-    std::vector<uint> targ_mask(matsize);
-    //create target mask
-    for (size_t m = 0; m < matsize;m++){
-        for (size_t j = 0; j < targe_num; j++){
-            if ((m>>j)&1){
-                auto mask_pos = targs[j];
-                targ_mask[m] |= 1ll<<mask_pos;
-            }
-        }
-    }
-
-    //apply matrix
-//TODO: Disalbe Parallel when matsize is very large
-#pragma omp parallel for
-    for (omp_i j = 0;j < rsize;j++){
-        size_t i = j;
-        // Insert zeros
-        for(size_t k=0;k < posv.size();k++){
-            size_t _pos = posv_sorted[k];
-            i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-        }
-        // Set control
-        for (size_t k=0; k < control_num;k++){
-            i |= 1ll<<posv[k];
-        }
-
-        //load block vector
-        Eigen::VectorXcd vec_block(matsize);
-        for (size_t m = 0; m < matsize;m++){
-            vec_block(m) = data_[i | targ_mask[m]];
-            auto ele = vec_block(m);
-        }
-
-        //Eigen matrix multiply
-        vec_block = mat * vec_block;
-
-        //write back
-        for (size_t m = 0; m < matsize;m++){
-            data_[i | targ_mask[m]] = vec_block(m);
-        }
-    }
-}
-
-
-uint index0(vector<pos_t> const& qubits_sorted, const uint k) {
-    uint lowbits, retval = k;
-    for (size_t j = 0; j < qubits_sorted.size(); j++) {
-        lowbits = retval & ((1LL << qubits_sorted[j]) -1);
-        retval >>= qubits_sorted[j];
-        retval <<= qubits_sorted[j] + 1;
-        retval |= lowbits;
-    }
-    return retval;
-}
-
-using indexes_t = vector<uint>;
-inline indexes_t indexes(vector<pos_t> const& qbits, vector<pos_t> const& qubits_sorted, const uint k) {
-  const auto N = qubits_sorted.size();
-  indexes_t ret(1LL << N, 0);
-  // Get index0
-  ret[0] = index0(qubits_sorted, k);
-  for (size_t i = 0; i < N; i++) {
-    const auto n = 1LL << i;
-    const auto bit = 1ll << qbits[i];
-    for (size_t j = 0; j < n; j++)
-      ret[n + j] = ret[j] | bit;
-  }
-  return ret;
-}
-
-template <class real_t>
-vector<double> StateVector<real_t>::probabilities() const {
-    const int len = 1LL << num_;
-    vector<double> probs(len, 0.);
-#pragma omp parallel for
-    for (int j = 0; j < len; j++) {
-        probs[j] = std::real(data_[j] * std::conj(data_[j]));
-    }
-    return probs;
-}
-
-vector<std::complex<double>> convert(const vector<std::complex<double>> &v){
-    vector<std::complex<double>> ret(v.size(), 0.);
-    for (size_t i = 0; i< v.size(); ++i)
-        ret[i] = v[i];
-    return ret;
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_diagonal_matrix(vector<pos_t> const&qbits, vector<std::complex<double> > const&diag){
-    // just one qubit
-    if(qbits.size() == 1){
-        const uint qubit = qbits[0];
-        vector<pos_t> qbit0{qubit};
-        if (diag[0] == 1.0) { // [[1, 0], [0, z]] matrix
-            if (diag[1] == 1.0)
-                return; // Identity
-            if (diag[1] == std::complex<double>(0., -1.)) { // [[1, 0], [0, -i]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    const auto k = inds[1];
-                    double cache = data_[k].imag();
-                    data_[k].imag(data_[k].real() * -1.);
-                    data_[k].real(cache);
-                };
-#pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }
-                return;
-            }
-            if (diag[1] == std::complex<double>(0., 1.)) {
-            // [[1, 0], [0, i]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    const auto k = inds[1];
-                    double cache = data_[k].imag();
-                    data_[k].imag(data_[k].real());
-                    data_[k].real(cache * -1.);
-                };
-#pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }
-                return;
-            }
-            if (diag[0] == 0.0) {
-            // [[1, 0], [0, 0]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    data_[inds[1]] = 0.0;
-                };
-#pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }
-                return;
-            }
-            // general [[1, 0], [0, z]]
-            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
-                const auto k = inds[1];
-                data_[k] *= _mat[1];
-            };
-#pragma omp parallel for
-            for (int k = 0; k < (size_ >> 1); k+=1){
-                const auto inds = indexes(qbit0, qbit0, k);
-                func(inds, convert(diag));
-            }
-            return;
-        } else if (diag[1] == 1.0) {
-            // [[z, 0], [0, 1]] matrix
-            if (diag[0] == std::complex<double>(0., -1.)) {
-                // [[-i, 0], [0, 1]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    const auto k = inds[1];
-                    double cache = data_[k].imag();
-                    data_[k].imag(data_[k].real() * -1.);
-                    data_[k].real(cache);
-                };
-#pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }
-                return;
-            }
-            if (diag[0] == std::complex<double>(0., 1.)) {
-                // [[i, 0], [0, 1]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    const auto k = inds[1];
-                    double cache = data_[k].imag();
-                    data_[k].imag(data_[k].real());
-                    data_[k].real(cache * -1.);
-                };
-#pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }
-                return;
-            }
-            if (diag[0] == 0.0) {
-                // [[0, 0], [0, 1]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    data_[inds[0]] = 0.0;
-                };
-#pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }                
-                return;
-            }
-            // general [[z, 0], [0, 1]]
-            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
-                const auto k = inds[0];
-                data_[k] *= _mat[0];
-            };
-#pragma omp parallel for
-            for (int k = 0; k < (size_ >> 1); k+=1){
-                const auto inds = indexes(qbit0, qbit0, k);
-                func(inds, convert(diag));
-            }   
-            return;
-        } else {
-            // Lambda function for diagonal matrix multiplication
-            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
-                const auto k0 = inds[0];
-                const auto k1 = inds[1];
-                data_[k0] *= _mat[0];
-                data_[k1] *= _mat[1];
-            };
-#pragma omp parallel for
-            for (int k = 0; k < (size_ >> 1); k+=1){
-                const auto inds = indexes(qbit0, qbit0, k);
-                func(inds, convert(diag));
-            }  
-        }
-        return;
-    }
-    const uint N = qbits.size();
-    auto func = [&](const indexes_t &inds, const vector<std::complex<double>> &_diag) -> void { 
-        for(int i=0; i<2; ++i){
-            const int k = inds[i];
-            int iv = 0;
-            for(int j=0;j<N;j++){
-                if((k & (1ULL << qbits[j])) !=0 )
-                    iv += (1ULL << j);
-            }
-            if(_diag[iv] != (double)1.0)
-                data_[k] *= _diag[iv];
-        }
-    };
-    // apply func
-    vector<pos_t> qbit0{qbits[0]};
-#pragma omp parallel for
-    for (int k = 0; k < (size_ >> 1); k+=1){
-        const auto inds = indexes(qbit0, qbit0, k);
-        func(inds, convert(diag));
-    }
-}
-
-template <class real_t>
-void StateVector<real_t>::update(vector<pos_t> const& qbits, const uint final_state, const uint meas_state, const double meas_prob){
-    const uint dim = 1ULL << qbits.size();
-    vector<std::complex<double> >  matdiag(dim, 0.);
-    matdiag[meas_state] = 1./ std::sqrt(meas_prob);
-    apply_diagonal_matrix(qbits, matdiag);
-    
-    //TODO: Add reset
-    // for reset
-     if(final_state != meas_state){
-        if(qbits.size() == 1){
-            // apply a x gate
-            apply_x(qbits[0]);
-        }else{
-            // Diagonal matrix for projecting and renormalizing to measurement outcome
-            vector<std::complex<double> > perm(dim*dim, 0.);
-            perm[final_state * dim + meas_state] = 1.;
-            perm[meas_state * dim + final_state] = 1.;
-            for (uint j = 0; j < dim; j++){
-                if(j != final_state && j != meas_state) 
-                    perm[j * dim + j] = 1.;
-            }
-            // apply permutation to swap state
-            const uint N = qbits.size();
-            const uint DIM = 1ULL << N;
-            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
-                // std::array<std::complex<double>, 1ULL << N > cache;
-                vector<std::complex<double> > cache(1ULL << N, 0.);
-                for(uint i = 0; i< DIM; i++){
-                    const auto ii = inds[i];
-                    cache[i] = data_[ii];
-                    data_[ii] = 0.;
-                }
-                for(uint i=0; i< DIM; i++)
-                    for(uint j=0; j<DIM; j++)
-                        data_[inds[i]] += _mat[i + DIM * j] * cache[j];
-            };
-            vector<pos_t> qs(qbits.begin(), qbits.end());
-            vector<pos_t> qs_sorted(qs.begin(), qs.end());
-            std::sort(qs_sorted.begin(), qs_sorted.end());
-            uint END = size_ >> qs.size();
-#pragma omp parallel for
-            for (int k = 0; k < END; k+=1){
-                const auto inds = indexes(qs, qs_sorted, k);
-                func(inds, convert(perm));
-            }  
-        }
-    }
-}
-
-template <typename T>
-void printVector(const std::vector<T>& vec) {
-    for (const T& element : vec) {
-        std::cout << element << " ";
-    }
-    std::cout << std::endl;
-}
-
-template <class real_t>
-std::pair<uint, double> StateVector<real_t>::sample_measure_probs(vector<pos_t> const& qbits){
-    // 1. caculate actual measurement outcome
-    const int64_t N = qbits.size();
-    const int64_t DIM = 1LL << N;
-    const int64_t END = 1LL << (num_ - N);
-    vector<double> probs(DIM, 0.);
-    vector<uint> qubits_sorted(qbits.begin(), qbits.end());
-    
-    std::sort(qubits_sorted.begin(), qubits_sorted.end());
-    if ((num_ == N) && ( qubits_sorted == qbits )){
-        probs = probabilities();
-    }else{
-        vector<double> probs_private(DIM, 0.);
-#pragma omp parallel for
-        for (int64_t k = 0; k < END; k++){
-            auto idx = indexes(qbits, qubits_sorted, k);
-            // std::cout<<"indexes"<<k<<": ";
-            // printVector(idx);
-            for(int64_t m = 0; m < DIM; ++m){
-                double local_prob = std::real(data_[idx[m]] * std::conj(data_[idx[m]]));
-                #pragma omp critical
-                probs_private[m] += local_prob;
-            }
-        }
-        // std::cout<<"probs_private:";
-        // printVector(probs_private);
-        #pragma omp critical
-        for(int64_t m = 0; m < DIM; ++m){
-            probs[m] += probs_private[m];
-        }
-    }
-    set_rng();
-    // std::cout<<"probs:";
-    // printVector(probs);
-    uint outcome = std::discrete_distribution<uint>(probs.begin(), probs.end())(rng_);
-    return std::make_pair(outcome, probs[outcome]);
-}
-
-// change to bit endian
-vector<uint> int2vec(uint n, uint base){
-    vector<uint> ret;
-    while(n >= base){
-        ret.push_back(n%base);
-        n /= base;
-    }
-    ret.push_back(n);
-    return ret;
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_measure(vector<pos_t> const& qbits, vector<pos_t> const& cbits){
-    // 1. caculate actual measurement outcome
-    const auto meas = sample_measure_probs(qbits);
-    //2. update statevector
-    update(qbits, meas.first, meas.first, meas.second);
-    //3. store measure
-    vector<uint> outcome = int2vec(meas.first, 2);
-    if(outcome.size() < qbits.size()){
-        outcome.resize(qbits.size());
-    }
-    for(uint j=0; j < outcome.size(); j++){
-        creg_[cbits[j]] = outcome[j];
-    }
-}
-
-template <class real_t>
-void StateVector<real_t>::apply_reset(vector<pos_t> const& qbits){
-    const auto meas = sample_measure_probs(qbits);
-    update(qbits, 0, meas.first, meas.second);
-}
\ No newline at end of file
diff --git a/src/qfvm/types.hpp b/src/qfvm/types.hpp
deleted file mode 100644
index 5ece0f7..0000000
--- a/src/qfvm/types.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-#pragma once
-
-#include <complex>
-#include <vector>
-#include <string>
-#include <iostream>
-#include <tuple>
-#include <Eigen/Core>
-
-#ifdef _MSC_VER
-using omp_i = signed long long;
-#else
-using omp_i = size_t;
-#endif
-
-#ifdef _MSC_VER
-#else
-#ifndef __AVX2__
-#undef USE_SIMD
-#endif
-#endif
-
-
-typedef unsigned int uint;
-using pos_t = uint;
-using data_t = double;
-using std::complex;
-using std::vector;
-using std::string;
-using RowMatrixXcd = Eigen::Matrix<complex<double>, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-
-const complex<double> imag_I = complex<double>(0, 1.);
-const double PI = 3.14159265358979323846;
diff --git a/src/qfvm/util.h b/src/qfvm/util.h
deleted file mode 100644
index 950b48e..0000000
--- a/src/qfvm/util.h
+++ /dev/null
@@ -1,99 +0,0 @@
-#pragma once
-
-#include <iostream>
-#include <vector>
-#include <time.h>
-#include <regex>
-#include <type_traits>
-#include <bitset>
-
-namespace Qfutil{
-
-std::vector<int> randomArr(size_t length, size_t max){
-    srand((unsigned)time(NULL));
-    std::vector<int> arr(length);
-    for (size_t i=0;i < arr.size();i++){
-        arr[i] = rand()%max;
-    }
-    return arr;
-}
-
-int randomint(int min, int max){
-    srand((unsigned)time(NULL));
-    return (rand()%(max - min + 1)) + min;
-}
-
-static uint32_t randomize(uint32_t i){
-    i = (i^61)^(i>>16);
-    i *=9;
-    i ^= i<<4;
-    i *= 0x27d4eb2d;
-    i ^= i >> 15;
-    return i;
-}
-
-template<class T>
-void printVector(std::vector<T> const &arr){
-    for (auto i : arr){
-        std::cout << i << " ";
-    }
-    std::cout << std::endl;
-}
-
-
-std::vector<std::string> split_string(const std::string& str, char delim) {
-    std::vector<std::string> elems;
-    auto lastPos = str.find_first_not_of(delim, 0);
-    auto pos = str.find_first_of(delim, lastPos);
-    while (pos != std::string::npos || lastPos != std::string::npos) {
-        elems.push_back(str.substr(lastPos, pos - lastPos));
-        lastPos = str.find_first_not_of(delim, pos);
-        pos = str.find_first_of(delim, lastPos);
-    }
-    return elems;
-}
-
-std::vector<std::string> split_string(const std::string& str, char delim, uint num){
-    auto end = str.length();
-    std::vector<std::string> elems;
-    auto lastPos = str.find_first_not_of(delim, 0);
-    auto pos = str.find_first_of(delim, lastPos);
-    while ((pos != std::string::npos || lastPos != std::string::npos) && elems.size() < num) {
-        elems.push_back(str.substr(lastPos, pos - lastPos));
-        lastPos = str.find_first_not_of(delim, pos);
-        pos = str.find_first_of(delim, lastPos);
-    }
-
-    if ((pos != std::string::npos || lastPos != std::string::npos)){
-        elems.push_back(str.substr(lastPos, end));
-    }
-    return elems;
-}
-
-template<class real_t>
-std::vector<real_t> find_numbers(const std::string &str){
-    std::smatch matchs;
-    std::vector<real_t> res;
-    std::regex pattern;
-    if (std::is_unsigned<real_t>::value){
-        pattern = std::regex("\\d+");
-    }else if(std::is_floating_point<real_t>::value){
-        pattern = std::regex("-?(([1-9]\\d*\\.\\d*)|(0\\.\\d*[1-9]\\d*))|\\d+");
-    }
-
-    auto begin = std::sregex_iterator(str.begin(), str.end(), pattern);
-    const std::sregex_iterator end;
-
-    for (std::sregex_iterator i = begin; i != end; ++i) {
-        std::string match_str = i->str();
-         if (std::is_unsigned<real_t>::value){
-            res.push_back(std::stoi(match_str));
-        }else if (std::is_floating_point<real_t>::value){
-            res.push_back(std::stod(match_str));
-        }
-    }
-
-    return res;
-}
-
-}//namespace Qfutil
\ No newline at end of file
diff --git a/src/qfvm_gpu/apply_gate_custate.cuh b/src/qfvm_gpu/apply_gate_custate.cuh
deleted file mode 100644
index 2d9c2e7..0000000
--- a/src/qfvm_gpu/apply_gate_custate.cuh
+++ /dev/null
@@ -1,46 +0,0 @@
-
-#pragma once
-#include <custatevec.h>
-#include <helper_custatevec.hpp>
-
-
-void apply_gate_custate(cuDoubleComplex *psi_d, QuantumOperator &op, int n)
-{   
-         
-    //get information form op
-    auto pos = op.positions();
-    const int nTargets  = op.targe_num();
-    const int nControls = op.control_num();
-    const int adjoint   = 0;
-
-    vector<int> targets{pos.begin()+nControls, pos.end()};
-    vector<int> controls{pos.begin(), pos.begin()+nControls};
-
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = 
-    reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-
-    // custatevec handle initialization
-    custatevecHandle_t handle;
-    custatevecCreate(&handle) ;
-    void* extraWorkspace = nullptr;
-    size_t extraWorkspaceSizeInBytes = 0;
-
-    // check the size of external workspace
-    custatevecApplyMatrixGetWorkspaceSize(
-                handle, CUDA_C_64F, n, mat, CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW,
-                adjoint, nTargets, nControls, CUSTATEVEC_COMPUTE_64F, &extraWorkspaceSizeInBytes) ;
-
-    // allocate external workspace if necessary
-    if (extraWorkspaceSizeInBytes > 0)
-        cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes);
-
-    custatevecApplyMatrix(
-                    handle, psi_d, CUDA_C_64F, n, mat, CUDA_C_64F,
-                    CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, targets.data(), nTargets, controls.data(), nullptr,
-                    nControls, CUSTATEVEC_COMPUTE_64F, extraWorkspace, extraWorkspaceSizeInBytes);
-
-                      // destroy handle
-    custatevecDestroy(handle);    
-}
-    
\ No newline at end of file
diff --git a/src/qfvm_gpu/apply_gate_gpu.cuh b/src/qfvm_gpu/apply_gate_gpu.cuh
deleted file mode 100644
index e2d7be0..0000000
--- a/src/qfvm_gpu/apply_gate_gpu.cuh
+++ /dev/null
@@ -1,411 +0,0 @@
-#pragma once
-#include <cuda_runtime.h>
-#include <helper_cuda.h>
-#include <types.hpp>
-#include <algorithm>
-#include <util.h>
-
-struct targeIndex
-{
-    size_t ind0;
-    size_t ind1;
-};
-
-
-__constant__ uint posv_d[50];
-__constant__ uint posv_sorted_d[50];
-__constant__ cuDoubleComplex mat_d_const[32*32]; //If target qubit < 5, use const memory; 
-__constant__ uint mat_mask_d_const[32];
-
-
-//-------------Single-target gate-------------------------------
-template<class Func>
-__global__ void apply_one_targe_gate_kernel(cuDoubleComplex *psi_d, Func get_index, int rsize){
-    cuDoubleComplex mat00 = mat_d_const[0];
-    cuDoubleComplex mat01 = mat_d_const[1];
-    cuDoubleComplex mat10 = mat_d_const[2];
-    cuDoubleComplex mat11 = mat_d_const[3];
-
-    unsigned int gridSize = blockDim.x*gridDim.x;
-    for (int j = blockDim.x * blockIdx.x + threadIdx.x; j < rsize;j+= gridSize){
-        targeIndex ind = get_index(j);
-        cuDoubleComplex temp = psi_d[ind.ind0];
-        psi_d[ind.ind0] = cuCadd(cuCmul(mat00, psi_d[ind.ind0]), cuCmul(mat01, psi_d[ind.ind1]));
-        psi_d[ind.ind1] = cuCadd(cuCmul(mat10, temp), cuCmul(mat11, psi_d[ind.ind1]));
-    }
-}
-
-template<int ctrl_num>
-void apply_one_targe_gate_gpu(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){ 
-    //copy mat to device
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = 
-    reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-    checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, 4*sizeof(cuDoubleComplex)));
-    size_t rsize;
-    size_t offset;
-    size_t targe;
-    size_t control;
-    size_t setbit;
-    size_t poffset;
-    if (ctrl_num == 0){
-        targe = op.positions()[0];
-        offset = 1ll<<targe;
-        rsize = size>>1;
-        auto getind_func = [offset, targe] __device__ (size_t j)-> targeIndex {
-            size_t ind0 = (j&(offset-1)) | (j>>targe<<targe<<1);
-            size_t ind1 = ind0 + offset;
-            return {ind0, ind1};
-        };
-
-        size_t blockdim = rsize <= 1024 ? rsize : 1024;
-        size_t griddim = rsize / blockdim;
-        apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d,  getind_func, rsize);
-    }
-    else if(ctrl_num == 1){
-        control = op.positions()[0];
-        targe = op.positions()[1];
-        offset = 1ll<<targe;
-        setbit = 1ll<<control;
-        if (control>targe) {
-            control--;
-        }
-        poffset=1ll<<control;
-        rsize = size>>2;
-        auto getind_func = [control, targe, poffset, offset, setbit] __device__ (size_t j) -> targeIndex {
-            size_t ind0 = (j>>control<<(control+1))|(j&(poffset-1));
-            ind0 = (ind0 >> targe << (targe+1))|(ind0 &(offset-1))|setbit;
-            size_t ind1 = ind0 + offset;
-            return {ind0, ind1};
-        };
-        
-
-        size_t blockdim = rsize <= 1024 ? rsize : 1024;
-        size_t griddim = rsize / blockdim;
-       
-        apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d,  getind_func, rsize);      
-    }
-    else if(ctrl_num == 2){    
-        targe = op.positions().back();
-        offset = 1ll<<targe;
-        uint psize = op.positions().size();
-        rsize = size>>psize;
-
-        vector<pos_t> posv_sorted = op.positions();
-        std::sort(posv_sorted.begin(), posv_sorted.end());
-        //Copy pos to device
-        checkCudaErrors(cudaMemcpyToSymbol(posv_d, op.positions().data(), psize*sizeof(uint)));
-        checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
-
-        auto getind_func = [offset, psize] __device__ (size_t j)-> targeIndex{
-            size_t ind0 = j;
-            for (pos_t k=0;k < psize;k++)
-            {   
-                pos_t _pos = posv_sorted_d[k];
-                ind0 = (ind0&((1ll<<_pos)-1)) | (ind0>>_pos<<_pos<<1);
-            }
-            for (pos_t k=0;k < psize-1;k++){
-                ind0 |= 1ll<<posv_d[k];
-            }
-
-            size_t ind1 = ind0 + offset;
-            return {ind0, ind1};
-        };
-        size_t blockdim = rsize <= 1024 ? rsize : 1024;
-        size_t griddim = rsize / blockdim;
-        apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d, getind_func, rsize);
-    } 
-}
-
-template<int targe_num>
-__global__ void apply_2to4_targe_gate_kernel(cuDoubleComplex *psi_d,  uint ctrlnum, int psize){
-    constexpr uint matlen = 1<<targe_num;
-    uint block_length = blockDim.x;
-    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
-    // Insert zeros
-    for(size_t k=0;k < psize;k++){
-        size_t _pos = posv_sorted_d[k];
-        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-    }
-    // Set control
-    for (size_t k=0; k < ctrlnum;k++){
-        i |= 1ll<<posv_d[k];
-    }
-
-    cuDoubleComplex psi_d_buffer[matlen];
-    for (int y = 0; y < matlen;++y){
-        psi_d_buffer[y] = {0., 0.};
-        for (int x = 0; x < matlen;++x){
-            psi_d_buffer[y] = cuCadd(psi_d_buffer[y], cuCmul(psi_d[i | mat_mask_d_const[x]], mat_d_const[y*matlen+x]));
-        }
-    }
-    for (int y = 0; y < matlen;++y) psi_d[i | mat_mask_d_const[y]] = psi_d_buffer[y];
-}
-
-template<int targe_num>
-void apply_2to4_targe_gate_gpu_const(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){
-    // uint targe_num = op.targe_num();
-    uint matlen = 1<<targe_num;
-    auto pos = op.positions();
-    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
-    vector<uint> targ_mask(matlen);
-    //create target mask
-    for (size_t m = 0; m < matlen;m++){
-        for (size_t j = 0; j < targe_num; j++){
-            if ((m>>j)&1){
-                auto mask_pos = targs[j];
-                targ_mask[m] |= 1ll<<mask_pos;
-            }
-        }
-    }
-
-    vector<pos_t> posv_sorted = op.positions();
-    uint psize = pos.size();
-    std::sort(posv_sorted.begin(),posv_sorted.end());
-    //Copy pos to device
-    checkCudaErrors(cudaMemcpyToSymbol(posv_d,  pos.data(), psize*sizeof(uint)));
-    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
-
-    //copy mat to const memory
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-
-    checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, matlen*matlen*sizeof(cuDoubleComplex)));
-    checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(), matlen*sizeof(uint)));
-    size_t rsize = size>>psize;
-    
-    uint max_thread_num = targe_num < 4 ? 1024 : 512;
-    size_t blockdim = rsize <= max_thread_num ? rsize : max_thread_num;
-    size_t griddim = rsize / blockdim;
-    apply_2to4_targe_gate_kernel<targe_num><<<griddim, blockdim>>>(psi_d, op.control_num(), psize);
-}
-
-
-
-// ------------Large target number gate---------------
-
-__global__ void apply_5_targe_gate_kernel_const(cuDoubleComplex *psi_d, uint ctrlnum, int psize, size_t size){
-    uint rsize = size>>psize;
-    uint targnum = psize-ctrlnum;
-    uint matlen = (1<<targnum);
-    uint block_length = blockDim.x;
-    size_t b = blockIdx.x; // < rsize
-    int idx = threadIdx.x;// 
-    int idy = threadIdx.y;// 
-    size_t i = b;
-    // Insert zeros
-    for(size_t k=0;k < psize;k++){
-        size_t _pos = posv_sorted_d[k];
-        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-    }
-    // Set control
-    for (size_t k=0; k < ctrlnum;k++){
-        i |= 1ll<<posv_d[k];
-    }
-
-    __syncthreads();
-    cuDoubleComplex v;
-    v = cuCmul(psi_d[ i | mat_mask_d_const[idx]], mat_d_const[idy*matlen+idx]);
-    for (int offset = block_length>>1;offset > 0;offset >>=1){
-        v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
-        v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
-    }
-    __syncthreads(); 
-    if (!idx) psi_d[ i | mat_mask_d_const[idy]] = v;
-}
-
-
-void apply_5_targe_gate_gpu_const(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){
-    uint targe_num = op.targe_num();
-    uint matlen = 1<<targe_num;
-    auto pos = op.positions();
-    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
-    vector<uint> targ_mask(matlen);
-    //create target mask
-    for (size_t m = 0; m < matlen;m++){
-        for (size_t j = 0; j < targe_num; j++){
-            if ((m>>j)&1){
-                auto mask_pos = targs[j];
-                targ_mask[m] |= 1ll<<mask_pos;
-            }
-        }
-    }
-
-    vector<pos_t> posv_sorted = op.positions();
-    uint psize = pos.size();
-    std::sort(posv_sorted.begin(),posv_sorted.end());
-    //Copy pos to device
-    checkCudaErrors(cudaMemcpyToSymbol(posv_d,  pos.data(), psize*sizeof(uint)));
-    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
-
-    //copy mat to const memory
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-
-    checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, matlen*matlen*sizeof(cuDoubleComplex)));
-    checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(), matlen*sizeof(uint)));
-    size_t rsize = size>>psize;
-    uint thread_num = matlen > 32 ? 32 : matlen;
-    dim3 blockdim = dim3(thread_num, thread_num);
-    apply_5_targe_gate_kernel_const<<<rsize, blockdim, thread_num*sizeof(cuDoubleComplex)>>>(psi_d, op.control_num(), psize, size);
-}
-
-
-//For target number 6-10
-__global__ void apply_multi_targe_gate_kernel_shared(cuDoubleComplex *psi_d, uint ctrlnum, cuDoubleComplex *mat_d, uint *mat_mask_d, int psize, size_t size){
-    
-    uint rsize = size>>psize;
-    uint targnum = psize-ctrlnum;
-    uint matlen = (1<<targnum);
-    uint block_length = blockDim.x;
-    size_t b = blockIdx.x; // < rsize
-    int idx = threadIdx.x;// 
-    int idy = threadIdx.y;// 
-    size_t i = b;
-    // Insert zeros
-    for(size_t k=0;k < psize;k++){
-        size_t _pos = posv_sorted_d[k];
-        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-    }
-    // Set control
-    for (size_t k=0; k < ctrlnum;k++){
-        i |= 1ll<<posv_d[k];
-    }
-
-    __syncthreads();
-    __shared__ cuDoubleComplex local_sum[1024];
-    cuDoubleComplex v;
-    for (int y = idy; y < matlen;y+=blockDim.y){
-        local_sum[y] = {0, 0};
-        for (int x = idx; x < matlen;x+=blockDim.x){
-            v = cuCmul(psi_d[ i | mat_mask_d[x]], mat_d[y*matlen+x]);    
-            __syncthreads(); 
-            for (int offset = block_length>>1;offset > 0;offset >>=1){
-                v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
-                v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
-            }
-            __syncthreads();
-            if (!idx) local_sum[y] = cuCadd(local_sum[y], v);
-        }
-    }
-
-    for (int y = idy; y < matlen;y+=blockDim.y){
-    if (!idx) psi_d[ i | mat_mask_d[y]] = local_sum[y];
-    }
-}
-
-
-void apply_multi_targe_gate_gpu_shared(cuDoubleComplex *psi_d, QuantumOperator &op, cuDoubleComplex *mat_d, uint *mat_mask_d, size_t size){
-    uint targe_num = op.targe_num();
-    uint matlen = 1<<targe_num;
-    auto pos = op.positions();
-    uint psize = pos.size();
-    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
-    vector<uint> targ_mask(matlen);
-    //create target mask
-    for (size_t m = 0; m < matlen;m++){
-        for (size_t j = 0; j < targe_num; j++){
-            if ((m>>j)&1){
-                auto mask_pos = targs[j];
-                targ_mask[m] |= 1ll<<mask_pos;
-            }
-        }
-    }
-
-    vector<pos_t> posv_sorted = pos;
-    std::sort(posv_sorted.begin(),posv_sorted.end());
-    //Copy pos to device
-    checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint)));
-    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
-
-     
-    //copy mat to global memory
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-    checkCudaErrors(cudaMemcpy(mat_d, mat, matlen*matlen*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
-    checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(), matlen*sizeof(uint), cudaMemcpyHostToDevice));
-
-    size_t rsize = size>>psize;
-    uint thread_num = matlen > 32 ? 32 : matlen;
-    dim3 blockdim = dim3(thread_num, thread_num);
-
-    apply_multi_targe_gate_kernel_shared<<<rsize, blockdim>>>(psi_d, op.control_num(), mat_d, mat_mask_d, psize, size);
-}
-
-//For target number > 10
-__global__ void apply_multi_targe_gate_kernel_global(cuDoubleComplex *psi_d, cuDoubleComplex *psi_d_copy, uint ctrlnum, cuDoubleComplex *mat_d, uint *mat_mask_d, int psize, size_t size){
-    uint rsize = size>>psize;
-    uint targnum = psize-ctrlnum;
-    uint matlen = (1<<targnum);
-    uint block_length = blockDim.x;
-    size_t b = blockIdx.x; // < rsize
-    int idx = threadIdx.x;// 
-    int idy = threadIdx.y;// 
-    size_t i = b;
-    // Insert zeros
-    for(size_t k=0;k < psize;k++){
-        size_t _pos = posv_sorted_d[k];
-        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-    }
-    // Set control
-    for (size_t k=0; k < ctrlnum;k++){
-        i |= 1ll<<posv_d[k];
-    }
-
-    __syncthreads();
-
-    cuDoubleComplex v;
-    cuDoubleComplex v_sum;
-    for (int y = idy; y < matlen;y+=blockDim.y){
-        v_sum = {0, 0};
-        for (int x = idx; x < matlen;x+=blockDim.x){
-            v = cuCmul(psi_d_copy[ i | mat_mask_d[x]], mat_d[y*matlen+x]);    
-            __syncthreads(); 
-            for (int offset = block_length>>1;offset > 0;offset >>=1){
-                v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
-                v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
-            }
-            __syncthreads();
-            if (!idx) v_sum = cuCadd(v_sum, v);
-        }
-        if (!idx) psi_d[ i | mat_mask_d[y]] = v_sum;
-    }
-}
-
-void apply_multi_targe_gate_gpu_global(cuDoubleComplex *psi_d, cuDoubleComplex *psi_d_copy, QuantumOperator &op, cuDoubleComplex *mat_d, uint *mat_mask_d, size_t size){
-    uint targe_num = op.targe_num();
-    uint matlen = 1<<targe_num;
-    auto pos = op.positions();
-    uint psize = pos.size();
-    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
-    vector<uint> targ_mask(matlen);
-    //create target mask
-    for (size_t m = 0; m < matlen;m++){
-        for (size_t j = 0; j < targe_num; j++){
-            if ((m>>j)&1){
-                auto mask_pos = targs[j];
-                targ_mask[m] |= 1ll<<mask_pos;
-            }
-        }
-    }
-
-    vector<pos_t> posv_sorted = pos;
-    std::sort(posv_sorted.begin(),posv_sorted.end());
-    //Copy pos to device
-    checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint)));
-    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
-
-     
-    //copy mat to global memory
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-    checkCudaErrors(cudaMemcpy(mat_d, mat, matlen*matlen*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
-    checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(), matlen*sizeof(uint), cudaMemcpyHostToDevice));
-
-    size_t rsize = size>>psize;
-    uint thread_num = matlen > 32 ? 32 : matlen;
-    dim3 blockdim = dim3(thread_num, thread_num);
-
-    apply_multi_targe_gate_kernel_global<<<rsize, blockdim>>>(psi_d, psi_d_copy, op.control_num(), mat_d, mat_mask_d, psize, size);
-    checkCudaErrors(cudaMemcpy(psi_d_copy, psi_d, size*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice));
-}
\ No newline at end of file
diff --git a/src/qfvm_gpu/cuda_simulator.cuh b/src/qfvm_gpu/cuda_simulator.cuh
deleted file mode 100644
index 99ee5e4..0000000
--- a/src/qfvm_gpu/cuda_simulator.cuh
+++ /dev/null
@@ -1,91 +0,0 @@
-#pragma once
-#include "cuda_statevector.cuh"
-#include <circuit.hpp>
-#include <types.hpp>
-#include <statevector.hpp>
-#include <ticktock.h>
-#include "apply_gate_gpu.cuh"
-
-void simulate_gpu(Circuit & circuit, CudaStateVector & psi_d){
-    size_t size = psi_d.size();
-    //initialize mat
-    cuDoubleComplex *mat_d;
-    uint *mat_mask_d;
-    CudaStateVector psi_d_copy{};
-    if (circuit.max_targe_num() > 5)
-    {
-        uint max_matlen = 1<<circuit.max_targe_num();
-        checkCudaErrors(cudaMalloc(&mat_d, (max_matlen*max_matlen)*sizeof(cuDoubleComplex)));
-        checkCudaErrors(cudaMalloc(&mat_mask_d, max_matlen*sizeof(cuDoubleComplex)));
-        if (circuit.max_targe_num() > 10){
-            psi_d_copy = psi_d;
-        }
-    }
-
-    //apply_gate
-    for (auto gate : circuit.gates()){
-        uint targnum = gate.targe_num();
-        uint ctrlnum = gate.control_num();
-
-        if (targnum  == 1){
-            if (ctrlnum == 0){ 
-                apply_one_targe_gate_gpu<0>(psi_d.data(), gate, size);
-                }
-            else if (ctrlnum == 1){
-                apply_one_targe_gate_gpu<1>(psi_d.data(), gate, size);
-                }
-            else{
-                 apply_one_targe_gate_gpu<2>(psi_d.data(), gate, size);
-                }
-        }
-        else if(targnum > 1){
-            if (targnum == 2){
-                apply_2to4_targe_gate_gpu_const<2>(psi_d.data(), gate, size);
-            }
-            else if(targnum == 3){
-                apply_2to4_targe_gate_gpu_const<3>(psi_d.data(), gate, size);
-            }
-            else if(targnum == 4){
-                apply_2to4_targe_gate_gpu_const<4>(psi_d.data(), gate, size);
-            }
-            else if (targnum == 5){
-                apply_5_targe_gate_gpu_const(psi_d.data(), gate, size);
-            }else if (targnum > 5 && targnum <= 10){
-                apply_multi_targe_gate_gpu_shared(psi_d.data(), gate, mat_d, mat_mask_d, size);
-            }
-            else{
-                apply_multi_targe_gate_gpu_global(psi_d.data(), psi_d_copy.data(), gate, mat_d, mat_mask_d, size);
-            }
-        }   
-        else{
-            throw "Invalid target number";
-        }
-    }
-
-    //free source
-    if (circuit.max_targe_num() > 5){
-        checkCudaErrors(cudaFree(mat_d));
-        checkCudaErrors(cudaFree(mat_mask_d));
-    }
-}
-
-void simulate_gpu(Circuit & circuit, StateVector<data_t> & state){
-    //initialize psi
-    state.set_num(circuit.qubit_num());
-    size_t size = state.size();
-    CudaStateVector psi_d(state);
-    
-    simulate_gpu(circuit, psi_d);
-    cudaDeviceSynchronize();
-
-    //copy back
-    complex<double>* psi = reinterpret_cast<complex<double>*>(psi_d.data());
-    checkCudaErrors(cudaMemcpy(state.data(), psi, size*sizeof(complex<double>), cudaMemcpyDeviceToHost));
-    psi=nullptr;
-}   
-
-StateVector<double> simulate_gpu(Circuit & circuit){
-    StateVector<double> state(circuit.qubit_num());
-    simulate_gpu(circuit, state);
-    return std::move(state);
-}
\ No newline at end of file
diff --git a/src/qfvm_gpu/cuda_statevector.cuh b/src/qfvm_gpu/cuda_statevector.cuh
deleted file mode 100644
index 19e8cb4..0000000
--- a/src/qfvm_gpu/cuda_statevector.cuh
+++ /dev/null
@@ -1,58 +0,0 @@
-
-#pragma once
-#include <cuda_runtime.h>
-#include <helper_cuda.h>
-#include <cuComplex.h>
-#include <statevector.hpp>
-
-class CudaStateVector
-{
-    protected:
-        uint num_;
-        size_t size_;
-        cuDoubleComplex *data_;
-
-    public:
-        //construct function
-        CudaStateVector(){checkCudaErrors(cudaMalloc(&data_, 0));}
-        CudaStateVector(CudaStateVector const &other);
-       
-        explicit CudaStateVector(StateVector<double> &sv);
-        ~CudaStateVector() {
-            checkCudaErrors(cudaFree(data_));
-        }
-
-        CudaStateVector &operator=(CudaStateVector const &other)
-        {   
-            num_ = other.num();
-            size_  = other.size();
-            checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex)));
-            checkCudaErrors(cudaMemcpy(data_, other.data(), size_*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice));
-            return *this;
-        }
-
-        cuDoubleComplex* data() const { return data_;}
-        size_t size() const { return size_; }
-        uint num() const { return num_; }
-};
-
-
-CudaStateVector::CudaStateVector(StateVector<double> &sv)
-:
-num_(sv.num()),
-size_(sv.size())
-{   
-    cuDoubleComplex *psi_h = reinterpret_cast<cuDoubleComplex*>(sv.data());
-    checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex)));
-    checkCudaErrors(cudaMemcpy(data_, psi_h, size_*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
-}
-
-CudaStateVector::CudaStateVector(CudaStateVector const &other)
-:
-num_(other.num()), 
-size_(other.size())
-{
-    checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex)));
-    checkCudaErrors(cudaMemcpy(data_, other.data(), size_*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice));
-}
-
diff --git a/src/qfvm_gpu/cuda_utils/CudaTexture.h b/src/qfvm_gpu/cuda_utils/CudaTexture.h
deleted file mode 100644
index 0661d0d..0000000
--- a/src/qfvm_gpu/cuda_utils/CudaTexture.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#pragma once
-
-
-#include "helper_cuda.h"
-#include <cuda_runtime.h>
-
-
-struct CudaTexture {
-    cudaTextureObject_t tex;
-
-    CudaTexture(CudaTexture const &) = delete;
-    CudaTexture(CudaTexture &&) = default;
-    CudaTexture &operator=(CudaTexture const &) = delete;
-    CudaTexture &operator=(CudaTexture &&) = default;
-
-    template <class T>
-    CudaTexture(T *dataDev, int width, int height) {
-        cudaTextureObject_t tex;
-        cudaResourceDesc resDesc;
-        memset(&resDesc, 0, sizeof(resDesc));
-        resDesc.resType = cudaResourceTypePitch2D;
-        resDesc.res.pitch2D.devPtr = dataDev;
-        resDesc.res.pitch2D.width = width;
-        resDesc.res.pitch2D.height = height;
-        resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
-        resDesc.res.pitch2D.pitchInBytes = width * sizeof(T);
-        cudaTextureDesc texDesc;
-        memset(&texDesc, 0, sizeof(texDesc));
-        checkCudaErrors(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
-    }
-
-    ~CudaTexture() {
-        checkCudaErrors(cudaDestroyTextureObject(tex));
-    }
-
-    constexpr operator cudaTextureObject_t() const {
-        return tex;
-    }
-};
diff --git a/src/qfvm_gpu/cuda_utils/helper_cuda.h b/src/qfvm_gpu/cuda_utils/helper_cuda.h
deleted file mode 100644
index 516dd57..0000000
--- a/src/qfvm_gpu/cuda_utils/helper_cuda.h
+++ /dev/null
@@ -1,953 +0,0 @@
-/**
- * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-////////////////////////////////////////////////////////////////////////////////
-// These are CUDA Helper functions for initialization and error checking
-
-#ifndef COMMON_HELPER_CUDA_H_
-#define COMMON_HELPER_CUDA_H_
-
-#pragma once
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "helper_string.h"
-
-#ifndef EXIT_WAIVED
-#define EXIT_WAIVED 2
-#endif
-
-// Note, it is required that your SDK sample to include the proper header
-// files, please refer the CUDA examples for examples of the needed CUDA
-// headers, which may change depending on which CUDA functions are used.
-
-// CUDA Runtime error messages
-#ifdef __DRIVER_TYPES_H__
-static const char *_cudaGetErrorEnum(cudaError_t error) {
-  return cudaGetErrorName(error);
-}
-#endif
-
-#ifdef CUDA_DRIVER_API
-// CUDA Driver API errors
-static const char *_cudaGetErrorEnum(CUresult error) {
-  static char unknown[] = "<unknown>";
-  const char *ret = NULL;
-  cuGetErrorName(error, &ret);
-  return ret ? ret : unknown;
-}
-#endif
-
-#ifdef CUBLAS_API_H_
-// cuBLAS API errors
-static const char *_cudaGetErrorEnum(cublasStatus_t error) {
-  switch (error) {
-    case CUBLAS_STATUS_SUCCESS:
-      return "CUBLAS_STATUS_SUCCESS";
-
-    case CUBLAS_STATUS_NOT_INITIALIZED:
-      return "CUBLAS_STATUS_NOT_INITIALIZED";
-
-    case CUBLAS_STATUS_ALLOC_FAILED:
-      return "CUBLAS_STATUS_ALLOC_FAILED";
-
-    case CUBLAS_STATUS_INVALID_VALUE:
-      return "CUBLAS_STATUS_INVALID_VALUE";
-
-    case CUBLAS_STATUS_ARCH_MISMATCH:
-      return "CUBLAS_STATUS_ARCH_MISMATCH";
-
-    case CUBLAS_STATUS_MAPPING_ERROR:
-      return "CUBLAS_STATUS_MAPPING_ERROR";
-
-    case CUBLAS_STATUS_EXECUTION_FAILED:
-      return "CUBLAS_STATUS_EXECUTION_FAILED";
-
-    case CUBLAS_STATUS_INTERNAL_ERROR:
-      return "CUBLAS_STATUS_INTERNAL_ERROR";
-
-    case CUBLAS_STATUS_NOT_SUPPORTED:
-      return "CUBLAS_STATUS_NOT_SUPPORTED";
-
-    case CUBLAS_STATUS_LICENSE_ERROR:
-      return "CUBLAS_STATUS_LICENSE_ERROR";
-  }
-
-  return "<unknown>";
-}
-#endif
-
-#ifdef _CUFFT_H_
-// cuFFT API errors
-static const char *_cudaGetErrorEnum(cufftResult error) {
-  switch (error) {
-    case CUFFT_SUCCESS:
-      return "CUFFT_SUCCESS";
-
-    case CUFFT_INVALID_PLAN:
-      return "CUFFT_INVALID_PLAN";
-
-    case CUFFT_ALLOC_FAILED:
-      return "CUFFT_ALLOC_FAILED";
-
-    case CUFFT_INVALID_TYPE:
-      return "CUFFT_INVALID_TYPE";
-
-    case CUFFT_INVALID_VALUE:
-      return "CUFFT_INVALID_VALUE";
-
-    case CUFFT_INTERNAL_ERROR:
-      return "CUFFT_INTERNAL_ERROR";
-
-    case CUFFT_EXEC_FAILED:
-      return "CUFFT_EXEC_FAILED";
-
-    case CUFFT_SETUP_FAILED:
-      return "CUFFT_SETUP_FAILED";
-
-    case CUFFT_INVALID_SIZE:
-      return "CUFFT_INVALID_SIZE";
-
-    case CUFFT_UNALIGNED_DATA:
-      return "CUFFT_UNALIGNED_DATA";
-
-    case CUFFT_INCOMPLETE_PARAMETER_LIST:
-      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
-
-    case CUFFT_INVALID_DEVICE:
-      return "CUFFT_INVALID_DEVICE";
-
-    case CUFFT_PARSE_ERROR:
-      return "CUFFT_PARSE_ERROR";
-
-    case CUFFT_NO_WORKSPACE:
-      return "CUFFT_NO_WORKSPACE";
-
-    case CUFFT_NOT_IMPLEMENTED:
-      return "CUFFT_NOT_IMPLEMENTED";
-
-    case CUFFT_LICENSE_ERROR:
-      return "CUFFT_LICENSE_ERROR";
-
-    case CUFFT_NOT_SUPPORTED:
-      return "CUFFT_NOT_SUPPORTED";
-  }
-
-  return "<unknown>";
-}
-#endif
-
-#ifdef CUSPARSEAPI
-// cuSPARSE API errors
-static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
-  switch (error) {
-    case CUSPARSE_STATUS_SUCCESS:
-      return "CUSPARSE_STATUS_SUCCESS";
-
-    case CUSPARSE_STATUS_NOT_INITIALIZED:
-      return "CUSPARSE_STATUS_NOT_INITIALIZED";
-
-    case CUSPARSE_STATUS_ALLOC_FAILED:
-      return "CUSPARSE_STATUS_ALLOC_FAILED";
-
-    case CUSPARSE_STATUS_INVALID_VALUE:
-      return "CUSPARSE_STATUS_INVALID_VALUE";
-
-    case CUSPARSE_STATUS_ARCH_MISMATCH:
-      return "CUSPARSE_STATUS_ARCH_MISMATCH";
-
-    case CUSPARSE_STATUS_MAPPING_ERROR:
-      return "CUSPARSE_STATUS_MAPPING_ERROR";
-
-    case CUSPARSE_STATUS_EXECUTION_FAILED:
-      return "CUSPARSE_STATUS_EXECUTION_FAILED";
-
-    case CUSPARSE_STATUS_INTERNAL_ERROR:
-      return "CUSPARSE_STATUS_INTERNAL_ERROR";
-
-    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
-      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
-  }
-
-  return "<unknown>";
-}
-#endif
-
-#ifdef CUSOLVER_COMMON_H_
-// cuSOLVER API errors
-static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
-  switch (error) {
-    case CUSOLVER_STATUS_SUCCESS:
-      return "CUSOLVER_STATUS_SUCCESS";
-    case CUSOLVER_STATUS_NOT_INITIALIZED:
-      return "CUSOLVER_STATUS_NOT_INITIALIZED";
-    case CUSOLVER_STATUS_ALLOC_FAILED:
-      return "CUSOLVER_STATUS_ALLOC_FAILED";
-    case CUSOLVER_STATUS_INVALID_VALUE:
-      return "CUSOLVER_STATUS_INVALID_VALUE";
-    case CUSOLVER_STATUS_ARCH_MISMATCH:
-      return "CUSOLVER_STATUS_ARCH_MISMATCH";
-    case CUSOLVER_STATUS_MAPPING_ERROR:
-      return "CUSOLVER_STATUS_MAPPING_ERROR";
-    case CUSOLVER_STATUS_EXECUTION_FAILED:
-      return "CUSOLVER_STATUS_EXECUTION_FAILED";
-    case CUSOLVER_STATUS_INTERNAL_ERROR:
-      return "CUSOLVER_STATUS_INTERNAL_ERROR";
-    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
-      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
-    case CUSOLVER_STATUS_NOT_SUPPORTED:
-      return "CUSOLVER_STATUS_NOT_SUPPORTED ";
-    case CUSOLVER_STATUS_ZERO_PIVOT:
-      return "CUSOLVER_STATUS_ZERO_PIVOT";
-    case CUSOLVER_STATUS_INVALID_LICENSE:
-      return "CUSOLVER_STATUS_INVALID_LICENSE";
-  }
-
-  return "<unknown>";
-}
-#endif
-
-#ifdef CURAND_H_
-// cuRAND API errors
-static const char *_cudaGetErrorEnum(curandStatus_t error) {
-  switch (error) {
-    case CURAND_STATUS_SUCCESS:
-      return "CURAND_STATUS_SUCCESS";
-
-    case CURAND_STATUS_VERSION_MISMATCH:
-      return "CURAND_STATUS_VERSION_MISMATCH";
-
-    case CURAND_STATUS_NOT_INITIALIZED:
-      return "CURAND_STATUS_NOT_INITIALIZED";
-
-    case CURAND_STATUS_ALLOCATION_FAILED:
-      return "CURAND_STATUS_ALLOCATION_FAILED";
-
-    case CURAND_STATUS_TYPE_ERROR:
-      return "CURAND_STATUS_TYPE_ERROR";
-
-    case CURAND_STATUS_OUT_OF_RANGE:
-      return "CURAND_STATUS_OUT_OF_RANGE";
-
-    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
-      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
-
-    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
-
-    case CURAND_STATUS_LAUNCH_FAILURE:
-      return "CURAND_STATUS_LAUNCH_FAILURE";
-
-    case CURAND_STATUS_PREEXISTING_FAILURE:
-      return "CURAND_STATUS_PREEXISTING_FAILURE";
-
-    case CURAND_STATUS_INITIALIZATION_FAILED:
-      return "CURAND_STATUS_INITIALIZATION_FAILED";
-
-    case CURAND_STATUS_ARCH_MISMATCH:
-      return "CURAND_STATUS_ARCH_MISMATCH";
-
-    case CURAND_STATUS_INTERNAL_ERROR:
-      return "CURAND_STATUS_INTERNAL_ERROR";
-  }
-
-  return "<unknown>";
-}
-#endif
-
-#ifdef NVJPEGAPI
-// nvJPEG API errors
-static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
-  switch (error) {
-    case NVJPEG_STATUS_SUCCESS:
-      return "NVJPEG_STATUS_SUCCESS";
-
-    case NVJPEG_STATUS_NOT_INITIALIZED:
-      return "NVJPEG_STATUS_NOT_INITIALIZED";
-
-    case NVJPEG_STATUS_INVALID_PARAMETER:
-      return "NVJPEG_STATUS_INVALID_PARAMETER";
-
-    case NVJPEG_STATUS_BAD_JPEG:
-      return "NVJPEG_STATUS_BAD_JPEG";
-
-    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
-      return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
-
-    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
-      return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
-
-    case NVJPEG_STATUS_EXECUTION_FAILED:
-      return "NVJPEG_STATUS_EXECUTION_FAILED";
-
-    case NVJPEG_STATUS_ARCH_MISMATCH:
-      return "NVJPEG_STATUS_ARCH_MISMATCH";
-
-    case NVJPEG_STATUS_INTERNAL_ERROR:
-      return "NVJPEG_STATUS_INTERNAL_ERROR";
-  }
-
-  return "<unknown>";
-}
-#endif
-
-#ifdef NV_NPPIDEFS_H
-// NPP API errors
-static const char *_cudaGetErrorEnum(NppStatus error) {
-  switch (error) {
-    case NPP_NOT_SUPPORTED_MODE_ERROR:
-      return "NPP_NOT_SUPPORTED_MODE_ERROR";
-
-    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
-      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
-
-    case NPP_RESIZE_NO_OPERATION_ERROR:
-      return "NPP_RESIZE_NO_OPERATION_ERROR";
-
-    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
-      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
-
-#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
-
-    case NPP_BAD_ARG_ERROR:
-      return "NPP_BAD_ARGUMENT_ERROR";
-
-    case NPP_COEFF_ERROR:
-      return "NPP_COEFFICIENT_ERROR";
-
-    case NPP_RECT_ERROR:
-      return "NPP_RECTANGLE_ERROR";
-
-    case NPP_QUAD_ERROR:
-      return "NPP_QUADRANGLE_ERROR";
-
-    case NPP_MEM_ALLOC_ERR:
-      return "NPP_MEMORY_ALLOCATION_ERROR";
-
-    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
-      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
-
-    case NPP_INVALID_INPUT:
-      return "NPP_INVALID_INPUT";
-
-    case NPP_POINTER_ERROR:
-      return "NPP_POINTER_ERROR";
-
-    case NPP_WARNING:
-      return "NPP_WARNING";
-
-    case NPP_ODD_ROI_WARNING:
-      return "NPP_ODD_ROI_WARNING";
-#else
-
-    // These are for CUDA 5.5 or higher
-    case NPP_BAD_ARGUMENT_ERROR:
-      return "NPP_BAD_ARGUMENT_ERROR";
-
-    case NPP_COEFFICIENT_ERROR:
-      return "NPP_COEFFICIENT_ERROR";
-
-    case NPP_RECTANGLE_ERROR:
-      return "NPP_RECTANGLE_ERROR";
-
-    case NPP_QUADRANGLE_ERROR:
-      return "NPP_QUADRANGLE_ERROR";
-
-    case NPP_MEMORY_ALLOCATION_ERR:
-      return "NPP_MEMORY_ALLOCATION_ERROR";
-
-    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
-      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
-
-    case NPP_INVALID_HOST_POINTER_ERROR:
-      return "NPP_INVALID_HOST_POINTER_ERROR";
-
-    case NPP_INVALID_DEVICE_POINTER_ERROR:
-      return "NPP_INVALID_DEVICE_POINTER_ERROR";
-#endif
-
-    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
-      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
-
-    case NPP_TEXTURE_BIND_ERROR:
-      return "NPP_TEXTURE_BIND_ERROR";
-
-    case NPP_WRONG_INTERSECTION_ROI_ERROR:
-      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
-
-    case NPP_NOT_EVEN_STEP_ERROR:
-      return "NPP_NOT_EVEN_STEP_ERROR";
-
-    case NPP_INTERPOLATION_ERROR:
-      return "NPP_INTERPOLATION_ERROR";
-
-    case NPP_RESIZE_FACTOR_ERROR:
-      return "NPP_RESIZE_FACTOR_ERROR";
-
-    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
-      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
-
-#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
-
-    case NPP_MEMFREE_ERR:
-      return "NPP_MEMFREE_ERR";
-
-    case NPP_MEMSET_ERR:
-      return "NPP_MEMSET_ERR";
-
-    case NPP_MEMCPY_ERR:
-      return "NPP_MEMCPY_ERROR";
-
-    case NPP_MIRROR_FLIP_ERR:
-      return "NPP_MIRROR_FLIP_ERR";
-#else
-
-    case NPP_MEMFREE_ERROR:
-      return "NPP_MEMFREE_ERROR";
-
-    case NPP_MEMSET_ERROR:
-      return "NPP_MEMSET_ERROR";
-
-    case NPP_MEMCPY_ERROR:
-      return "NPP_MEMCPY_ERROR";
-
-    case NPP_MIRROR_FLIP_ERROR:
-      return "NPP_MIRROR_FLIP_ERROR";
-#endif
-
-    case NPP_ALIGNMENT_ERROR:
-      return "NPP_ALIGNMENT_ERROR";
-
-    case NPP_STEP_ERROR:
-      return "NPP_STEP_ERROR";
-
-    case NPP_SIZE_ERROR:
-      return "NPP_SIZE_ERROR";
-
-    case NPP_NULL_POINTER_ERROR:
-      return "NPP_NULL_POINTER_ERROR";
-
-    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
-      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
-
-    case NPP_NOT_IMPLEMENTED_ERROR:
-      return "NPP_NOT_IMPLEMENTED_ERROR";
-
-    case NPP_ERROR:
-      return "NPP_ERROR";
-
-    case NPP_SUCCESS:
-      return "NPP_SUCCESS";
-
-    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
-      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
-
-    case NPP_MISALIGNED_DST_ROI_WARNING:
-      return "NPP_MISALIGNED_DST_ROI_WARNING";
-
-    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
-      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
-
-    case NPP_DOUBLE_SIZE_WARNING:
-      return "NPP_DOUBLE_SIZE_WARNING";
-
-    case NPP_WRONG_INTERSECTION_ROI_WARNING:
-      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
-
-#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
-    /* These are 6.0 or higher */
-    case NPP_LUT_PALETTE_BITSIZE_ERROR:
-      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
-
-    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
-      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
-
-    case NPP_QUALITY_INDEX_ERROR:
-      return "NPP_QUALITY_INDEX_ERROR";
-
-    case NPP_CHANNEL_ORDER_ERROR:
-      return "NPP_CHANNEL_ORDER_ERROR";
-
-    case NPP_ZERO_MASK_VALUE_ERROR:
-      return "NPP_ZERO_MASK_VALUE_ERROR";
-
-    case NPP_NUMBER_OF_CHANNELS_ERROR:
-      return "NPP_NUMBER_OF_CHANNELS_ERROR";
-
-    case NPP_COI_ERROR:
-      return "NPP_COI_ERROR";
-
-    case NPP_DIVISOR_ERROR:
-      return "NPP_DIVISOR_ERROR";
-
-    case NPP_CHANNEL_ERROR:
-      return "NPP_CHANNEL_ERROR";
-
-    case NPP_STRIDE_ERROR:
-      return "NPP_STRIDE_ERROR";
-
-    case NPP_ANCHOR_ERROR:
-      return "NPP_ANCHOR_ERROR";
-
-    case NPP_MASK_SIZE_ERROR:
-      return "NPP_MASK_SIZE_ERROR";
-
-    case NPP_MOMENT_00_ZERO_ERROR:
-      return "NPP_MOMENT_00_ZERO_ERROR";
-
-    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
-      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
-
-    case NPP_THRESHOLD_ERROR:
-      return "NPP_THRESHOLD_ERROR";
-
-    case NPP_CONTEXT_MATCH_ERROR:
-      return "NPP_CONTEXT_MATCH_ERROR";
-
-    case NPP_FFT_FLAG_ERROR:
-      return "NPP_FFT_FLAG_ERROR";
-
-    case NPP_FFT_ORDER_ERROR:
-      return "NPP_FFT_ORDER_ERROR";
-
-    case NPP_SCALE_RANGE_ERROR:
-      return "NPP_SCALE_RANGE_ERROR";
-
-    case NPP_DATA_TYPE_ERROR:
-      return "NPP_DATA_TYPE_ERROR";
-
-    case NPP_OUT_OFF_RANGE_ERROR:
-      return "NPP_OUT_OFF_RANGE_ERROR";
-
-    case NPP_DIVIDE_BY_ZERO_ERROR:
-      return "NPP_DIVIDE_BY_ZERO_ERROR";
-
-    case NPP_RANGE_ERROR:
-      return "NPP_RANGE_ERROR";
-
-    case NPP_NO_MEMORY_ERROR:
-      return "NPP_NO_MEMORY_ERROR";
-
-    case NPP_ERROR_RESERVED:
-      return "NPP_ERROR_RESERVED";
-
-    case NPP_NO_OPERATION_WARNING:
-      return "NPP_NO_OPERATION_WARNING";
-
-    case NPP_DIVIDE_BY_ZERO_WARNING:
-      return "NPP_DIVIDE_BY_ZERO_WARNING";
-#endif
-
-#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
-    /* These are 7.0 or higher */
-    case NPP_OVERFLOW_ERROR:
-      return "NPP_OVERFLOW_ERROR";
-
-    case NPP_CORRUPTED_DATA_ERROR:
-      return "NPP_CORRUPTED_DATA_ERROR";
-#endif
-  }
-
-  return "<unknown>";
-}
-#endif
-
-template <typename T>
-void check(T result, char const *const func, const char *const file,
-           int const line) {
-  if (result) {
-    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
-            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
-    exit(EXIT_FAILURE);
-  }
-}
-
-#ifdef __DRIVER_TYPES_H__
-// This will output the proper CUDA error strings in the event
-// that a CUDA host call returns an error
-#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
-
-// This will output the proper error string when calling cudaGetLastError
-#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
-
-inline void __getLastCudaError(const char *errorMessage, const char *file,
-                               const int line) {
-  cudaError_t err = cudaGetLastError();
-
-  if (cudaSuccess != err) {
-    fprintf(stderr,
-            "%s(%i) : getLastCudaError() CUDA error :"
-            " %s : (%d) %s.\n",
-            file, line, errorMessage, static_cast<int>(err),
-            cudaGetErrorString(err));
-    exit(EXIT_FAILURE);
-  }
-}
-
-// This will only print the proper error string when calling cudaGetLastError
-// but not exit program incase error detected.
-#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
-
-inline void __printLastCudaError(const char *errorMessage, const char *file,
-                                 const int line) {
-  cudaError_t err = cudaGetLastError();
-
-  if (cudaSuccess != err) {
-    fprintf(stderr,
-            "%s(%i) : getLastCudaError() CUDA error :"
-            " %s : (%d) %s.\n",
-            file, line, errorMessage, static_cast<int>(err),
-            cudaGetErrorString(err));
-  }
-}
-#endif
-
-#ifndef MAX
-#define MAX(a, b) (a > b ? a : b)
-#endif
-
-// Float To Int conversion
-inline int ftoi(float value) {
-  return (value >= 0 ? static_cast<int>(value + 0.5)
-                     : static_cast<int>(value - 0.5));
-}
-
-// Beginning of GPU Architecture definitions
-inline int _ConvertSMVer2Cores(int major, int minor) {
-  // Defines for GPU Architecture types (using the SM version to determine
-  // the # of cores per SM
-  typedef struct {
-    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
-    // and m = SM minor version
-    int Cores;
-  } sSMtoCores;
-
-  sSMtoCores nGpuArchCoresPerSM[] = {
-      {0x30, 192},
-      {0x32, 192},
-      {0x35, 192},
-      {0x37, 192},
-      {0x50, 128},
-      {0x52, 128},
-      {0x53, 128},
-      {0x60,  64},
-      {0x61, 128},
-      {0x62, 128},
-      {0x70,  64},
-      {0x72,  64},
-      {0x75,  64},
-      {0x80,  64},
-      {0x86, 128},
-      {0x87, 128},
-      {-1, -1}};
-
-  int index = 0;
-
-  while (nGpuArchCoresPerSM[index].SM != -1) {
-    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
-      return nGpuArchCoresPerSM[index].Cores;
-    }
-
-    index++;
-  }
-
-  // If we don't find the values, we default use the previous one
-  // to run properly
-  printf(
-      "MapSMtoCores for SM %d.%d is undefined."
-      "  Default to use %d Cores/SM\n",
-      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
-  return nGpuArchCoresPerSM[index - 1].Cores;
-}
-
-inline const char* _ConvertSMVer2ArchName(int major, int minor) {
-  // Defines for GPU Architecture types (using the SM version to determine
-  // the GPU Arch name)
-  typedef struct {
-    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
-    // and m = SM minor version
-    const char* name;
-  } sSMtoArchName;
-
-  sSMtoArchName nGpuArchNameSM[] = {
-      {0x30, "Kepler"},
-      {0x32, "Kepler"},
-      {0x35, "Kepler"},
-      {0x37, "Kepler"},
-      {0x50, "Maxwell"},
-      {0x52, "Maxwell"},
-      {0x53, "Maxwell"},
-      {0x60, "Pascal"},
-      {0x61, "Pascal"},
-      {0x62, "Pascal"},
-      {0x70, "Volta"},
-      {0x72, "Xavier"},
-      {0x75, "Turing"},
-      {0x80, "Ampere"},
-      {0x86, "Ampere"},
-      {0x87, "Ampere"},
-      {-1, "Graphics Device"}};
-
-  int index = 0;
-
-  while (nGpuArchNameSM[index].SM != -1) {
-    if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) {
-      return nGpuArchNameSM[index].name;
-    }
-
-    index++;
-  }
-
-  // If we don't find the values, we default use the previous one
-  // to run properly
-  printf(
-      "MapSMtoArchName for SM %d.%d is undefined."
-      "  Default to use %s\n",
-      major, minor, nGpuArchNameSM[index - 1].name);
-  return nGpuArchNameSM[index - 1].name;
-}
-  // end of GPU Architecture definitions
-
-#ifdef __CUDA_RUNTIME_H__
-// General GPU Device CUDA Initialization
-inline int gpuDeviceInit(int devID) {
-  int device_count;
-  checkCudaErrors(cudaGetDeviceCount(&device_count));
-
-  if (device_count == 0) {
-    fprintf(stderr,
-            "gpuDeviceInit() CUDA error: "
-            "no devices supporting CUDA.\n");
-    exit(EXIT_FAILURE);
-  }
-
-  if (devID < 0) {
-    devID = 0;
-  }
-
-  if (devID > device_count - 1) {
-    fprintf(stderr, "\n");
-    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
-            device_count);
-    fprintf(stderr,
-            ">> gpuDeviceInit (-device=%d) is not a valid"
-            " GPU device. <<\n",
-            devID);
-    fprintf(stderr, "\n");
-    return -devID;
-  }
-
-  int computeMode = -1, major = 0, minor = 0;
-  checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
-  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
-  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
-  if (computeMode == cudaComputeModeProhibited) {
-    fprintf(stderr,
-            "Error: device is running in <Compute Mode "
-            "Prohibited>, no threads can use cudaSetDevice().\n");
-    return -1;
-  }
-
-  if (major < 1) {
-    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
-    exit(EXIT_FAILURE);
-  }
-
-  checkCudaErrors(cudaSetDevice(devID));
-  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor));
-
-  return devID;
-}
-
-// This function returns the best GPU (with maximum GFLOPS)
-inline int gpuGetMaxGflopsDeviceId() {
-  int current_device = 0, sm_per_multiproc = 0;
-  int max_perf_device = 0;
-  int device_count = 0;
-  int devices_prohibited = 0;
-
-  uint64_t max_compute_perf = 0;
-  checkCudaErrors(cudaGetDeviceCount(&device_count));
-
-  if (device_count == 0) {
-    fprintf(stderr,
-            "gpuGetMaxGflopsDeviceId() CUDA error:"
-            " no devices supporting CUDA.\n");
-    exit(EXIT_FAILURE);
-  }
-
-  // Find the best CUDA capable GPU device
-  current_device = 0;
-
-  while (current_device < device_count) {
-    int computeMode = -1, major = 0, minor = 0;
-    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
-    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
-    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
-
-    // If this GPU is not running on Compute Mode prohibited,
-    // then we can add it to the list
-    if (computeMode != cudaComputeModeProhibited) {
-      if (major == 9999 && minor == 9999) {
-        sm_per_multiproc = 1;
-      } else {
-        sm_per_multiproc =
-            _ConvertSMVer2Cores(major,  minor);
-      }
-      int multiProcessorCount = 0, clockRate = 0;
-      checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device));
-      cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device);
-      if (result != cudaSuccess) {
-        // If cudaDevAttrClockRate attribute is not supported we
-        // set clockRate as 1, to consider GPU with most SMs and CUDA Cores.
-        if(result == cudaErrorInvalidValue) {
-          clockRate = 1;
-        }
-        else {
-          fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__,
-            static_cast<unsigned int>(result), _cudaGetErrorEnum(result));
-          exit(EXIT_FAILURE);
-        }
-      }
-      uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
-
-      if (compute_perf > max_compute_perf) {
-        max_compute_perf = compute_perf;
-        max_perf_device = current_device;
-      }
-    } else {
-      devices_prohibited++;
-    }
-
-    ++current_device;
-  }
-
-  if (devices_prohibited == device_count) {
-    fprintf(stderr,
-            "gpuGetMaxGflopsDeviceId() CUDA error:"
-            " all devices have compute mode prohibited.\n");
-    exit(EXIT_FAILURE);
-  }
-
-  return max_perf_device;
-}
-
-// Initialization code to find the best CUDA Device
-inline int findCudaDevice(int argc, const char **argv) {
-  int devID = 0;
-
-  // If the command-line has a device number specified, use it
-  if (checkCmdLineFlag(argc, argv, "device")) {
-    devID = getCmdLineArgumentInt(argc, argv, "device=");
-
-    if (devID < 0) {
-      printf("Invalid command line parameter\n ");
-      exit(EXIT_FAILURE);
-    } else {
-      devID = gpuDeviceInit(devID);
-
-      if (devID < 0) {
-        printf("exiting...\n");
-        exit(EXIT_FAILURE);
-      }
-    }
-  } else {
-    // Otherwise pick the device with highest Gflops/s
-    devID = gpuGetMaxGflopsDeviceId();
-    checkCudaErrors(cudaSetDevice(devID));
-    int major = 0, minor = 0; 
-    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
-    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
-    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
-           devID, _ConvertSMVer2ArchName(major, minor), major, minor);
-
-  }
-
-  return devID;
-}
-
-inline int findIntegratedGPU() {
-  int current_device = 0;
-  int device_count = 0;
-  int devices_prohibited = 0;
-
-  checkCudaErrors(cudaGetDeviceCount(&device_count));
-
-  if (device_count == 0) {
-    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
-    exit(EXIT_FAILURE);
-  }
-
-  // Find the integrated GPU which is compute capable
-  while (current_device < device_count) {
-    int computeMode = -1, integrated = -1;
-    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
-    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device));
-    // If GPU is integrated and is not running on Compute Mode prohibited,
-    // then cuda can map to GLES resource
-    if (integrated && (computeMode != cudaComputeModeProhibited)) {
-      checkCudaErrors(cudaSetDevice(current_device));
-
-      int major = 0, minor = 0; 
-      checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
-      checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
-      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
-             current_device, _ConvertSMVer2ArchName(major, minor), major, minor);
-
-      return current_device;
-    } else {
-      devices_prohibited++;
-    }
-
-    current_device++;
-  }
-
-  if (devices_prohibited == device_count) {
-    fprintf(stderr,
-            "CUDA error:"
-            " No GLES-CUDA Interop capable GPU found.\n");
-    exit(EXIT_FAILURE);
-  }
-
-  return -1;
-}
-
-// General check for CUDA GPU SM Capabilities
-inline bool checkCudaCapabilities(int major_version, int minor_version) {
-  int dev;
-  int major = 0, minor = 0;
-
-  checkCudaErrors(cudaGetDevice(&dev));
-  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
-  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
-
-  if ((major > major_version) ||
-      (major == major_version &&
-       minor >= minor_version)) {
-    printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
-           _ConvertSMVer2ArchName(major, minor), major, minor);
-    return true;
-  } else {
-    printf(
-        "  No GPU device was found that can support "
-        "CUDA compute capability %d.%d.\n",
-        major_version, minor_version);
-    return false;
-  }
-}
-#endif
-
-  // end of CUDA Helper Functions
-
-#endif  // COMMON_HELPER_CUDA_H_
diff --git a/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp b/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp
deleted file mode 100644
index 59fdbc3..0000000
--- a/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
- *
- * SPDX-License-Identifier: BSD-3-Clause
- */
-
-#define HANDLE_ERROR(x)                                                        \
-{   const auto err = x;                                                        \
-    if (err != CUSTATEVEC_STATUS_SUCCESS ) {                                   \
-        printf("Error: %s in line %d\n",                                       \
-               custatevecGetErrorString(err), __LINE__); return err; }         \
-};
-
-#define HANDLE_CUDA_ERROR(x)                                                   \
-{   const auto err = x;                                                        \
-    if (err != cudaSuccess ) {                                                 \
-        printf("Error: %s in line %d\n",                                       \
-               cudaGetErrorString(err), __LINE__); return err; }               \
-};
-
-bool almost_equal(cuDoubleComplex x, cuDoubleComplex y) {
-    const double eps = 1.0e-5;
-    const cuDoubleComplex diff = cuCsub(x, y);
-    return (cuCabs(diff) < eps);
-}
-
-bool almost_equal(double x, double y) {
-    const double eps = 1.0e-5;
-    const double diff = x - y;
-    return (abs(diff) < eps);
-}
diff --git a/src/qfvm_gpu/cuda_utils/helper_string.h b/src/qfvm_gpu/cuda_utils/helper_string.h
deleted file mode 100644
index 77864b8..0000000
--- a/src/qfvm_gpu/cuda_utils/helper_string.h
+++ /dev/null
@@ -1,683 +0,0 @@
-/**
- * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
- *
- * Please refer to the NVIDIA end user license agreement (EULA) associated
- * with this source code for terms and conditions that govern your use of
- * this software. Any use, reproduction, disclosure, or distribution of
- * this software and related documentation outside the terms of the EULA
- * is strictly prohibited.
- *
- */
-
-// These are helper functions for the SDK samples (string parsing, timers, etc)
-#ifndef COMMON_HELPER_STRING_H_
-#define COMMON_HELPER_STRING_H_
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <fstream>
-#include <string>
-
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-#ifndef _CRT_SECURE_NO_DEPRECATE
-#define _CRT_SECURE_NO_DEPRECATE
-#endif
-#ifndef STRCASECMP
-#define STRCASECMP _stricmp
-#endif
-#ifndef STRNCASECMP
-#define STRNCASECMP _strnicmp
-#endif
-#ifndef STRCPY
-#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
-#endif
-
-#ifndef FOPEN
-#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
-#endif
-#ifndef FOPEN_FAIL
-#define FOPEN_FAIL(result) (result != 0)
-#endif
-#ifndef SSCANF
-#define SSCANF sscanf_s
-#endif
-#ifndef SPRINTF
-#define SPRINTF sprintf_s
-#endif
-#else  // Linux Includes
-#include <string.h>
-#include <strings.h>
-
-#ifndef STRCASECMP
-#define STRCASECMP strcasecmp
-#endif
-#ifndef STRNCASECMP
-#define STRNCASECMP strncasecmp
-#endif
-#ifndef STRCPY
-#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
-#endif
-
-#ifndef FOPEN
-#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
-#endif
-#ifndef FOPEN_FAIL
-#define FOPEN_FAIL(result) (result == NULL)
-#endif
-#ifndef SSCANF
-#define SSCANF sscanf
-#endif
-#ifndef SPRINTF
-#define SPRINTF sprintf
-#endif
-#endif
-
-#ifndef EXIT_WAIVED
-#define EXIT_WAIVED 2
-#endif
-
-// CUDA Utility Helper Functions
-inline int stringRemoveDelimiter(char delimiter, const char *string) {
-  int string_start = 0;
-
-  while (string[string_start] == delimiter) {
-    string_start++;
-  }
-
-  if (string_start >= static_cast<int>(strlen(string) - 1)) {
-    return 0;
-  }
-
-  return string_start;
-}
-
-inline int getFileExtension(char *filename, char **extension) {
-  int string_length = static_cast<int>(strlen(filename));
-
-  while (filename[string_length--] != '.') {
-    if (string_length == 0) break;
-  }
-
-  if (string_length > 0) string_length += 2;
-
-  if (string_length == 0)
-    *extension = NULL;
-  else
-    *extension = &filename[string_length];
-
-  return string_length;
-}
-
-inline bool checkCmdLineFlag(const int argc, const char **argv,
-                             const char *string_ref) {
-  bool bFound = false;
-
-  if (argc >= 1) {
-    for (int i = 1; i < argc; i++) {
-      int string_start = stringRemoveDelimiter('-', argv[i]);
-      const char *string_argv = &argv[i][string_start];
-
-      const char *equal_pos = strchr(string_argv, '=');
-      int argv_length = static_cast<int>(
-          equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
-
-      int length = static_cast<int>(strlen(string_ref));
-
-      if (length == argv_length &&
-          !STRNCASECMP(string_argv, string_ref, length)) {
-        bFound = true;
-        continue;
-      }
-    }
-  }
-
-  return bFound;
-}
-
-// This function wraps the CUDA Driver API into a template function
-template <class T>
-inline bool getCmdLineArgumentValue(const int argc, const char **argv,
-                                    const char *string_ref, T *value) {
-  bool bFound = false;
-
-  if (argc >= 1) {
-    for (int i = 1; i < argc; i++) {
-      int string_start = stringRemoveDelimiter('-', argv[i]);
-      const char *string_argv = &argv[i][string_start];
-      int length = static_cast<int>(strlen(string_ref));
-
-      if (!STRNCASECMP(string_argv, string_ref, length)) {
-        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
-          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
-          *value = (T)atoi(&string_argv[length + auto_inc]);
-        }
-
-        bFound = true;
-        i = argc;
-      }
-    }
-  }
-
-  return bFound;
-}
-
-inline int getCmdLineArgumentInt(const int argc, const char **argv,
-                                 const char *string_ref) {
-  bool bFound = false;
-  int value = -1;
-
-  if (argc >= 1) {
-    for (int i = 1; i < argc; i++) {
-      int string_start = stringRemoveDelimiter('-', argv[i]);
-      const char *string_argv = &argv[i][string_start];
-      int length = static_cast<int>(strlen(string_ref));
-
-      if (!STRNCASECMP(string_argv, string_ref, length)) {
-        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
-          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
-          value = atoi(&string_argv[length + auto_inc]);
-        } else {
-          value = 0;
-        }
-
-        bFound = true;
-        continue;
-      }
-    }
-  }
-
-  if (bFound) {
-    return value;
-  } else {
-    return 0;
-  }
-}
-
-inline float getCmdLineArgumentFloat(const int argc, const char **argv,
-                                     const char *string_ref) {
-  bool bFound = false;
-  float value = -1;
-
-  if (argc >= 1) {
-    for (int i = 1; i < argc; i++) {
-      int string_start = stringRemoveDelimiter('-', argv[i]);
-      const char *string_argv = &argv[i][string_start];
-      int length = static_cast<int>(strlen(string_ref));
-
-      if (!STRNCASECMP(string_argv, string_ref, length)) {
-        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
-          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
-          value = static_cast<float>(atof(&string_argv[length + auto_inc]));
-        } else {
-          value = 0.f;
-        }
-
-        bFound = true;
-        continue;
-      }
-    }
-  }
-
-  if (bFound) {
-    return value;
-  } else {
-    return 0;
-  }
-}
-
-inline bool getCmdLineArgumentString(const int argc, const char **argv,
-                                     const char *string_ref,
-                                     char **string_retval) {
-  bool bFound = false;
-
-  if (argc >= 1) {
-    for (int i = 1; i < argc; i++) {
-      int string_start = stringRemoveDelimiter('-', argv[i]);
-      char *string_argv = const_cast<char*>(&argv[i][string_start]);
-      int length = static_cast<int>(strlen(string_ref));
-
-      if (!STRNCASECMP(string_argv, string_ref, length)) {
-        *string_retval = &string_argv[length + 1];
-        bFound = true;
-        continue;
-      }
-    }
-  }
-
-  if (!bFound) {
-    *string_retval = NULL;
-  }
-
-  return bFound;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Find the path for a file assuming that
-//! files are found in the searchPath.
-//!
-//! @return the path if succeeded, otherwise 0
-//! @param filename         name of the file
-//! @param executable_path  optional absolute path of the executable
-//////////////////////////////////////////////////////////////////////////////
-inline char *sdkFindFilePath(const char *filename,
-                             const char *executable_path) {
-  // <executable_name> defines a variable that is replaced with the name of the
-  // executable
-
-  // Typical relative search paths to locate needed companion files (e.g. sample
-  // input data, or JIT source files) The origin for the relative search may be
-  // the .exe file, a .bat file launching an .exe, a browser .exe launching the
-  // .exe or .bat, etc
-  const char *searchPath[] = {
-      "./",  // same dir
-      "./<executable_name>_data_files/",
-      "./common/",                      // "/common/" subdir
-      "./common/data/",                 // "/common/data/" subdir
-      "./data/",                        // "/data/" subdir
-      "./src/",                         // "/src/" subdir
-      "./src/<executable_name>/data/",  // "/src/<executable_name>/data/" subdir
-      "./inc/",                         // "/inc/" subdir
-      "./0_Simple/",                    // "/0_Simple/" subdir
-      "./1_Utilities/",                 // "/1_Utilities/" subdir
-      "./2_Graphics/",                  // "/2_Graphics/" subdir
-      "./3_Imaging/",                   // "/3_Imaging/" subdir
-      "./4_Finance/",                   // "/4_Finance/" subdir
-      "./5_Simulations/",               // "/5_Simulations/" subdir
-      "./6_Advanced/",                  // "/6_Advanced/" subdir
-      "./7_CUDALibraries/",             // "/7_CUDALibraries/" subdir
-      "./8_Android/",                   // "/8_Android/" subdir
-      "./samples/",                     // "/samples/" subdir
-
-      "./0_Simple/<executable_name>/data/",  // "/0_Simple/<executable_name>/data/"
-                                             // subdir
-      "./1_Utilities/<executable_name>/data/",  // "/1_Utilities/<executable_name>/data/"
-                                                // subdir
-      "./2_Graphics/<executable_name>/data/",  // "/2_Graphics/<executable_name>/data/"
-                                               // subdir
-      "./3_Imaging/<executable_name>/data/",  // "/3_Imaging/<executable_name>/data/"
-                                              // subdir
-      "./4_Finance/<executable_name>/data/",  // "/4_Finance/<executable_name>/data/"
-                                              // subdir
-      "./5_Simulations/<executable_name>/data/",  // "/5_Simulations/<executable_name>/data/"
-                                                  // subdir
-      "./6_Advanced/<executable_name>/data/",  // "/6_Advanced/<executable_name>/data/"
-                                               // subdir
-      "./7_CUDALibraries/<executable_name>/",  // "/7_CUDALibraries/<executable_name>/"
-                                               // subdir
-      "./7_CUDALibraries/<executable_name>/data/",  // "/7_CUDALibraries/<executable_name>/data/"
-                                                    // subdir
-
-      "../",              // up 1 in tree
-      "../common/",       // up 1 in tree, "/common/" subdir
-      "../common/data/",  // up 1 in tree, "/common/data/" subdir
-      "../data/",         // up 1 in tree, "/data/" subdir
-      "../src/",          // up 1 in tree, "/src/" subdir
-      "../inc/",          // up 1 in tree, "/inc/" subdir
-
-      "../0_Simple/<executable_name>/data/",  // up 1 in tree,
-                                              // "/0_Simple/<executable_name>/"
-                                              // subdir
-      "../1_Utilities/<executable_name>/data/",  // up 1 in tree,
-                                                 // "/1_Utilities/<executable_name>/"
-                                                 // subdir
-      "../2_Graphics/<executable_name>/data/",  // up 1 in tree,
-                                                // "/2_Graphics/<executable_name>/"
-                                                // subdir
-      "../3_Imaging/<executable_name>/data/",  // up 1 in tree,
-                                               // "/3_Imaging/<executable_name>/"
-                                               // subdir
-      "../4_Finance/<executable_name>/data/",  // up 1 in tree,
-                                               // "/4_Finance/<executable_name>/"
-                                               // subdir
-      "../5_Simulations/<executable_name>/data/",  // up 1 in tree,
-                                                   // "/5_Simulations/<executable_name>/"
-                                                   // subdir
-      "../6_Advanced/<executable_name>/data/",  // up 1 in tree,
-                                                // "/6_Advanced/<executable_name>/"
-                                                // subdir
-      "../7_CUDALibraries/<executable_name>/data/",  // up 1 in tree,
-                                                     // "/7_CUDALibraries/<executable_name>/"
-                                                     // subdir
-      "../8_Android/<executable_name>/data/",  // up 1 in tree,
-                                               // "/8_Android/<executable_name>/"
-                                               // subdir
-      "../samples/<executable_name>/data/",  // up 1 in tree,
-                                             // "/samples/<executable_name>/"
-                                             // subdir
-      "../../",                              // up 2 in tree
-      "../../common/",                       // up 2 in tree, "/common/" subdir
-      "../../common/data/",  // up 2 in tree, "/common/data/" subdir
-      "../../data/",         // up 2 in tree, "/data/" subdir
-      "../../src/",          // up 2 in tree, "/src/" subdir
-      "../../inc/",          // up 2 in tree, "/inc/" subdir
-      "../../sandbox/<executable_name>/data/",  // up 2 in tree,
-                                                // "/sandbox/<executable_name>/"
-                                                // subdir
-      "../../0_Simple/<executable_name>/data/",  // up 2 in tree,
-                                                 // "/0_Simple/<executable_name>/"
-                                                 // subdir
-      "../../1_Utilities/<executable_name>/data/",  // up 2 in tree,
-                                                    // "/1_Utilities/<executable_name>/"
-                                                    // subdir
-      "../../2_Graphics/<executable_name>/data/",  // up 2 in tree,
-                                                   // "/2_Graphics/<executable_name>/"
-                                                   // subdir
-      "../../3_Imaging/<executable_name>/data/",  // up 2 in tree,
-                                                  // "/3_Imaging/<executable_name>/"
-                                                  // subdir
-      "../../4_Finance/<executable_name>/data/",  // up 2 in tree,
-                                                  // "/4_Finance/<executable_name>/"
-                                                  // subdir
-      "../../5_Simulations/<executable_name>/data/",  // up 2 in tree,
-                                                      // "/5_Simulations/<executable_name>/"
-                                                      // subdir
-      "../../6_Advanced/<executable_name>/data/",  // up 2 in tree,
-                                                   // "/6_Advanced/<executable_name>/"
-                                                   // subdir
-      "../../7_CUDALibraries/<executable_name>/data/",  // up 2 in tree,
-                                                        // "/7_CUDALibraries/<executable_name>/"
-                                                        // subdir
-      "../../8_Android/<executable_name>/data/",  // up 2 in tree,
-                                                  // "/8_Android/<executable_name>/"
-                                                  // subdir
-      "../../samples/<executable_name>/data/",  // up 2 in tree,
-                                                // "/samples/<executable_name>/"
-                                                // subdir
-      "../../../",                              // up 3 in tree
-      "../../../src/<executable_name>/",        // up 3 in tree,
-                                          // "/src/<executable_name>/" subdir
-      "../../../src/<executable_name>/data/",  // up 3 in tree,
-                                               // "/src/<executable_name>/data/"
-                                               // subdir
-      "../../../src/<executable_name>/src/",   // up 3 in tree,
-                                               // "/src/<executable_name>/src/"
-                                               // subdir
-      "../../../src/<executable_name>/inc/",   // up 3 in tree,
-                                               // "/src/<executable_name>/inc/"
-                                               // subdir
-      "../../../sandbox/<executable_name>/",   // up 3 in tree,
-                                               // "/sandbox/<executable_name>/"
-                                               // subdir
-      "../../../sandbox/<executable_name>/data/",  // up 3 in tree,
-                                                   // "/sandbox/<executable_name>/data/"
-                                                   // subdir
-      "../../../sandbox/<executable_name>/src/",  // up 3 in tree,
-                                                  // "/sandbox/<executable_name>/src/"
-                                                  // subdir
-      "../../../sandbox/<executable_name>/inc/",  // up 3 in tree,
-                                                  // "/sandbox/<executable_name>/inc/"
-                                                  // subdir
-      "../../../0_Simple/<executable_name>/data/",  // up 3 in tree,
-                                                    // "/0_Simple/<executable_name>/"
-                                                    // subdir
-      "../../../1_Utilities/<executable_name>/data/",  // up 3 in tree,
-                                                       // "/1_Utilities/<executable_name>/"
-                                                       // subdir
-      "../../../2_Graphics/<executable_name>/data/",  // up 3 in tree,
-                                                      // "/2_Graphics/<executable_name>/"
-                                                      // subdir
-      "../../../3_Imaging/<executable_name>/data/",  // up 3 in tree,
-                                                     // "/3_Imaging/<executable_name>/"
-                                                     // subdir
-      "../../../4_Finance/<executable_name>/data/",  // up 3 in tree,
-                                                     // "/4_Finance/<executable_name>/"
-                                                     // subdir
-      "../../../5_Simulations/<executable_name>/data/",  // up 3 in tree,
-                                                         // "/5_Simulations/<executable_name>/"
-                                                         // subdir
-      "../../../6_Advanced/<executable_name>/data/",  // up 3 in tree,
-                                                      // "/6_Advanced/<executable_name>/"
-                                                      // subdir
-      "../../../7_CUDALibraries/<executable_name>/data/",  // up 3 in tree,
-                                                           // "/7_CUDALibraries/<executable_name>/"
-                                                           // subdir
-      "../../../8_Android/<executable_name>/data/",  // up 3 in tree,
-                                                     // "/8_Android/<executable_name>/"
-                                                     // subdir
-      "../../../0_Simple/<executable_name>/",  // up 3 in tree,
-                                               // "/0_Simple/<executable_name>/"
-                                               // subdir
-      "../../../1_Utilities/<executable_name>/",  // up 3 in tree,
-                                                  // "/1_Utilities/<executable_name>/"
-                                                  // subdir
-      "../../../2_Graphics/<executable_name>/",  // up 3 in tree,
-                                                 // "/2_Graphics/<executable_name>/"
-                                                 // subdir
-      "../../../3_Imaging/<executable_name>/",  // up 3 in tree,
-                                                // "/3_Imaging/<executable_name>/"
-                                                // subdir
-      "../../../4_Finance/<executable_name>/",  // up 3 in tree,
-                                                // "/4_Finance/<executable_name>/"
-                                                // subdir
-      "../../../5_Simulations/<executable_name>/",  // up 3 in tree,
-                                                    // "/5_Simulations/<executable_name>/"
-                                                    // subdir
-      "../../../6_Advanced/<executable_name>/",  // up 3 in tree,
-                                                 // "/6_Advanced/<executable_name>/"
-                                                 // subdir
-      "../../../7_CUDALibraries/<executable_name>/",  // up 3 in tree,
-                                                      // "/7_CUDALibraries/<executable_name>/"
-                                                      // subdir
-      "../../../8_Android/<executable_name>/",  // up 3 in tree,
-                                                // "/8_Android/<executable_name>/"
-                                                // subdir
-      "../../../samples/<executable_name>/data/",  // up 3 in tree,
-                                                   // "/samples/<executable_name>/"
-                                                   // subdir
-      "../../../common/",       // up 3 in tree, "../../../common/" subdir
-      "../../../common/data/",  // up 3 in tree, "../../../common/data/" subdir
-      "../../../data/",         // up 3 in tree, "../../../data/" subdir
-      "../../../../",           // up 4 in tree
-      "../../../../src/<executable_name>/",  // up 4 in tree,
-                                             // "/src/<executable_name>/" subdir
-      "../../../../src/<executable_name>/data/",  // up 4 in tree,
-                                                  // "/src/<executable_name>/data/"
-                                                  // subdir
-      "../../../../src/<executable_name>/src/",  // up 4 in tree,
-                                                 // "/src/<executable_name>/src/"
-                                                 // subdir
-      "../../../../src/<executable_name>/inc/",  // up 4 in tree,
-                                                 // "/src/<executable_name>/inc/"
-                                                 // subdir
-      "../../../../sandbox/<executable_name>/",  // up 4 in tree,
-                                                 // "/sandbox/<executable_name>/"
-                                                 // subdir
-      "../../../../sandbox/<executable_name>/data/",  // up 4 in tree,
-                                                      // "/sandbox/<executable_name>/data/"
-                                                      // subdir
-      "../../../../sandbox/<executable_name>/src/",  // up 4 in tree,
-                                                     // "/sandbox/<executable_name>/src/"
-                                                     // subdir
-      "../../../../sandbox/<executable_name>/inc/",  // up 4 in tree,
-                                                     // "/sandbox/<executable_name>/inc/"
-                                                     // subdir
-      "../../../../0_Simple/<executable_name>/data/",  // up 4 in tree,
-                                                       // "/0_Simple/<executable_name>/"
-                                                       // subdir
-      "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree,
-                                                          // "/1_Utilities/<executable_name>/"
-                                                          // subdir
-      "../../../../2_Graphics/<executable_name>/data/",  // up 4 in tree,
-                                                         // "/2_Graphics/<executable_name>/"
-                                                         // subdir
-      "../../../../3_Imaging/<executable_name>/data/",  // up 4 in tree,
-                                                        // "/3_Imaging/<executable_name>/"
-                                                        // subdir
-      "../../../../4_Finance/<executable_name>/data/",  // up 4 in tree,
-                                                        // "/4_Finance/<executable_name>/"
-                                                        // subdir
-      "../../../../5_Simulations/<executable_name>/data/",  // up 4 in tree,
-                                                            // "/5_Simulations/<executable_name>/"
-                                                            // subdir
-      "../../../../6_Advanced/<executable_name>/data/",  // up 4 in tree,
-                                                         // "/6_Advanced/<executable_name>/"
-                                                         // subdir
-      "../../../../7_CUDALibraries/<executable_name>/data/",  // up 4 in tree,
-                                                              // "/7_CUDALibraries/<executable_name>/"
-                                                              // subdir
-      "../../../../8_Android/<executable_name>/data/",  // up 4 in tree,
-                                                        // "/8_Android/<executable_name>/"
-                                                        // subdir
-      "../../../../0_Simple/<executable_name>/",  // up 4 in tree,
-                                                  // "/0_Simple/<executable_name>/"
-                                                  // subdir
-      "../../../../1_Utilities/<executable_name>/",  // up 4 in tree,
-                                                     // "/1_Utilities/<executable_name>/"
-                                                     // subdir
-      "../../../../2_Graphics/<executable_name>/",  // up 4 in tree,
-                                                    // "/2_Graphics/<executable_name>/"
-                                                    // subdir
-      "../../../../3_Imaging/<executable_name>/",  // up 4 in tree,
-                                                   // "/3_Imaging/<executable_name>/"
-                                                   // subdir
-      "../../../../4_Finance/<executable_name>/",  // up 4 in tree,
-                                                   // "/4_Finance/<executable_name>/"
-                                                   // subdir
-      "../../../../5_Simulations/<executable_name>/",  // up 4 in tree,
-                                                       // "/5_Simulations/<executable_name>/"
-                                                       // subdir
-      "../../../../6_Advanced/<executable_name>/",  // up 4 in tree,
-                                                    // "/6_Advanced/<executable_name>/"
-                                                    // subdir
-      "../../../../7_CUDALibraries/<executable_name>/",  // up 4 in tree,
-                                                         // "/7_CUDALibraries/<executable_name>/"
-                                                         // subdir
-      "../../../../8_Android/<executable_name>/",  // up 4 in tree,
-                                                   // "/8_Android/<executable_name>/"
-                                                   // subdir
-      "../../../../samples/<executable_name>/data/",  // up 4 in tree,
-                                                      // "/samples/<executable_name>/"
-                                                      // subdir
-      "../../../../common/",       // up 4 in tree, "../../../common/" subdir
-      "../../../../common/data/",  // up 4 in tree, "../../../common/data/"
-                                   // subdir
-      "../../../../data/",         // up 4 in tree, "../../../data/" subdir
-      "../../../../../",           // up 5 in tree
-      "../../../../../src/<executable_name>/",  // up 5 in tree,
-                                                // "/src/<executable_name>/"
-                                                // subdir
-      "../../../../../src/<executable_name>/data/",  // up 5 in tree,
-                                                     // "/src/<executable_name>/data/"
-                                                     // subdir
-      "../../../../../src/<executable_name>/src/",  // up 5 in tree,
-                                                    // "/src/<executable_name>/src/"
-                                                    // subdir
-      "../../../../../src/<executable_name>/inc/",  // up 5 in tree,
-                                                    // "/src/<executable_name>/inc/"
-                                                    // subdir
-      "../../../../../sandbox/<executable_name>/",  // up 5 in tree,
-                                                    // "/sandbox/<executable_name>/"
-                                                    // subdir
-      "../../../../../sandbox/<executable_name>/data/",  // up 5 in tree,
-                                                         // "/sandbox/<executable_name>/data/"
-                                                         // subdir
-      "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree,
-                                                        // "/sandbox/<executable_name>/src/"
-                                                        // subdir
-      "../../../../../sandbox/<executable_name>/inc/",  // up 5 in tree,
-                                                        // "/sandbox/<executable_name>/inc/"
-                                                        // subdir
-      "../../../../../0_Simple/<executable_name>/data/",  // up 5 in tree,
-                                                          // "/0_Simple/<executable_name>/"
-                                                          // subdir
-      "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree,
-                                                             // "/1_Utilities/<executable_name>/"
-                                                             // subdir
-      "../../../../../2_Graphics/<executable_name>/data/",  // up 5 in tree,
-                                                            // "/2_Graphics/<executable_name>/"
-                                                            // subdir
-      "../../../../../3_Imaging/<executable_name>/data/",  // up 5 in tree,
-                                                           // "/3_Imaging/<executable_name>/"
-                                                           // subdir
-      "../../../../../4_Finance/<executable_name>/data/",  // up 5 in tree,
-                                                           // "/4_Finance/<executable_name>/"
-                                                           // subdir
-      "../../../../../5_Simulations/<executable_name>/data/",  // up 5 in tree,
-                                                               // "/5_Simulations/<executable_name>/"
-                                                               // subdir
-      "../../../../../6_Advanced/<executable_name>/data/",  // up 5 in tree,
-                                                            // "/6_Advanced/<executable_name>/"
-                                                            // subdir
-      "../../../../../7_CUDALibraries/<executable_name>/data/",  // up 5 in
-                                                                 // tree,
-                                                                 // "/7_CUDALibraries/<executable_name>/"
-                                                                 // subdir
-      "../../../../../8_Android/<executable_name>/data/",  // up 5 in tree,
-                                                           // "/8_Android/<executable_name>/"
-                                                           // subdir
-      "../../../../../samples/<executable_name>/data/",  // up 5 in tree,
-                                                         // "/samples/<executable_name>/"
-                                                         // subdir
-      "../../../../../common/",       // up 5 in tree, "../../../common/" subdir
-      "../../../../../common/data/",  // up 5 in tree, "../../../common/data/"
-                                      // subdir
-  };
-
-  // Extract the executable name
-  std::string executable_name;
-
-  if (executable_path != 0) {
-    executable_name = std::string(executable_path);
-
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-    // Windows path delimiter
-    size_t delimiter_pos = executable_name.find_last_of('\\');
-    executable_name.erase(0, delimiter_pos + 1);
-
-    if (executable_name.rfind(".exe") != std::string::npos) {
-      // we strip .exe, only if the .exe is found
-      executable_name.resize(executable_name.size() - 4);
-    }
-
-#else
-    // Linux & OSX path delimiter
-    size_t delimiter_pos = executable_name.find_last_of('/');
-    executable_name.erase(0, delimiter_pos + 1);
-#endif
-  }
-
-  // Loop over all search paths and return the first hit
-  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
-    std::string path(searchPath[i]);
-    size_t executable_name_pos = path.find("<executable_name>");
-
-    // If there is executable_name variable in the searchPath
-    // replace it with the value
-    if (executable_name_pos != std::string::npos) {
-      if (executable_path != 0) {
-        path.replace(executable_name_pos, strlen("<executable_name>"),
-                     executable_name);
-      } else {
-        // Skip this path entry if no executable argument is given
-        continue;
-      }
-    }
-
-#ifdef _DEBUG
-    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
-#endif
-
-    // Test if the file exists
-    path.append(filename);
-    FILE *fp;
-    FOPEN(fp, path.c_str(), "rb");
-
-    if (fp != NULL) {
-      fclose(fp);
-      // File found
-      // returning an allocated array here for backwards compatibility reasons
-      char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
-      STRCPY(file_path, path.length() + 1, path.c_str());
-      return file_path;
-    }
-
-    if (fp) {
-      fclose(fp);
-    }
-  }
-
-  // File not found
-  return 0;
-}
-
-#endif  // COMMON_HELPER_STRING_H_
diff --git a/src/qfvm_gpu/cuda_utils/ticktock.h b/src/qfvm_gpu/cuda_utils/ticktock.h
deleted file mode 100644
index 1adb8a9..0000000
--- a/src/qfvm_gpu/cuda_utils/ticktock.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#pragma once
-
-//#include <chrono>
-//#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
-//#define TOCK(x) std::cout << #x ": " << std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - bench_##x).count() << "s" << std::endl;
-
-#include <tbb/tick_count.h>
-#define TICK(x) auto bench_##x = tbb::tick_count::now();
-#define TOCK(x) std::cout << #x ": " << (tbb::tick_count::now() - bench_##x).seconds() << "s" << std::endl;
diff --git a/src/qfvm_gpu/custate_simu.cuh b/src/qfvm_gpu/custate_simu.cuh
deleted file mode 100644
index 6456c7e..0000000
--- a/src/qfvm_gpu/custate_simu.cuh
+++ /dev/null
@@ -1,35 +0,0 @@
-#pragma once
-#include "cuda_statevector.cuh"
-#include <circuit.hpp>
-#include <types.hpp>
-#include <statevector.hpp>
-#include "apply_gate_custate.cuh"
-
-void simulate_custate(Circuit & circuit, CudaStateVector & psi_d){
-    size_t size = psi_d.size();
-    int n = psi_d.num();
-    for (auto gate : circuit.gates()){
-        apply_gate_custate(psi_d.data(), gate, n);
-    }
-}
-
-void simulate_custate(Circuit & circuit, StateVector<data_t> & state){
-    //initialize psi
-    state.set_num(circuit.qubit_num());
-    size_t size = state.size();
-    CudaStateVector psi_d(state);
-    
-    simulate_custate(circuit, psi_d);
-    cudaDeviceSynchronize();
-
-    //copy back
-    complex<double>* psi = reinterpret_cast<complex<double>*>(psi_d.data());
-    checkCudaErrors(cudaMemcpy(state.data(), psi, size*sizeof(complex<double>), cudaMemcpyDeviceToHost));
-    psi=nullptr;
-}   
-
-StateVector<double> simulate_custate(Circuit & circuit){
-    StateVector<double> state(circuit.qubit_num());
-    simulate_custate(circuit, state);
-    return std::move(state);
-}
\ No newline at end of file

From 2177df4b0aef500d91642f1a67f89e8f9965289a Mon Sep 17 00:00:00 2001
From: chensgit169 <weichen191@mails.ucas.ac.cn>
Date: Thu, 30 Nov 2023 16:09:35 +0800
Subject: [PATCH 3/5] Revert "remove redundant files"

This reverts commit 4efd9b53b12b7c0faf54462101474e8c4810a07a.
---
 src/qfvm/circuit.hpp                          |  241 +++
 src/qfvm/operators.hpp                        |   97 ++
 src/qfvm/qasm.hpp                             |   42 +
 src/qfvm/qfvm.cpp                             |  164 ++
 src/qfvm/simulator.hpp                        |  116 ++
 src/qfvm/statevector.hpp                      | 1355 +++++++++++++++++
 src/qfvm/types.hpp                            |   33 +
 src/qfvm/util.h                               |   99 ++
 src/qfvm_gpu/apply_gate_custate.cuh           |   46 +
 src/qfvm_gpu/apply_gate_gpu.cuh               |  411 +++++
 src/qfvm_gpu/cuda_simulator.cuh               |   91 ++
 src/qfvm_gpu/cuda_statevector.cuh             |   58 +
 src/qfvm_gpu/cuda_utils/CudaTexture.h         |   39 +
 src/qfvm_gpu/cuda_utils/helper_cuda.h         |  953 ++++++++++++
 src/qfvm_gpu/cuda_utils/helper_custatevec.hpp |   31 +
 src/qfvm_gpu/cuda_utils/helper_string.h       |  683 +++++++++
 src/qfvm_gpu/cuda_utils/ticktock.h            |    9 +
 src/qfvm_gpu/custate_simu.cuh                 |   35 +
 18 files changed, 4503 insertions(+)
 create mode 100644 src/qfvm/circuit.hpp
 create mode 100644 src/qfvm/operators.hpp
 create mode 100644 src/qfvm/qasm.hpp
 create mode 100644 src/qfvm/qfvm.cpp
 create mode 100644 src/qfvm/simulator.hpp
 create mode 100644 src/qfvm/statevector.hpp
 create mode 100644 src/qfvm/types.hpp
 create mode 100644 src/qfvm/util.h
 create mode 100644 src/qfvm_gpu/apply_gate_custate.cuh
 create mode 100644 src/qfvm_gpu/apply_gate_gpu.cuh
 create mode 100644 src/qfvm_gpu/cuda_simulator.cuh
 create mode 100644 src/qfvm_gpu/cuda_statevector.cuh
 create mode 100644 src/qfvm_gpu/cuda_utils/CudaTexture.h
 create mode 100644 src/qfvm_gpu/cuda_utils/helper_cuda.h
 create mode 100644 src/qfvm_gpu/cuda_utils/helper_custatevec.hpp
 create mode 100644 src/qfvm_gpu/cuda_utils/helper_string.h
 create mode 100644 src/qfvm_gpu/cuda_utils/ticktock.h
 create mode 100644 src/qfvm_gpu/custate_simu.cuh

diff --git a/src/qfvm/circuit.hpp b/src/qfvm/circuit.hpp
new file mode 100644
index 0000000..cd08bea
--- /dev/null
+++ b/src/qfvm/circuit.hpp
@@ -0,0 +1,241 @@
+#pragma once
+#include "operators.hpp"
+#include "qasm.hpp"
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include <pybind11/stl.h>
+#include <pybind11/eigen.h>
+#include "util.h"
+#include <Eigen/Core>
+#include <algorithm>
+#include <iostream>
+
+namespace py = pybind11;
+using namespace pybind11::literals;
+
+
+void check_operator(QuantumOperator &op){
+    std::cout << "-------------" << std::endl;
+
+    std::cout << "name: " << op.name() << std::endl;
+    std::cout << "pos: ";
+    Qfutil::printVector(op.positions());
+
+    std::cout << "paras: ";
+    Qfutil::printVector(op.paras());
+
+    std::cout << "control number: ";
+    std::cout << op.control_num() << std::endl;
+
+    std::cout << "matrix: " << std::endl;
+    std::cout << op.mat() << std::endl;
+
+    std::cout << "flatten matrix: " << std::endl;
+    auto mat = op.mat();
+    // Eigen::Map<Eigen::RowVectorXcd> v1(mat.data(), mat.size());
+    // std::cout << "v1: " << v1 << std::endl;
+    auto matv = mat.data();
+    for (auto i = 0;i < mat.size();i++){
+        std::cout << matv[i] << " ";
+    }
+    std::cout << std::endl;
+    std::cout << "-------------" << std::endl;
+}
+
+
+class Circuit{
+    private:
+        uint qubit_num_;  
+        vector<QuantumOperator> instructions_;
+        uint max_targe_num_;
+        uint cbit_num_;
+        // to sample count
+        vector<std::pair<uint,uint>> measure_vec_;
+        bool final_measure_ = true;
+
+    public:
+    Circuit();
+    explicit Circuit(uint qubit_num);
+    explicit Circuit(vector<QuantumOperator> &ops);
+    explicit Circuit(py::object const&pycircuit); 
+
+    void add_op(QuantumOperator &op);
+    void compress_instructions();
+    uint qubit_num() const { return qubit_num_; }
+    uint cbit_num() const { return cbit_num_; }
+    uint max_targe_num() const { return max_targe_num_; }
+    bool final_measure() const { return final_measure_; }
+    vector<QuantumOperator> gates();
+    vector<std::pair<uint,uint>> measure_vec() { return measure_vec_; }
+    vector<QuantumOperator> instructions() const { return instructions_; }
+    QuantumOperator from_pyops(py::object const &obj);
+};
+
+void Circuit::add_op(QuantumOperator &op){
+    for (pos_t pos : op.positions()){
+        if (pos > qubit_num_) {
+            throw "invalid position on quantum registers";
+        }
+        else{
+            instructions_.push_back(op);
+        }
+    }
+}
+
+ Circuit::Circuit(){};
+ Circuit::Circuit(uint qubit_num)
+ :
+ qubit_num_(qubit_num){ }
+
+ Circuit::Circuit(vector<QuantumOperator> &ops)
+ :
+ instructions_(ops),
+ max_targe_num_(0){
+    qubit_num_ = 0;
+    for (auto op : ops){
+        for (pos_t pos : op.positions()){
+            if (op.targe_num() > max_targe_num_)
+                max_targe_num_ = op.targe_num();
+            if (pos+1 > qubit_num_){ qubit_num_ = pos+1; }
+        }
+    }
+}
+
+vector<QuantumOperator> Circuit::gates(){
+    // provide gates for gpu and custate
+    std::vector<std::string> classics = {"measure", "cif", "reset"};
+    vector<QuantumOperator> gates;
+    for(auto op : instructions_){
+        if(std::find(classics.begin(), classics.end(), op.name()) == classics.end()){
+            gates.push_back(op);
+        }
+    }
+    return gates;
+}
+
+// Construct C++ operators from pygates
+QuantumOperator Circuit::from_pyops(py::object const &obj){
+    string name;
+    vector<pos_t> positions;
+    vector<pos_t> qbits;
+    vector<pos_t> cbits;
+    vector<double> paras;
+    uint control_num = 0;
+    RowMatrixXcd mat;
+    
+    name = obj.attr("name").attr("lower")().cast<string>();
+    if (!(name == "barrier" || name == "delay" || name == "id" || name == "measure" || name == "reset" || name == "cif"))
+    {
+        if (py::isinstance<py::list>(obj.attr("pos"))){
+            positions = obj.attr("pos").cast<vector<pos_t>>();
+        }
+        else if(py::isinstance<py::int_>(obj.attr("pos"))){
+            positions = vector<pos_t>{obj.attr("pos").cast<pos_t>()};
+        }
+
+        if (py::isinstance<py::list>(obj.attr("paras"))){
+            paras = obj.attr("paras").cast<vector<double>>();
+        }
+        else if(py::isinstance<py::float_>(obj.attr("paras")) || py::isinstance<py::int_>(obj.attr("paras"))){
+            paras = vector<double>{obj.attr("paras").cast<double>()};
+        }
+
+        if (py::hasattr(obj, "ctrls")){
+                control_num = py::len(obj.attr("ctrls"));
+        }
+        
+        //Reverse order for multi-target gate
+        if (py::hasattr(obj, "_targ_matrix")){
+                mat = obj.attr("get_targ_matrix")("reverse_order"_a=true).cast<RowMatrixXcd>();
+        }
+        else{ //Single target gate
+                mat = obj.attr("matrix").cast<RowMatrixXcd>();
+        }
+        return QuantumOperator(name, paras, positions, control_num, mat);
+
+    }else if(name == "measure"){
+        if (py::isinstance<py::list>(obj.attr("qbits"))){
+            qbits = obj.attr("qbits").cast<vector<pos_t>>();
+        }else if(py::isinstance<py::int_>(obj.attr("qbits"))){
+            qbits = vector<pos_t>{obj.attr("qbits").cast<pos_t>()};
+        }
+
+        if (py::isinstance<py::list>(obj.attr("cbits"))){
+            cbits = obj.attr("cbits").cast<vector<pos_t>>();
+        }else if(py::isinstance<py::int_>(obj.attr("cbits"))){
+            cbits = vector<pos_t>{obj.attr("cbits").cast<pos_t>()};
+        }
+        //record qbit-cbit measure map
+        for(uint i = 0; i < qbits.size(); i++){
+            measure_vec_.push_back(std::make_pair(qbits[i], cbits[i]));
+        }
+        return QuantumOperator(name, qbits, cbits);
+
+    }else if(name == "reset"){
+        if (py::isinstance<py::list>(obj.attr("pos"))){
+            positions = obj.attr("pos").cast<vector<pos_t>>();
+        }
+        else if(py::isinstance<py::int_>(obj.attr("pos"))){
+            positions = vector<pos_t>{obj.attr("pos").cast<pos_t>()};
+        }
+        return QuantumOperator(name, positions);
+
+    }else if(name == "cif"){
+        uint condition = 0;
+        vector<QuantumOperator> instructions;
+        if (py::isinstance<py::list>(obj.attr("cbits"))){
+            cbits = obj.attr("cbits").cast<vector<pos_t>>();
+        }else if(py::isinstance<py::int_>(obj.attr("cbits"))){
+            cbits = vector<pos_t>{obj.attr("cbits").cast<pos_t>()};
+        }
+
+        if(py::isinstance<py::int_>(obj.attr("condition"))){
+            condition = obj.attr("condition").cast<pos_t>();
+        }
+
+        // Recursively handdle instruction
+        if (py::isinstance<py::list>(obj.attr("instructions"))){
+            auto pyops = obj.attr("instructions");
+            for(auto pyop_h : pyops){
+                py::object pyop = py::reinterpret_borrow<py::object>(pyop_h);
+                QuantumOperator op = from_pyops(pyop);
+                if (op){
+                    if (op.targe_num() > max_targe_num_)
+                        max_targe_num_ = op.targe_num();
+                    instructions.push_back(std::move(op));
+                }        
+            }
+        }
+        return QuantumOperator(name, cbits, condition, instructions); 
+
+    }else{
+        return QuantumOperator();
+    }
+   
+}
+
+Circuit::Circuit(py::object const&pycircuit)
+:
+max_targe_num_(0)
+{
+    // auto pygates = pycircuit.attr("gates");
+    auto pyops = pycircuit.attr("instructions");
+    auto used_qubits = pycircuit.attr("used_qubits").cast<vector<pos_t>>();
+    cbit_num_ =  pycircuit.attr("cbits_num").cast<uint>();
+    qubit_num_ = *std::max_element(used_qubits.begin(), used_qubits.end())+1;
+    // judge wheather op qubit after measure
+    bool measured = false;
+    for (auto pyop_h : pyops){
+        py::object pyop = py::reinterpret_borrow<py::object>(pyop_h);
+        QuantumOperator op = from_pyops(pyop);
+        if (op){
+            if (op.targe_num() > max_targe_num_)
+                max_targe_num_ = op.targe_num();
+            if(op.name() == "measure") {measured = true;}
+            else if(measured == true) {final_measure_ = false; } 
+            instructions_.push_back(std::move(op));
+        }        
+    }
+} 
+
+void Circuit::compress_instructions(){}
diff --git a/src/qfvm/operators.hpp b/src/qfvm/operators.hpp
new file mode 100644
index 0000000..cb9fd66
--- /dev/null
+++ b/src/qfvm/operators.hpp
@@ -0,0 +1,97 @@
+#pragma once
+
+#include <iostream>
+#include "statevector.hpp"
+
+class QuantumOperator{
+    protected:
+        string name_;
+        vector<pos_t> positions_;
+        vector<double> paras_;
+        uint control_num_;
+        uint targe_num_;  
+        bool diag_;
+        bool real_;
+        RowMatrixXcd mat_;
+        vector<pos_t> qbits_;
+        vector<pos_t> cbits_;
+        vector<QuantumOperator> instructions_;
+        uint condition_;
+    public:
+        //Constructor
+        QuantumOperator();
+        QuantumOperator(string name, vector<pos_t> const &qbits);
+        QuantumOperator(string name, vector<pos_t> const &qbits, vector<pos_t> const &cbits);
+        QuantumOperator(string name, vector<pos_t> const &cbits, const uint condition, vector<QuantumOperator> const &ins);
+        QuantumOperator(string name, vector<double> paras, vector<pos_t> const &control_qubits, vector<pos_t> const &targe_qubits, RowMatrixXcd const &mat, bool diag=false, bool real=false);
+        QuantumOperator(string name,vector<double> paras, vector<pos_t> const &positions, uint control_num, RowMatrixXcd const &mat, bool diag=false, bool real=false);
+
+        //data accessor
+        string name() const {return name_;}
+        vector<double> paras() const {return paras_;}
+        bool has_control() const{return control_num_ == 0 ? false : true;}
+        bool is_real() const{ return real_; }
+        bool is_diag() const{ return diag_; }
+        RowMatrixXcd mat() const { return mat_;}
+        uint control_num() const { return control_num_; } 
+        uint targe_num() const { return targe_num_; }
+        uint condition() const { return condition_; }
+        vector<pos_t> positions(){ return positions_; }
+        explicit operator bool() const {
+            return !(name_ == "empty");
+        }
+        vector<pos_t> qbits(){ return qbits_; }
+        vector<pos_t> cbits(){ return cbits_; }
+        vector<QuantumOperator> instructions(){ return instructions_; }
+        //Apply method
+        virtual void apply_to_state(StateVector<double> & state){ };
+};
+
+
+QuantumOperator::QuantumOperator() : name_("empty"){ };
+
+QuantumOperator::QuantumOperator(string name, vector<pos_t> const &qbits)
+:
+name_(name),
+targe_num_(0),
+qbits_(qbits){}
+
+QuantumOperator::QuantumOperator(string name, vector<pos_t> const &qbits, vector<pos_t> const &cbits)
+:
+name_(name),
+targe_num_(0),
+qbits_(qbits),
+cbits_(cbits){}
+
+QuantumOperator::QuantumOperator(string name, vector<pos_t> const &cbits, const uint condition, vector<QuantumOperator> const &ins)
+:
+name_(name),
+targe_num_(0),
+cbits_(cbits),
+instructions_(ins),
+condition_(condition){}
+
+QuantumOperator::QuantumOperator(string name, vector<double> paras, vector<pos_t> const &positions, uint control_num, RowMatrixXcd const &mat, bool diag, bool real)
+:
+name_(name),
+paras_(paras),
+positions_(positions),
+control_num_(control_num),
+targe_num_(positions.size()-control_num),
+diag_(diag),
+real_(real),
+mat_(mat){ }
+
+QuantumOperator::QuantumOperator(string name, vector<double> paras, vector<pos_t> const &control_qubits, vector<pos_t> const &targe_qubits, RowMatrixXcd const &mat, bool diag, bool real)
+:
+name_(name),
+paras_(paras),
+diag_(diag),
+real_(real),
+mat_(mat){
+    positions_ = control_qubits;
+    positions_.insert(positions_.end(), targe_qubits.begin(), targe_qubits.end());
+    control_num_ = control_qubits.size();
+    targe_num_ = targe_qubits.size();
+}
+
diff --git a/src/qfvm/qasm.hpp b/src/qfvm/qasm.hpp
new file mode 100644
index 0000000..e0a2285
--- /dev/null
+++ b/src/qfvm/qasm.hpp
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <unordered_map>
+#include "types.hpp"
+#include "util.h"
+
+using Qfutil::split_string;
+using Qfutil::find_numbers;
+#define Pair(name) {#name, Opname::name}
+
+enum class Opname{
+    creg, x, y, z, h, s, sdg, t, tdg, p, rx, ry, rz, cnot, cx, cz, crx, cp, ccx, toffoli, swap, iswap, rxx, ryy, rzz, measure, reset, cif
+};
+
+std::unordered_map<string, Opname> OPMAP{Pair(creg), Pair(x), Pair(y), Pair(z), Pair(h), Pair(s), Pair(sdg), Pair(t),
+                            Pair(tdg), Pair(p), Pair(rx), Pair(ry), Pair(rz), Pair(cnot), Pair(cx), Pair(cz), 
+                            Pair(crx), Pair(cp), Pair(ccx), Pair(swap), Pair(iswap), Pair(rxx), Pair(ryy), 
+                            Pair(rzz), Pair(measure), Pair(reset), Pair(cif)};
+
+struct Operation{
+    string name;
+    vector<pos_t> positions;
+    vector<double> params; 
+
+    void print_info(){
+        std::cout << "name " << name << std::endl;
+        std::cout << "positions: ";
+        for (auto pos : positions){
+            std::cout << pos << " ";
+        }
+        std::cout << std::endl;
+
+        if (params.size() > 0){
+        printf("parameters: ");
+            for (auto para : params){
+                printf("%.6f ", para);
+            }
+        }
+        printf("\n");
+        printf("-----\n");
+    }
+} ;
diff --git a/src/qfvm/qfvm.cpp b/src/qfvm/qfvm.cpp
new file mode 100644
index 0000000..f318da6
--- /dev/null
+++ b/src/qfvm/qfvm.cpp
@@ -0,0 +1,164 @@
+#include <pybind11/pybind11.h>
+#include <pybind11/numpy.h>
+#include "simulator.hpp"
+#include <iostream>
+#include <random>
+#ifdef _USE_GPU
+#include <cuda_simulator.cuh>
+#endif
+
+#ifdef _USE_CUQUANTUM
+#include <custate_simu.cuh>
+#endif
+
+namespace py = pybind11;
+
+template <typename T>
+py::array_t<T> to_numpy(const std::tuple<T*, size_t> &src) {
+    auto src_ptr = std::get<0>(src);
+    auto src_size = std::get<1>(src);
+
+    auto capsule = py::capsule(src_ptr, [](void* p) {
+        delete [] reinterpret_cast<T*>(p);
+    });
+    return py::array_t<T>(
+        src_size,
+        src_ptr,
+        capsule
+    );
+}
+
+std::pair<std::map<uint, uint>, py::array_t<complex<double>> > simulate_circuit(py::object const&pycircuit, py::array_t<complex<double>> &np_inputstate, const int &shots){
+    auto circuit = Circuit(pycircuit);
+    py::buffer_info buf = np_inputstate.request();
+    auto* data_ptr = reinterpret_cast<std::complex<double>*>(buf.ptr);
+    size_t data_size = buf.size;
+    // If measure all at the end, simulate once
+    uint actual_shots = shots;
+    if (circuit.final_measure()) actual_shots = 1;
+    StateVector<double> global_state;
+    vector<std::pair<uint,uint>>measures = circuit.measure_vec();
+    std::map<uint,bool>cbit_measured;
+    for(auto &pair: measures){
+        cbit_measured[pair.second] = true;        
+    }
+    // Store outcome's count
+    std::map<uint, uint> outcount;
+    for(uint i =0; i < actual_shots; i++){
+        StateVector<double> state;
+        if(data_size == 0){
+            simulate(circuit, state);
+        }else{
+            //deepcopy state
+            vector<std::complex<double>> data_copy(data_ptr, data_ptr + data_size);
+            state = std::move(StateVector<double>(data_copy.data(), data_copy.size()));
+            simulate(circuit, state);
+        }
+        if(!circuit.final_measure()){
+            // store reg
+            vector<uint> tmpcreg = state.creg();
+            uint outcome = 0;
+            for(uint j=0;j<tmpcreg.size();j++){
+                if(cbit_measured.find(j) == cbit_measured.end()) continue; 
+                outcome *= 2;
+                outcome += tmpcreg[j];
+            }
+            if(outcount.find(outcome) != outcount.end()) outcount[outcome]++;
+            else outcount[outcome] = 1;
+        }
+        if (circuit.final_measure() || i == actual_shots-1) global_state = std::move(state);
+    }
+    // sample outcome if final_measure is true
+    if(circuit.final_measure() && !measures.empty()){
+        vector<uint> tmpcount(global_state.size(), 0);
+        vector<double> probs = global_state.probabilities();
+        std::random_device rd;
+        std::mt19937 global_rng(rd());
+        for(uint i = 0; i < shots; i++){
+            uint outcome = std::discrete_distribution<uint>(probs.begin(), probs.end())(global_rng);
+            tmpcount[outcome]++;
+        }
+        // map to reg
+        for(uint i = 0; i < global_state.size(); i++){
+            if(tmpcount[i] == 0) continue;
+            vector<uint> tmpcreg(global_state.cbit_num(), 0); 
+            vector<uint> tmpout = int2vec(i, 2);
+            if(tmpout.size() < global_state.num())
+                tmpout.resize(global_state.num());
+            for(auto &pair: measures){
+                tmpcreg[pair.second] = tmpout[pair.first];
+            }
+            uint outcome = 0;
+            for(uint j=0;j<tmpcreg.size();j++){
+                if(cbit_measured.find(j) == cbit_measured.end()) continue; 
+                outcome *= 2;
+                outcome += tmpcreg[j];
+            }
+            if(outcount.find(outcome) != outcount.end()) outcount[outcome] += tmpcount[i];
+            else outcount[outcome] = tmpcount[i];
+        }
+    } 
+    // return 
+    if(data_size == 0) return std::make_pair(outcount, to_numpy(global_state.move_data_to_python()));
+    else return std::make_pair(outcount, np_inputstate);  
+}
+
+#ifdef _USE_GPU
+py::object simulate_circuit_gpu(py::object const&pycircuit, py::array_t<complex<double>> &np_inputstate){
+    auto circuit = Circuit(pycircuit);
+    py::buffer_info buf = np_inputstate.request();
+    auto* data_ptr = reinterpret_cast<std::complex<double>*>(buf.ptr);
+    size_t data_size = buf.size;
+
+
+    if (data_size == 0){
+        StateVector<double> state;
+        simulate_gpu(circuit, state);
+        return to_numpy(state.move_data_to_python());
+    }
+    else{
+      StateVector<double> state(data_ptr, buf.size);
+      simulate_gpu(circuit, state);
+      state.move_data_to_python();
+      return np_inputstate;
+    }
+}
+#endif
+
+#ifdef _USE_CUQUANTUM
+py::object simulate_circuit_custate(py::object const&pycircuit, py::array_t<complex<double>> &np_inputstate){
+    auto circuit = Circuit(pycircuit);
+    py::buffer_info buf = np_inputstate.request();
+    auto* data_ptr = reinterpret_cast<std::complex<double>*>(buf.ptr);
+    size_t data_size = buf.size;
+
+
+    if (data_size == 0){
+        StateVector<double> state;
+        simulate_custate(circuit, state);
+        return to_numpy(state.move_data_to_python());
+    }
+    else{
+      StateVector<double> state(data_ptr, buf.size);
+      simulate_custate(circuit, state);
+      state.move_data_to_python();
+      return np_inputstate;
+    }
+}
+#endif
+
+
+
+PYBIND11_MODULE(qfvm, m) {
+    m.doc() = "Qfvm simulator";
+    m.def("simulate_circuit", &simulate_circuit, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t<complex<double>>(0), py::arg("shots"));
+
+    #ifdef _USE_GPU
+     m.def("simulate_circuit_gpu", &simulate_circuit_gpu, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t<complex<double>>(0));
+    #endif
+
+    #ifdef _USE_CUQUANTUM
+    m.def("simulate_circuit_custate", &simulate_circuit_custate, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t<complex<double>>(0));
+    #endif
+}
+
diff --git a/src/qfvm/simulator.hpp b/src/qfvm/simulator.hpp
new file mode 100644
index 0000000..1499922
--- /dev/null
+++ b/src/qfvm/simulator.hpp
@@ -0,0 +1,116 @@
+#pragma once
+
+#include "statevector.hpp"
+#include "circuit.hpp"
+
+void apply_op(QuantumOperator &op, StateVector<data_t> &state){
+    bool matched = false; 
+    switch (OPMAP[op.name()]){
+            //Named gate
+        case Opname::x:
+            state.apply_x(op.positions()[0]);
+            break;
+        case Opname::y:
+            state.apply_y(op.positions()[0]);
+            break;
+        case Opname::z:
+            state.apply_z(op.positions()[0]);
+            break;
+        case Opname::h:
+            state.apply_h(op.positions()[0]);
+            break;
+        case Opname::s:
+            state.apply_s(op.positions()[0]);
+            break;
+        case Opname::sdg:
+            state.apply_sdag(op.positions()[0]);
+            break;
+        case Opname::t:
+            state.apply_t(op.positions()[0]);
+            break;
+        case Opname::tdg:
+            state.apply_tdag(op.positions()[0]);
+            break;
+        case Opname::p:
+            state.apply_p(op.positions()[0], op.paras()[0]);
+            break;
+        case Opname::rx:
+            state.apply_rx(op.positions()[0], op.paras()[0]);
+            break;
+        case Opname::ry:
+            state.apply_ry(op.positions()[0], op.paras()[0]);
+            break;
+        case Opname::rz:
+            state.apply_rz(op.positions()[0], op.paras()[0]);
+            break;
+        case Opname::cx:
+            state.apply_cnot(op.positions()[0], op.positions()[1]);
+            break;
+        case Opname::cnot:
+            state.apply_cnot(op.positions()[0], op.positions()[1]);
+            break;
+        case Opname::cp:
+            state.apply_cp(op.positions()[0], op.positions()[1], op.paras()[0]);
+            break;
+        case Opname::cz:
+            state.apply_cz(op.positions()[0], op.positions()[1]);
+            break;
+        case Opname::ccx:
+            state.apply_ccx(op.positions()[0], op.positions()[1],  op.positions()[2]);
+            break;
+        case Opname::toffoli:
+            state.apply_ccx(op.positions()[0], op.positions()[1],  op.positions()[2]);
+        case Opname::rzz:
+            state.apply_cnot(op.positions()[0], op.positions()[1]);
+            state.apply_rz(op.positions()[1], op.paras()[0]);
+            state.apply_cnot(op.positions()[0], op.positions()[1]);
+            break;
+        case Opname::measure:
+            state.apply_measure(op.qbits(), op.cbits());
+            break;
+        case Opname::reset:
+            state.apply_reset(op.qbits());
+            break;
+        case Opname::cif:
+            // check cbits and condition
+            matched = state.check_cif(op.cbits(), op.condition());
+            // apply op in instructions
+            if(matched){
+                for(auto op_h :op.instructions()){
+                    apply_op(op_h, state);
+                }
+            }
+            break;
+        //Other general gate
+        default:
+        {
+            if (op.targe_num() == 1){
+                auto mat_temp = op.mat();
+                complex<double> *mat = mat_temp.data();
+                if (op.control_num() == 0){
+                    state.apply_one_targe_gate_general<0>(op.positions(), mat);
+                }else if (op.control_num() == 1){
+                    state.apply_one_targe_gate_general<1>(op.positions(), mat);
+                }else{
+                    state.apply_one_targe_gate_general<2>(op.positions(), mat);
+                }
+            }else if(op.targe_num() > 1){
+                state.apply_multi_targe_gate_general(op.positions(), op.control_num(), op.mat());
+            }else{
+                throw "Invalid target number";
+            }
+        }
+    }
+}
+
+void simulate(Circuit const& circuit, StateVector<data_t> & state){
+    state.set_num(circuit.qubit_num());
+    state.set_creg(circuit.cbit_num());
+    // skip measure and handle it in qfvm.cpp 
+    bool skip_measure = circuit.final_measure();
+    for (auto op : circuit.instructions()){
+        if(skip_measure == true && op.name() == "measure") continue;
+        apply_op(op , state);
+    }
+}
+
diff --git a/src/qfvm/statevector.hpp b/src/qfvm/statevector.hpp
new file mode 100644
index 0000000..99cf5f7
--- /dev/null
+++ b/src/qfvm/statevector.hpp
@@ -0,0 +1,1355 @@
+#pragma once
+#include "types.hpp"
+#include "util.h"
+#include <stdlib.h>
+#include <iostream>
+#include <cmath>
+#include <omp.h>
+#include <functional>
+#include <algorithm>
+#include <random>
+#ifdef USE_SIMD
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#endif
+
+template <class real_t = double>
+class StateVector{
+    private:
+        uint num_;
+         // classical bit
+        uint cbit_num_;  
+        vector<uint> creg_;
+        size_t size_;
+        std::unique_ptr<complex<real_t>[]> data_;
+        //random engine
+        std::mt19937_64 rng_;
+
+    public:
+        //construct function
+        StateVector();
+        explicit StateVector(uint num);
+        explicit StateVector(complex<real_t> *data, size_t data_size);
+        //move assign
+        // StateVector& operator=(StateVector&& other){
+        //     if(this != &other){
+        //         data_ = std::move(other.data_);
+        //         creg_ = std::move(other.creg_);
+        //         num_ = other.num_;
+        //         cbit_num_ = other.cbit_num_;
+        //         size_ = other.size_;
+
+        //     }
+        //     return *this;
+        // }
+
+        //Named gate function
+        void apply_x(pos_t pos);
+        void apply_y(pos_t pos);
+        void apply_z(pos_t pos);
+        void apply_h(pos_t pos);
+        void apply_s(pos_t pos);
+        void apply_sdag(pos_t pos);
+        void apply_t(pos_t pos);
+        void apply_tdag(pos_t pos);
+        void apply_p(pos_t pos, real_t phase);
+        void apply_rx(pos_t pos, real_t theta);
+        void apply_ry(pos_t pos, real_t theta);
+        void apply_rz(pos_t pos, real_t theta);
+        void apply_cnot(pos_t control, pos_t targe);
+        void apply_cz(pos_t control, pos_t targe);
+        void apply_cp(pos_t control, pos_t targe, real_t phase);
+        void apply_crx(pos_t control, pos_t targe,  real_t theta);
+        void apply_cry(pos_t control, pos_t targe,  real_t theta);
+        void apply_ccx(pos_t control1, pos_t control2, pos_t targe);
+        void apply_swap(pos_t q1, pos_t q2);
+
+        //General implementation
+        //One-target gate, ctrl_num equal 2 represent multi-controlled gate
+        template<int ctrl_num>
+        void apply_one_targe_gate_general(vector<pos_t> const& posv, complex<double> *mat);
+        template<int ctrl_num>
+        void apply_one_targe_gate_diag(vector<pos_t> const& posv, complex<double> *mat);
+        template<int ctrl_num>
+        void apply_one_targe_gate_real(vector<pos_t> const& posv, complex<double> *mat);
+        template<int ctrl_num>
+        void apply_one_targe_gate_x(vector<pos_t> const& posv);
+
+        //Multiple-target gate
+        void apply_multi_targe_gate_general(vector<pos_t> const& posv, uint control_num, RowMatrixXcd const&mat);
+
+        // Measure and Reset
+        std::pair<uint, double> sample_measure_probs(vector<pos_t> const& qbits);
+        vector<double> probabilities() const;
+        void apply_diagonal_matrix(vector<pos_t> const& qbits, vector<std::complex<double> > const& mdiag);
+        void update(vector<pos_t> const& qbits, const uint final_state, const uint meas_state, const double meas_prob);
+        void apply_measure(vector<pos_t> const& qbits,const vector<pos_t> &cbits);
+        void apply_reset(vector<pos_t> const& qbits);
+
+        // cif check
+        bool check_cif(const vector<pos_t> &cbits, const uint condition);
+
+        complex<real_t> operator[] (size_t j) const ;
+        void set_num(uint num);
+        void set_creg(uint num){
+            if(num > 0){
+                cbit_num_ = num;
+                creg_.resize(cbit_num_, 0);
+            }else{
+                throw std::logic_error("The number of cbit must be positive.");
+            }
+        }
+
+        vector<uint> creg(){
+            return creg_;
+        }
+
+        void set_rng(){
+            std::random_device rd;
+            rng_.seed(rd());
+        }
+
+        void print_state();
+        std::tuple<std::complex<real_t>*, size_t> move_data_to_python() {
+            auto data_ptr = data_.release();
+            return std::make_tuple(std::move(data_ptr), size_);
+        }
+        
+        complex<real_t>* data(){ return data_.get(); }
+        size_t size(){ return size_; }
+        uint num(){ return num_; }
+        uint cbit_num(){ return cbit_num_; }
+};
+
+
+//////// constructors ///////
+
+template <class real_t>
+StateVector<real_t>::StateVector(uint num)
+: num_(num),
+size_(1ULL<<num)
+{   data_ = std::make_unique<complex<real_t>[]>(size_);
+    data_[0] = complex<real_t>(1., 0);
+};
+
+template <class real_t>
+StateVector<real_t>::StateVector() : StateVector(0){ }
+
+template <class real_t>
+StateVector<real_t>::StateVector(complex<real_t> *data, size_t data_size)
+:
+data_(data),
+size_(data_size)
+{   
+    num_ = static_cast<int>(std::log2(size_));
+}
+
+
+
+//// useful functions /////
+template <class real_t>
+std::complex<real_t> StateVector<real_t>::operator[] (size_t j) const{
+    return data_[j];
+}
+
+template <class real_t>
+void StateVector<real_t>::set_num(uint num){
+    if (num_ > 0) {
+        // Initialized from statevector,
+        // should not resize
+        return;
+    }
+    num_ = num;
+
+    if (size_ != 1ULL << num) {
+        data_.reset();
+        size_ = 1ULL << num;
+        data_ = std::make_unique<complex<real_t>[]>(size_);
+        data_[0] = complex<real_t>(1, 0);
+    }
+}
+template <class real_t>
+bool StateVector<real_t>::check_cif(const vector<pos_t> &cbits, const uint condition){
+    uint out = 0;
+    for(uint i = 0; i < cbits.size(); i++){
+        out *= 2;
+        out += creg_[cbits[i]];
+    }
+    return out == condition; 
+}
+
+template <class real_t>
+void StateVector<real_t>::print_state(){
+    std::cout << "state_data: ";
+    for (auto i=0;i<size_;i++){
+        std::cout << data_[i] << " ";
+    }
+    std::cout << std::endl;
+}
+
+
+////// apply gate ////////
+
+template <class real_t>
+void StateVector<real_t>::apply_x(pos_t pos){
+    const size_t offset = 1<<pos;
+    const size_t rsize = size_>>1;
+     if (pos == 0){ //single step
+#ifdef USE_SIMD
+#pragma omp parallel for
+         for(omp_i j = 0;j < size_;j+=2){
+             double* ptr = (double*)(data_.get() + j);
+            __m256d data = _mm256_loadu_pd(ptr);
+            data = _mm256_permute4x64_pd(data, 78);
+            _mm256_storeu_pd(ptr, data);
+         }
+#else
+#pragma omp parallel for
+            for(omp_i j = 0;j < size_;j+=2){
+                std::swap(data_[j], data_[j+1]);
+            }
+#endif
+     }
+     else{
+#ifdef USE_SIMD
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j += 2){
+            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
+            double* ptr0 = (double*)(data_.get()+ i);
+            double* ptr1 = (double*)(data_.get() + i + offset);
+            __m256d data0 = _mm256_loadu_pd(ptr0);
+            __m256d data1 = _mm256_loadu_pd(ptr1);
+            _mm256_storeu_pd(ptr1, data0);
+            _mm256_storeu_pd(ptr0, data1);
+        }
+#else
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j += 2){
+            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
+            size_t i1 = i+1;
+            std::swap(data_[i], data_[i+offset]);
+            std::swap(data_[i1], data_[i1+offset]);
+        }
+#endif
+     }
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_y(pos_t pos){
+    const size_t offset = 1<<pos;
+    const size_t rsize = size_>>1;
+    const complex<real_t> im = imag_I;
+     if (pos == 0){ //single step
+#ifdef USE_SIMD
+        __m256d minus_half = _mm256_set_pd(1, -1, -1, 1);
+#pragma omp parallel for
+         for(omp_i j = 0;j < size_;j+=2){
+             double* ptr = (double*)(data_.get() + j);
+            __m256d data = _mm256_loadu_pd(ptr);
+            data = _mm256_permute4x64_pd(data, 27);
+            data = _mm256_mul_pd(data, minus_half);
+            _mm256_storeu_pd(ptr, data);
+         }
+#else
+#pragma omp parallel for
+        for(omp_i j = 0;j < size_;j+=2){
+            complex<real_t> temp = data_[j];
+            data_[j] = -im*data_[j+1];
+            data_[j+1] = im*temp;
+        }
+#endif
+     }
+     else{
+#ifdef USE_SIMD
+        __m256d minus_even = _mm256_set_pd(1, -1, 1, -1);
+        __m256d minus_odd = _mm256_set_pd(-1, 1, -1, 1);
+
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j += 2){
+            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
+
+            double* ptr0 = (double*)(data_.get() + i);
+            double* ptr1 = (double*)(data_.get() + i + offset);
+            __m256d data0 = _mm256_loadu_pd(ptr0);
+            __m256d data1 = _mm256_loadu_pd(ptr1);
+            data0 = _mm256_permute_pd(data0, 5);
+            data1 = _mm256_permute_pd(data1, 5);
+            data0 = _mm256_mul_pd(data0, minus_even);
+            data1 = _mm256_mul_pd(data1, minus_odd);
+            _mm256_storeu_pd(ptr1, data0);
+            _mm256_storeu_pd(ptr0, data1);
+        }
+#else
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j += 2){
+            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
+            size_t i1 = i+1;
+            complex<real_t> temp = data_[i];
+            data_[i] = -im*data_[i+offset];
+            data_[i+offset] = im*temp;
+            complex<real_t> temp1 = data_[i1];
+            data_[i1] = -im*data_[i1+offset];
+            data_[i1+offset] = im*temp1;
+        }
+#endif
+     }
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_z(pos_t pos){
+    const size_t offset = 1<<pos;
+    const size_t rsize = size_>>1;
+    if (pos == 0){ //single step
+#pragma omp parallel for
+        for(omp_i j = 1;j < size_;j+=2){
+            data_[j] *= -1;
+        }
+     }
+     else{
+#ifdef USE_SIMD
+        __m256d minus_one = _mm256_set_pd(-1, -1, -1, -1);
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j += 2){
+            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
+            double* ptr1 = (double*)(data_.get() + i + offset);
+            __m256d data1 = _mm256_loadu_pd(ptr1);
+            data1 = _mm256_mul_pd(data1, minus_one);
+            _mm256_storeu_pd(ptr1, data1);
+        }
+#else
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j += 2){
+            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
+            data_[i+offset] *= -1;
+            data_[i+offset+1] *= -1;
+        }
+#endif
+     }
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_h(pos_t pos){
+    const double sqrt2inv = 1. / std::sqrt(2.);
+    complex<double> mat[4] = {sqrt2inv, sqrt2inv, sqrt2inv, -sqrt2inv};
+    apply_one_targe_gate_real<0>(vector<pos_t>{pos}, mat);
+
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_s(pos_t pos){
+    complex<double> mat[2] = {1., imag_I};
+    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_sdag(pos_t pos){
+    complex<double> mat[2] = {1., -imag_I};
+    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
+
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_t(pos_t pos){
+    complex<double> p = imag_I*PI/4.;
+    complex<double> mat[2] = {1., std::exp(p)};
+   apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
+
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_tdag(pos_t pos){
+    complex<double> p = -imag_I*PI/4.;
+    complex<double> mat[2] = {1., std::exp(p)};
+    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
+
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_p(pos_t pos, real_t phase){
+    complex<double> p = imag_I*phase;
+    complex<double> mat[2] = {1., std::exp(p)};
+    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
+}
+
+
+template <class real_t>
+void StateVector<real_t>::apply_rx(pos_t pos, real_t theta){
+    complex<double> mat[4] = {std::cos(theta/2), -imag_I*std::sin(theta/2), -imag_I*std::sin(theta/2), std::cos(theta/2)};
+    apply_one_targe_gate_general<0>(vector<pos_t>{pos}, mat);
+}
+
+
+template <class real_t>
+void StateVector<real_t>::apply_ry(pos_t pos, real_t theta){
+    complex<double> mat[4] = {std::cos(theta/2), -std::sin(theta/2),std::sin(theta/2), std::cos(theta/2)};
+    apply_one_targe_gate_real<0>(vector<pos_t>{pos}, mat);
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_rz(pos_t pos, real_t theta){
+    complex<double> z0 = -imag_I*theta/2.;
+    complex<double> z1 = imag_I*theta/2.;
+    complex<double> mat[2] = {std::exp(z0), std::exp(z1)};
+    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_cnot(pos_t control, pos_t targe){
+    apply_one_targe_gate_x<1>(vector<pos_t>{control, targe});
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_cz(pos_t control, pos_t targe){
+    complex<double> mat[2] = {1., -1.};
+    apply_one_targe_gate_diag<1>(vector<pos_t>{control, targe}, mat);
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_cp(pos_t control, pos_t targe, real_t phase){
+    complex<double> p = imag_I*phase;
+    complex<double> mat[2] = {1., std::exp(p)};
+    apply_one_targe_gate_diag<1>(vector<pos_t>{control, targe}, mat);
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_crx(pos_t control, pos_t targe,  real_t theta){
+    complex<double> mat[4] = {std::cos(theta/2), -imag_I*std::sin(theta/2), -imag_I*std::sin(theta/2), std::cos(theta/2)};
+
+    apply_one_targe_gate_general<1>(vector<pos_t>{control, targe}, mat);
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_cry(pos_t control, pos_t targe,  real_t theta){
+     complex<double> mat[4] = {std::cos(theta/2), -std::sin(theta/2),std::sin(theta/2), std::cos(theta/2)};
+
+    apply_one_targe_gate_real<1>(vector<pos_t>{control, targe}, mat);
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_ccx(pos_t control1, pos_t control2, pos_t targe){
+    apply_one_targe_gate_x<2>(vector<pos_t>{control1, control2, targe});
+}
+
+/////// General implementation /////////
+
+template <class real_t>
+template <int ctrl_num>
+void StateVector<real_t>::apply_one_targe_gate_general(vector<pos_t> const& posv, complex<double> *mat)
+{
+    std::function<size_t(size_t)> getind_func_near;
+    std::function<size_t(size_t)> getind_func;
+    size_t rsize;
+    size_t offset;
+    size_t targe;
+    size_t control = 0;
+    size_t setbit;
+    size_t poffset;
+    bool has_control=false;
+    vector<pos_t> posv_sorted = posv;
+    if (ctrl_num == 0){
+        targe = posv[0];
+        offset = 1ll<<targe;
+        rsize = size_>>1;
+        getind_func_near = [&](size_t j)-> size_t {
+            return 2*j;
+        };
+
+        getind_func = [&](size_t j)-> size_t {
+            return (j&(offset-1)) | (j>>targe<<targe<<1);
+        };
+
+    }
+    else if(ctrl_num == 1){
+        has_control = true;
+        control = posv[0];
+        targe = posv[1];
+        offset = 1ll<<targe;
+        setbit = 1ll<<control;
+        if (control>targe) {
+            control--;
+        }
+        poffset=1ll<<control;
+        rsize = size_>>2;
+        getind_func = [&](size_t j) -> size_t {
+            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
+            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
+            return i;
+        };
+
+        getind_func_near = getind_func;
+
+
+    }
+    else if(ctrl_num == 2){
+        has_control = true;
+        control = *min_element(posv.begin(), posv.end()-1);
+        targe = *(posv.end()-1);
+        offset = 1ll<<targe;
+        sort(posv_sorted.begin(), posv_sorted.end());
+        rsize = size_>>posv.size();
+        getind_func = [&](size_t j)-> size_t{
+            size_t i = j;
+            for (size_t k=0;k < posv.size();k++)
+            {
+                size_t _pos = posv_sorted[k];
+                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
+            }
+            for (size_t k=0;k < posv.size()-1;k++){
+                i |= 1ll<<posv[k];
+            }
+            return i;
+        };
+        getind_func_near = getind_func;
+    }
+
+    const complex<real_t> mat00 = mat[0];
+    const complex<real_t> mat01 = mat[1];
+    const complex<real_t> mat10 = mat[2];
+    const complex<real_t> mat11 = mat[3];
+    if (targe == 0){
+#pragma omp parallel for
+            for(omp_i j = 0;j < rsize;j++){
+                size_t i = getind_func_near(j);
+                complex<real_t> temp = data_[i];
+                data_[i] = mat00*data_[i] + mat01*data_[i+1];
+                data_[i+1] = mat10*temp + mat11*data_[i+1];
+            }
+    }else if (has_control && control == 0){ //single step
+#pragma omp parallel for
+            for(omp_i j = 0;j < rsize;j++){
+                size_t i = getind_func(j);
+                complex<real_t> temp = data_[i];
+                data_[i] = mat00*data_[i] + mat01*data_[i+offset];
+                data_[i+offset] = mat10*temp + mat11*data_[i+offset];
+            }
+
+    }else{//unroll to 2
+#ifdef USE_SIMD
+    __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(),  mat[0].real());
+    __m256d m_00im = _mm256_set_pd(mat[0].imag(),  -mat[0].imag(),  mat[0].imag(),  -mat[0].imag());
+    __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(),  mat[1].real(), mat[1].real());
+    __m256d m_01im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(),  mat[1].imag(), -mat[1].imag());
+
+    __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(), mat[2].real());
+    __m256d m_10im = _mm256_set_pd(mat[2].imag(),  -mat[2].imag(),mat[2].imag(), -mat[2].imag());
+    __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(), mat[3].real());
+    __m256d m_11im = _mm256_set_pd(mat[3].imag(), -mat[3].imag(), mat[3].imag(),  -mat[3].imag());
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize; j+= 2){
+            size_t i = getind_func(j);
+
+            double* p0 = (double*)(data_.get()+i);
+            double* p1 = (double*)(data_.get()+i+offset);
+            //load data
+            __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0
+            __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1
+            __m256d data0_p = _mm256_permute_pd(data0, 5);
+            __m256d data1_p = _mm256_permute_pd(data1, 5);
+
+             //row0
+            __m256d temp00re = _mm256_mul_pd(m_00re, data0);
+            __m256d temp00im = _mm256_mul_pd(m_00im, data0_p);
+            __m256d temp00 = _mm256_add_pd(temp00re, temp00im);
+            __m256d temp01re = _mm256_mul_pd(m_01re, data1);
+            __m256d temp01im = _mm256_mul_pd(m_01im, data1_p);
+            __m256d temp01 = _mm256_add_pd(temp01re, temp01im);
+            __m256d temp0 = _mm256_add_pd(temp00, temp01);
+
+            //row1
+            __m256d temp10re = _mm256_mul_pd(m_10re, data0);
+            __m256d temp10im = _mm256_mul_pd(m_10im, data0_p);
+            __m256d temp10 = _mm256_add_pd(temp10re, temp10im);
+            __m256d temp11re = _mm256_mul_pd(m_11re, data1);
+            __m256d temp11im = _mm256_mul_pd(m_11im, data1_p);
+            __m256d temp11 = _mm256_add_pd(temp11re, temp11im);
+            __m256d temp1 = _mm256_add_pd(temp10, temp11);
+
+            _mm256_storeu_pd(p0, temp0);
+            _mm256_storeu_pd(p1, temp1);
+        }
+#else
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j += 2){
+            size_t i = getind_func(j);
+            size_t i1 = i+1;
+            complex<real_t> temp = data_[i];
+            complex<real_t> temp1 = data_[i1];
+            data_[i] = mat00*data_[i] + mat01*data_[i+offset];
+            data_[i+offset] = mat10*temp + mat11*data_[i+offset];
+            data_[i1] = mat00*data_[i1] + mat01*data_[i1+offset];
+            data_[i1+offset] = mat10*temp1 + mat11*data_[i1+offset];
+        }
+#endif
+    }
+}
+
+
+template <class real_t>
+template <int ctrl_num>
+void StateVector<real_t>::apply_one_targe_gate_x(vector<pos_t> const& posv)
+{
+    std::function<size_t(size_t)> getind_func_near;
+    std::function<size_t(size_t)> getind_func;
+    size_t rsize;
+    size_t offset;
+    size_t targe;
+    size_t control;
+    size_t setbit;
+    size_t poffset;
+    vector<pos_t> posv_sorted = posv;
+    bool has_control=false;
+    if (ctrl_num == 0){
+        targe = posv[0];
+        offset = 1ll<<targe;
+        rsize = size_>>1;
+        getind_func_near = [&](size_t j)-> size_t {
+            return 2*j;
+        };
+
+        getind_func = [&](size_t j)-> size_t {
+            return (j&(offset-1)) | (j>>targe<<targe<<1);
+        };
+
+    }
+    else if(ctrl_num == 1){
+        has_control = true;
+        control = posv[0];
+        targe = posv[1];
+        offset = 1ll<<targe;
+        setbit = 1ll<<control;
+        if (control>targe) {
+            control--;
+        }
+        poffset=1ll<<control;
+        rsize = size_>>2;
+        getind_func = [&](size_t j) -> size_t {
+            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
+            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
+            return i;
+        };
+        getind_func_near = getind_func;
+    }
+    else if(ctrl_num == 2){
+        has_control = true;
+        control = *min_element(posv.begin(), posv.end()-1);
+        targe = *(posv.end()-1);
+        offset = 1ll<<targe;
+        sort(posv_sorted.begin(), posv_sorted.end());
+        rsize = size_>>posv.size();
+
+        getind_func = [&](size_t j) -> size_t{
+            size_t i = j;
+            for (size_t k=0;k < posv.size();k++){
+                size_t _pos = posv_sorted[k];
+                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
+            }
+            for (size_t k=0;k < posv.size()-1;k++){
+                i |= 1ll<<posv[k];
+            }
+            return i;
+        };
+        getind_func_near = getind_func;
+    }
+
+    if (targe == 0){
+#ifdef USE_SIMD
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j++){
+            size_t i = getind_func_near(j);
+            double* ptr = (double*)(data_.get() + i);
+            __m256d data = _mm256_loadu_pd(ptr);
+            data = _mm256_permute4x64_pd(data, 78);
+            _mm256_storeu_pd(ptr, data);
+        }
+#else
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j++){
+            size_t i = getind_func(j);
+            std::swap(data_[i], data_[i+1]);
+        }
+#endif
+    }else if (has_control && control == 0){ //single step
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j++){
+            size_t i = getind_func(j);
+            std::swap(data_[i], data_[i+offset]);
+        }
+
+    }else{//unroll to 2
+#ifdef USE_SIMD
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize; j+= 2){
+            size_t i = getind_func(j);
+            double* ptr0 = (double*)(data_.get() + i);
+            double* ptr1 = (double*)(data_.get() + i + offset);
+            __m256d data0 = _mm256_loadu_pd(ptr0);
+            __m256d data1 = _mm256_loadu_pd(ptr1);
+            _mm256_storeu_pd(ptr1, data0);
+            _mm256_storeu_pd(ptr0, data1);
+        }
+#else
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j += 2){
+            size_t i = getind_func(j);
+            size_t i1 = i+1;
+            std::swap(data_[i], data_[i+offset]);
+            std::swap(data_[i1], data_[i1+offset]);
+        }
+#endif
+    }
+}
+
+template <class real_t>
+template <int ctrl_num>
+void StateVector<real_t>::apply_one_targe_gate_real(vector<pos_t> const& posv, complex<double> *mat)
+{
+    std::function<size_t(size_t)> getind_func_near;
+    std::function<size_t(size_t)> getind_func;
+    size_t rsize;
+    size_t offset;
+    size_t targe;
+    size_t control = 0;
+    size_t setbit;
+    size_t poffset;
+    bool has_control=false;
+    vector<pos_t> posv_sorted = posv;
+    if (ctrl_num == 0){
+        targe = posv[0];
+        offset = 1ll<<targe;
+        rsize = size_>>1;
+        getind_func_near = [&](size_t j)-> size_t {
+            return 2*j;
+        };
+
+        getind_func = [&](size_t j)-> size_t {
+            return (j&(offset-1)) | (j>>targe<<targe<<1);
+        };
+
+    }
+    else if(ctrl_num == 1){
+        has_control = true;
+        control = posv[0];
+        targe = posv[1];
+        offset = 1ll<<targe;
+        setbit = 1ll<<control;
+        if (control>targe) {
+            control--;
+        }
+        poffset=1ll<<control;
+        rsize = size_>>2;
+        getind_func = [&](size_t j) -> size_t {
+            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
+            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
+            return i;
+        };
+
+        getind_func_near = getind_func;
+    }
+    else if(ctrl_num == 2){
+        has_control = true;
+        control = *min_element(posv.begin(), posv.end()-1);
+        targe = *(posv.end()-1);
+        offset = 1ll<<targe;
+        sort(posv_sorted.begin(), posv_sorted.end());
+        rsize = size_>>posv.size();
+        getind_func = [&](size_t j)-> size_t{
+            size_t i = j;
+            for (size_t k=0;k < posv.size();k++){
+                size_t _pos = posv_sorted[k];
+                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
+            }
+            for (size_t k=0;k < posv.size()-1;k++){
+                i |= 1ll<<posv[k];
+            }
+            return i;
+        };
+        getind_func_near = getind_func;
+    }
+
+    const double mat00 = mat[0].real();
+    const double mat01 = mat[1].real();
+    const double mat10 = mat[2].real();
+    const double mat11 = mat[3].real();
+    if (targe == 0){
+#pragma omp parallel for
+            for(omp_i j = 0;j < rsize;j++){
+                size_t i = getind_func_near(j);
+                complex<real_t> temp = data_[i];
+                data_[i] = mat00*data_[i] + mat01*data_[i+1];
+                data_[i+1] = mat10*temp + mat11*data_[i+1];
+            }
+
+    }else if (has_control && control == 0){ //single step
+
+#pragma omp parallel for
+            for(omp_i j = 0;j < rsize;j++){
+                size_t i = getind_func(j);
+                complex<real_t> temp = data_[i];
+                data_[i] = mat00*data_[i] + mat01*data_[i+offset];
+                data_[i+offset] = mat10*temp + mat11*data_[i+offset];
+            }
+    }else{//unroll to 2
+#ifdef USE_SIMD
+    __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(),  mat[0].real());
+    __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(),  mat[1].real(), mat[1].real());
+    __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(), mat[2].real());
+    __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(), mat[3].real());
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize; j+= 2){
+            size_t i = getind_func(j);
+
+            double* p0 = (double*)(data_.get()+i);
+            double* p1 = (double*)(data_.get()+i+offset);
+             //load data
+            __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0
+            __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1
+            __m256d data0_p = _mm256_permute_pd(data0, 5);
+            __m256d data1_p = _mm256_permute_pd(data1, 5);
+
+                //row0
+            __m256d temp00re = _mm256_mul_pd(m_00re, data0);
+            __m256d temp01re = _mm256_mul_pd(m_01re, data1);
+            __m256d temp0 = _mm256_add_pd(temp00re, temp01re);
+
+            //row1
+            __m256d temp10re = _mm256_mul_pd(m_10re, data0);
+            __m256d temp11re = _mm256_mul_pd(m_11re, data1);
+            __m256d temp1 = _mm256_add_pd(temp10re, temp11re);
+
+            _mm256_storeu_pd(p0, temp0);
+            _mm256_storeu_pd(p1, temp1);
+        }
+#else
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j += 2){
+            size_t i = getind_func(j);
+            size_t i1 = i+1;
+            complex<real_t> temp = data_[i];
+            complex<real_t> temp1 = data_[i1];
+            data_[i] = mat00*data_[i] + mat01*data_[i+offset];
+            data_[i+offset] = mat10*temp + mat11*data_[i+offset];
+            data_[i1] = mat00*data_[i1] + mat01*data_[i1+offset];
+            data_[i1+offset] = mat10*temp1 + mat11*data_[i1+offset];
+        }
+#endif
+    }
+}
+
+
+template <class real_t>
+template <int ctrl_num>
+void StateVector<real_t>::apply_one_targe_gate_diag(vector<pos_t> const& posv, complex<double> *mat)
+{
+    std::function<size_t(size_t)> getind_func_near;
+    std::function<size_t(size_t)> getind_func;
+    size_t rsize;
+    size_t offset;
+    size_t targe;
+    size_t control = 0;
+    size_t setbit;
+    size_t poffset;
+    bool has_control=false;
+    vector<pos_t> posv_sorted = posv;
+    if (ctrl_num == 0){
+        targe = posv[0];
+        offset = 1ll<<targe;
+        rsize = size_>>1;
+        getind_func_near = [&](size_t j)-> size_t {
+            return 2*j;
+        };
+
+        getind_func = [&](size_t j)-> size_t {
+            return (j&(offset-1)) | (j>>targe<<targe<<1);
+        };
+
+    }
+    else if(ctrl_num == 1){
+
+        has_control = true;
+        control = posv[0];
+        targe = posv[1];
+        offset = 1ll<<targe;
+        setbit = 1ll<<control;
+        if (control>targe) {
+            control--;
+        }
+        poffset=1ll<<control;
+        rsize = size_>>2;
+        getind_func = [&](size_t j) -> size_t {
+            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
+            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
+            return i;
+        };
+
+        getind_func_near = getind_func;
+
+    }
+    else if(ctrl_num == 2){
+        has_control = true;
+        control = *min_element(posv.begin(), posv.end()-1);
+        targe = *(posv.end()-1);
+        offset = 1ll<<targe;
+        sort(posv_sorted.begin(), posv_sorted.end());
+        rsize = size_>>posv.size();
+        getind_func = [&](size_t j)-> size_t
+        {
+            size_t i = j;
+            for (size_t k=0;k < posv.size();k++){
+                size_t _pos = posv_sorted[k];
+                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
+            }
+            for (size_t k=0;k < posv.size()-1;k++){
+                i |= 1ll<<posv[k];
+            }
+            return i;
+        };
+        getind_func_near = getind_func;
+    }
+
+    if (targe == 0){
+#pragma omp parallel for
+            for(omp_i j = 0;j < rsize;j++){
+                size_t i = getind_func_near(j);
+                data_[i] *= mat[0];
+                data_[i+1] *= mat[1];
+            }
+
+    }else if (has_control && control == 0){ //single step
+
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j++){
+            size_t i = getind_func(j);
+            complex<real_t> temp = data_[i];
+            data_[i] *= mat[0];
+            data_[i+offset] *= mat[1];
+        }
+
+    }else{//unroll to 2
+#ifdef USE_SIMD
+     __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(),  mat[0].real());
+    __m256d m_00im = _mm256_set_pd(mat[0].imag(),  -mat[0].imag(),  mat[0].imag(),  -mat[0].imag());
+    __m256d m_11re = _mm256_set_pd(mat[1].real(), mat[1].real(),  mat[1].real(), mat[1].real());
+    __m256d m_11im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(),  mat[1].imag(), -mat[1].imag());
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize; j+= 2){
+            size_t i = getind_func(j);
+
+            double* p0 = (double*)(data_.get()+i);
+            double* p1 = (double*)(data_.get()+i+offset);
+
+            //load data
+            __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0
+            __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1
+            __m256d data0_p = _mm256_permute_pd(data0, 5);
+            __m256d data1_p = _mm256_permute_pd(data1, 5);
+
+             //row0
+            __m256d temp00re = _mm256_mul_pd(m_00re, data0);
+            __m256d temp00im = _mm256_mul_pd(m_00im, data0_p);
+            __m256d temp00 = _mm256_add_pd(temp00re, temp00im);
+
+            //row1
+            __m256d temp11re = _mm256_mul_pd(m_11re, data1);
+            __m256d temp11im = _mm256_mul_pd(m_11im, data1_p);
+            __m256d temp11 = _mm256_add_pd(temp11re, temp11im);
+
+            _mm256_storeu_pd(p0, temp00);
+            _mm256_storeu_pd(p1, temp11);
+        }
+#else
+#pragma omp parallel for
+        for(omp_i j = 0;j < rsize;j += 2){
+            size_t i = getind_func(j);
+            size_t i1 = i+1;
+            data_[i] *= mat[0];
+            data_[i+offset] *= mat[1];
+            data_[i1] *= mat[0];
+            data_[i1+offset] *= mat[1];
+        }
+#endif
+    }
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_multi_targe_gate_general(vector<pos_t> const& posv, uint control_num, RowMatrixXcd const& mat)
+{
+    auto posv_sorted = posv;
+    auto targs = vector<pos_t>(posv.begin()+control_num, posv.end());
+    sort(posv_sorted.begin(), posv_sorted.end());
+    size_t rsize = size_ >> posv.size();
+    uint targe_num = targs.size();
+    size_t matsize= 1<< targe_num;
+    std::vector<uint> targ_mask(matsize);
+    //create target mask
+    for (size_t m = 0; m < matsize;m++){
+        for (size_t j = 0; j < targe_num; j++){
+            if ((m>>j)&1){
+                auto mask_pos = targs[j];
+                targ_mask[m] |= 1ll<<mask_pos;
+            }
+        }
+    }
+
+    //apply matrix
+//TODO: Disalbe Parallel when matsize is very large
+#pragma omp parallel for
+    for (omp_i j = 0;j < rsize;j++){
+        size_t i = j;
+        // Insert zeros
+        for(size_t k=0;k < posv.size();k++){
+            size_t _pos = posv_sorted[k];
+            i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
+        }
+        // Set control
+        for (size_t k=0; k < control_num;k++){
+            i |= 1ll<<posv[k];
+        }
+
+        //load block vector
+        Eigen::VectorXcd vec_block(matsize);
+        for (size_t m = 0; m < matsize;m++){
+            vec_block(m) = data_[i | targ_mask[m]];
+            auto ele = vec_block(m);
+        }
+
+        //Eigen matrix multiply
+        vec_block = mat * vec_block;
+
+        //write back
+        for (size_t m = 0; m < matsize;m++){
+            data_[i | targ_mask[m]] = vec_block(m);
+        }
+    }
+}
+
+
+uint index0(vector<pos_t> const& qubits_sorted, const uint k) {
+    uint lowbits, retval = k;
+    for (size_t j = 0; j < qubits_sorted.size(); j++) {
+        lowbits = retval & ((1LL << qubits_sorted[j]) -1);
+        retval >>= qubits_sorted[j];
+        retval <<= qubits_sorted[j] + 1;
+        retval |= lowbits;
+    }
+    return retval;
+}
+
+using indexes_t = vector<uint>;
+inline indexes_t indexes(vector<pos_t> const& qbits, vector<pos_t> const& qubits_sorted, const uint k) {
+  const auto N = qubits_sorted.size();
+  indexes_t ret(1LL << N, 0);
+  // Get index0
+  ret[0] = index0(qubits_sorted, k);
+  for (size_t i = 0; i < N; i++) {
+    const auto n = 1LL << i;
+    const auto bit = 1ll << qbits[i];
+    for (size_t j = 0; j < n; j++)
+      ret[n + j] = ret[j] | bit;
+  }
+  return ret;
+}
+
+template <class real_t>
+vector<double> StateVector<real_t>::probabilities() const {
+    const int len = 1LL << num_;
+    vector<double> probs(len, 0.);
+#pragma omp parallel for
+    for (int j = 0; j < len; j++) {
+        probs[j] = std::real(data_[j] * std::conj(data_[j]));
+    }
+    return probs;
+}
+
+vector<std::complex<double>> convert(const vector<std::complex<double>> &v){
+    vector<std::complex<double>> ret(v.size(), 0.);
+    for (size_t i = 0; i< v.size(); ++i)
+        ret[i] = v[i];
+    return ret;
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_diagonal_matrix(vector<pos_t> const&qbits, vector<std::complex<double> > const&diag){
+    // just one qubit
+    if(qbits.size() == 1){
+        const uint qubit = qbits[0];
+        vector<pos_t> qbit0{qubit};
+        if (diag[0] == 1.0) { // [[1, 0], [0, z]] matrix
+            if (diag[1] == 1.0)
+                return; // Identity
+            if (diag[1] == std::complex<double>(0., -1.)) { // [[1, 0], [0, -i]]
+                auto func = [&](const indexes_t &inds) -> void {
+                    const auto k = inds[1];
+                    double cache = data_[k].imag();
+                    data_[k].imag(data_[k].real() * -1.);
+                    data_[k].real(cache);
+                };
+#pragma omp parallel for
+                for (int k = 0; k < (size_ >> 1); k+=1){
+                    const auto inds = indexes(qbit0, qbit0, k);
+                    func(inds);
+                }
+                return;
+            }
+            if (diag[1] == std::complex<double>(0., 1.)) {
+            // [[1, 0], [0, i]]
+                auto func = [&](const indexes_t &inds) -> void {
+                    const auto k = inds[1];
+                    double cache = data_[k].imag();
+                    data_[k].imag(data_[k].real());
+                    data_[k].real(cache * -1.);
+                };
+#pragma omp parallel for
+                for (int k = 0; k < (size_ >> 1); k+=1){
+                    const auto inds = indexes(qbit0, qbit0, k);
+                    func(inds);
+                }
+                return;
+            }
+            if (diag[0] == 0.0) {
+            // [[1, 0], [0, 0]]
+                auto func = [&](const indexes_t &inds) -> void {
+                    data_[inds[1]] = 0.0;
+                };
+#pragma omp parallel for
+                for (int k = 0; k < (size_ >> 1); k+=1){
+                    const auto inds = indexes(qbit0, qbit0, k);
+                    func(inds);
+                }
+                return;
+            }
+            // general [[1, 0], [0, z]]
+            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
+                const auto k = inds[1];
+                data_[k] *= _mat[1];
+            };
+#pragma omp parallel for
+            for (int k = 0; k < (size_ >> 1); k+=1){
+                const auto inds = indexes(qbit0, qbit0, k);
+                func(inds, convert(diag));
+            }
+            return;
+        } else if (diag[1] == 1.0) {
+            // [[z, 0], [0, 1]] matrix
+            if (diag[0] == std::complex<double>(0., -1.)) {
+                // [[-i, 0], [0, 1]]
+                auto func = [&](const indexes_t &inds) -> void {
+                    const auto k = inds[1];
+                    double cache = data_[k].imag();
+                    data_[k].imag(data_[k].real() * -1.);
+                    data_[k].real(cache);
+                };
+#pragma omp parallel for
+                for (int k = 0; k < (size_ >> 1); k+=1){
+                    const auto inds = indexes(qbit0, qbit0, k);
+                    func(inds);
+                }
+                return;
+            }
+            if (diag[0] == std::complex<double>(0., 1.)) {
+                // [[i, 0], [0, 1]]
+                auto func = [&](const indexes_t &inds) -> void {
+                    const auto k = inds[1];
+                    double cache = data_[k].imag();
+                    data_[k].imag(data_[k].real());
+                    data_[k].real(cache * -1.);
+                };
+#pragma omp parallel for
+                for (int k = 0; k < (size_ >> 1); k+=1){
+                    const auto inds = indexes(qbit0, qbit0, k);
+                    func(inds);
+                }
+                return;
+            }
+            if (diag[0] == 0.0) {
+                // [[0, 0], [0, 1]]
+                auto func = [&](const indexes_t &inds) -> void {
+                    data_[inds[0]] = 0.0;
+                };
+#pragma omp parallel for
+                for (int k = 0; k < (size_ >> 1); k+=1){
+                    const auto inds = indexes(qbit0, qbit0, k);
+                    func(inds);
+                }                
+                return;
+            }
+            // general [[z, 0], [0, 1]]
+            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
+                const auto k = inds[0];
+                data_[k] *= _mat[0];
+            };
+#pragma omp parallel for
+            for (int k = 0; k < (size_ >> 1); k+=1){
+                const auto inds = indexes(qbit0, qbit0, k);
+                func(inds, convert(diag));
+            }   
+            return;
+        } else {
+            // Lambda function for diagonal matrix multiplication
+            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
+                const auto k0 = inds[0];
+                const auto k1 = inds[1];
+                data_[k0] *= _mat[0];
+                data_[k1] *= _mat[1];
+            };
+#pragma omp parallel for
+            for (int k = 0; k < (size_ >> 1); k+=1){
+                const auto inds = indexes(qbit0, qbit0, k);
+                func(inds, convert(diag));
+            }  
+        }
+        return;
+    }
+    const uint N = qbits.size();
+    auto func = [&](const indexes_t &inds, const vector<std::complex<double>> &_diag) -> void { 
+        for(int i=0; i<2; ++i){
+            const int k = inds[i];
+            int iv = 0;
+            for(int j=0;j<N;j++){
+                if((k & (1ULL << qbits[j])) !=0 )
+                    iv += (1ULL << j);
+            }
+            if(_diag[iv] != (double)1.0)
+                data_[k] *= _diag[iv];
+        }
+    };
+    // apply func
+    vector<pos_t> qbit0{qbits[0]};
+#pragma omp parallel for
+    for (int k = 0; k < (size_ >> 1); k+=1){
+        const auto inds = indexes(qbit0, qbit0, k);
+        func(inds, convert(diag));
+    }
+}
+
+template <class real_t>
+void StateVector<real_t>::update(vector<pos_t> const& qbits, const uint final_state, const uint meas_state, const double meas_prob){
+    const uint dim = 1ULL << qbits.size();
+    vector<std::complex<double> >  matdiag(dim, 0.);
+    matdiag[meas_state] = 1./ std::sqrt(meas_prob);
+    apply_diagonal_matrix(qbits, matdiag);
+    
+    //TODO: Add reset
+    // for reset
+     if(final_state != meas_state){
+        if(qbits.size() == 1){
+            // apply a x gate
+            apply_x(qbits[0]);
+        }else{
+            // Diagonal matrix for projecting and renormalizing to measurement outcome
+            vector<std::complex<double> > perm(dim*dim, 0.);
+            perm[final_state * dim + meas_state] = 1.;
+            perm[meas_state * dim + final_state] = 1.;
+            for (uint j = 0; j < dim; j++){
+                if(j != final_state && j != meas_state) 
+                    perm[j * dim + j] = 1.;
+            }
+            // apply permutation to swap state
+            const uint N = qbits.size();
+            const uint DIM = 1ULL << N;
+            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
+                // std::array<std::complex<double>, 1ULL << N > cache;
+                vector<std::complex<double> > cache(1ULL << N, 0.);
+                for(uint i = 0; i< DIM; i++){
+                    const auto ii = inds[i];
+                    cache[i] = data_[ii];
+                    data_[ii] = 0.;
+                }
+                for(uint i=0; i< DIM; i++)
+                    for(uint j=0; j<DIM; j++)
+                        data_[inds[i]] += _mat[i + DIM * j] * cache[j];
+            };
+            vector<pos_t> qs(qbits.begin(), qbits.end());
+            vector<pos_t> qs_sorted(qs.begin(), qs.end());
+            std::sort(qs_sorted.begin(), qs_sorted.end());
+            uint END = size_ >> qs.size();
+#pragma omp parallel for
+            for (int k = 0; k < END; k+=1){
+                const auto inds = indexes(qs, qs_sorted, k);
+                func(inds, convert(perm));
+            }  
+        }
+    }
+}
+
+template <typename T>
+void printVector(const std::vector<T>& vec) {
+    for (const T& element : vec) {
+        std::cout << element << " ";
+    }
+    std::cout << std::endl;
+}
+
+template <class real_t>
+std::pair<uint, double> StateVector<real_t>::sample_measure_probs(vector<pos_t> const& qbits){
+    // 1. caculate actual measurement outcome
+    const int64_t N = qbits.size();
+    const int64_t DIM = 1LL << N;
+    const int64_t END = 1LL << (num_ - N);
+    vector<double> probs(DIM, 0.);
+    vector<uint> qubits_sorted(qbits.begin(), qbits.end());
+    
+    std::sort(qubits_sorted.begin(), qubits_sorted.end());
+    if ((num_ == N) && ( qubits_sorted == qbits )){
+        probs = probabilities();
+    }else{
+        vector<double> probs_private(DIM, 0.);
+#pragma omp parallel for
+        for (int64_t k = 0; k < END; k++){
+            auto idx = indexes(qbits, qubits_sorted, k);
+            // std::cout<<"indexes"<<k<<": ";
+            // printVector(idx);
+            for(int64_t m = 0; m < DIM; ++m){
+                double local_prob = std::real(data_[idx[m]] * std::conj(data_[idx[m]]));
+                #pragma omp critical
+                probs_private[m] += local_prob;
+            }
+        }
+        // std::cout<<"probs_private:";
+        // printVector(probs_private);
+        #pragma omp critical
+        for(int64_t m = 0; m < DIM; ++m){
+            probs[m] += probs_private[m];
+        }
+    }
+    set_rng();
+    // std::cout<<"probs:";
+    // printVector(probs);
+    uint outcome = std::discrete_distribution<uint>(probs.begin(), probs.end())(rng_);
+    return std::make_pair(outcome, probs[outcome]);
+}
+
+// change to bit endian
+vector<uint> int2vec(uint n, uint base){
+    vector<uint> ret;
+    while(n >= base){
+        ret.push_back(n%base);
+        n /= base;
+    }
+    ret.push_back(n);
+    return ret;
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_measure(vector<pos_t> const& qbits, vector<pos_t> const& cbits){
+    // 1. caculate actual measurement outcome
+    const auto meas = sample_measure_probs(qbits);
+    //2. update statevector
+    update(qbits, meas.first, meas.first, meas.second);
+    //3. store measure
+    vector<uint> outcome = int2vec(meas.first, 2);
+    if(outcome.size() < qbits.size()){
+        outcome.resize(qbits.size());
+    }
+    for(uint j=0; j < outcome.size(); j++){
+        creg_[cbits[j]] = outcome[j];
+    }
+}
+
+template <class real_t>
+void StateVector<real_t>::apply_reset(vector<pos_t> const& qbits){
+    const auto meas = sample_measure_probs(qbits);
+    update(qbits, 0, meas.first, meas.second);
+}
\ No newline at end of file
diff --git a/src/qfvm/types.hpp b/src/qfvm/types.hpp
new file mode 100644
index 0000000..5ece0f7
--- /dev/null
+++ b/src/qfvm/types.hpp
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <complex>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <tuple>
+#include <Eigen/Core>
+
+#ifdef _MSC_VER
+using omp_i = signed long long;
+#else
+using omp_i = size_t;
+#endif
+
+#ifdef _MSC_VER
+#else
+#ifndef __AVX2__
+#undef USE_SIMD
+#endif
+#endif
+
+
+typedef unsigned int uint;
+using pos_t = uint;
+using data_t = double;
+using std::complex;
+using std::vector;
+using std::string;
+using RowMatrixXcd = Eigen::Matrix<complex<double>, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+
+const complex<double> imag_I = complex<double>(0, 1.);
+const double PI = 3.14159265358979323846;
diff --git a/src/qfvm/util.h b/src/qfvm/util.h
new file mode 100644
index 0000000..950b48e
--- /dev/null
+++ b/src/qfvm/util.h
@@ -0,0 +1,99 @@
+#pragma once
+
+#include <iostream>
+#include <vector>
+#include <time.h>
+#include <regex>
+#include <type_traits>
+#include <bitset>
+
+namespace Qfutil{
+
+std::vector<int> randomArr(size_t length, size_t max){
+    srand((unsigned)time(NULL));
+    std::vector<int> arr(length);
+    for (size_t i=0;i < arr.size();i++){
+        arr[i] = rand()%max;
+    }
+    return arr;
+}
+
+int randomint(int min, int max){
+    srand((unsigned)time(NULL));
+    return (rand()%(max - min + 1)) + min;
+}
+
+static uint32_t randomize(uint32_t i){
+    i = (i^61)^(i>>16);
+    i *=9;
+    i ^= i<<4;
+    i *= 0x27d4eb2d;
+    i ^= i >> 15;
+    return i;
+}
+
+template<class T>
+void printVector(std::vector<T> const &arr){
+    for (auto i : arr){
+        std::cout << i << " ";
+    }
+    std::cout << std::endl;
+}
+
+
+std::vector<std::string> split_string(const std::string& str, char delim) {
+    std::vector<std::string> elems;
+    auto lastPos = str.find_first_not_of(delim, 0);
+    auto pos = str.find_first_of(delim, lastPos);
+    while (pos != std::string::npos || lastPos != std::string::npos) {
+        elems.push_back(str.substr(lastPos, pos - lastPos));
+        lastPos = str.find_first_not_of(delim, pos);
+        pos = str.find_first_of(delim, lastPos);
+    }
+    return elems;
+}
+
+std::vector<std::string> split_string(const std::string& str, char delim, uint num){
+    auto end = str.length();
+    std::vector<std::string> elems;
+    auto lastPos = str.find_first_not_of(delim, 0);
+    auto pos = str.find_first_of(delim, lastPos);
+    while ((pos != std::string::npos || lastPos != std::string::npos) && elems.size() < num) {
+        elems.push_back(str.substr(lastPos, pos - lastPos));
+        lastPos = str.find_first_not_of(delim, pos);
+        pos = str.find_first_of(delim, lastPos);
+    }
+
+    if ((pos != std::string::npos || lastPos != std::string::npos)){
+        elems.push_back(str.substr(lastPos, end));
+    }
+    return elems;
+}
+
+template<class real_t>
+std::vector<real_t> find_numbers(const std::string &str){
+    std::smatch matchs;
+    std::vector<real_t> res;
+    std::regex pattern;
+    if (std::is_unsigned<real_t>::value){
+        pattern = std::regex("\\d+");
+    }else if(std::is_floating_point<real_t>::value){
+        pattern = std::regex("-?(([1-9]\\d*\\.\\d*)|(0\\.\\d*[1-9]\\d*))|\\d+");
+    }
+
+    auto begin = std::sregex_iterator(str.begin(), str.end(), pattern);
+    const std::sregex_iterator end;
+
+    for (std::sregex_iterator i = begin; i != end; ++i) {
+        std::string match_str = i->str();
+         if (std::is_unsigned<real_t>::value){
+            res.push_back(std::stoi(match_str));
+        }else if (std::is_floating_point<real_t>::value){
+            res.push_back(std::stod(match_str));
+        }
+    }
+
+    return res;
+}
+
+}//namespace Qfutil
\ No newline at end of file
diff --git a/src/qfvm_gpu/apply_gate_custate.cuh b/src/qfvm_gpu/apply_gate_custate.cuh
new file mode 100644
index 0000000..2d9c2e7
--- /dev/null
+++ b/src/qfvm_gpu/apply_gate_custate.cuh
@@ -0,0 +1,46 @@
+
+#pragma once
+#include <custatevec.h>
+#include <helper_custatevec.hpp>
+
+
+void apply_gate_custate(cuDoubleComplex *psi_d, QuantumOperator &op, int n)
+{   
+         
+    //get information form op
+    auto pos = op.positions();
+    const int nTargets  = op.targe_num();
+    const int nControls = op.control_num();
+    const int adjoint   = 0;
+
+    vector<int> targets{pos.begin()+nControls, pos.end()};
+    vector<int> controls{pos.begin(), pos.begin()+nControls};
+
+    auto mat_temp = op.mat();
+    cuDoubleComplex *mat = 
+    reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
+
+    // custatevec handle initialization
+    custatevecHandle_t handle;
+    custatevecCreate(&handle) ;
+    void* extraWorkspace = nullptr;
+    size_t extraWorkspaceSizeInBytes = 0;
+
+    // check the size of external workspace
+    custatevecApplyMatrixGetWorkspaceSize(
+                handle, CUDA_C_64F, n, mat, CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW,
+                adjoint, nTargets, nControls, CUSTATEVEC_COMPUTE_64F, &extraWorkspaceSizeInBytes) ;
+
+    // allocate external workspace if necessary
+    if (extraWorkspaceSizeInBytes > 0)
+        cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes);
+
+    custatevecApplyMatrix(
+                    handle, psi_d, CUDA_C_64F, n, mat, CUDA_C_64F,
+                    CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, targets.data(), nTargets, controls.data(), nullptr,
+                    nControls, CUSTATEVEC_COMPUTE_64F, extraWorkspace, extraWorkspaceSizeInBytes);
+
+                      // destroy handle
+    custatevecDestroy(handle);    
+}
+    
\ No newline at end of file
diff --git a/src/qfvm_gpu/apply_gate_gpu.cuh b/src/qfvm_gpu/apply_gate_gpu.cuh
new file mode 100644
index 0000000..e2d7be0
--- /dev/null
+++ b/src/qfvm_gpu/apply_gate_gpu.cuh
@@ -0,0 +1,411 @@
+#pragma once
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+#include <types.hpp>
+#include <algorithm>
+#include <util.h>
+
+struct targeIndex
+{
+    size_t ind0;
+    size_t ind1;
+};
+
+
+__constant__ uint posv_d[50];
+__constant__ uint posv_sorted_d[50];
+__constant__ cuDoubleComplex mat_d_const[32*32]; //If target qubit < 5, use const memory; 
+__constant__ uint mat_mask_d_const[32];
+
+
+//-------------Single-target gate-------------------------------
+template<class Func>
+__global__ void apply_one_targe_gate_kernel(cuDoubleComplex *psi_d, Func get_index, int rsize){
+    cuDoubleComplex mat00 = mat_d_const[0];
+    cuDoubleComplex mat01 = mat_d_const[1];
+    cuDoubleComplex mat10 = mat_d_const[2];
+    cuDoubleComplex mat11 = mat_d_const[3];
+
+    unsigned int gridSize = blockDim.x*gridDim.x;
+    for (int j = blockDim.x * blockIdx.x + threadIdx.x; j < rsize;j+= gridSize){
+        targeIndex ind = get_index(j);
+        cuDoubleComplex temp = psi_d[ind.ind0];
+        psi_d[ind.ind0] = cuCadd(cuCmul(mat00, psi_d[ind.ind0]), cuCmul(mat01, psi_d[ind.ind1]));
+        psi_d[ind.ind1] = cuCadd(cuCmul(mat10, temp), cuCmul(mat11, psi_d[ind.ind1]));
+    }
+}
+
+template<int ctrl_num>
+void apply_one_targe_gate_gpu(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){ 
+    //copy mat to device
+    auto mat_temp = op.mat();
+    cuDoubleComplex *mat = 
+    reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
+    checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, 4*sizeof(cuDoubleComplex)));
+    size_t rsize;
+    size_t offset;
+    size_t targe;
+    size_t control;
+    size_t setbit;
+    size_t poffset;
+    if (ctrl_num == 0){
+        targe = op.positions()[0];
+        offset = 1ll<<targe;
+        rsize = size>>1;
+        auto getind_func = [offset, targe] __device__ (size_t j)-> targeIndex {
+            size_t ind0 = (j&(offset-1)) | (j>>targe<<targe<<1);
+            size_t ind1 = ind0 + offset;
+            return {ind0, ind1};
+        };
+
+        size_t blockdim = rsize <= 1024 ? rsize : 1024;
+        size_t griddim = rsize / blockdim;
+        apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d,  getind_func, rsize);
+    }
+    else if(ctrl_num == 1){
+        control = op.positions()[0];
+        targe = op.positions()[1];
+        offset = 1ll<<targe;
+        setbit = 1ll<<control;
+        if (control>targe) {
+            control--;
+        }
+        poffset=1ll<<control;
+        rsize = size>>2;
+        auto getind_func = [control, targe, poffset, offset, setbit] __device__ (size_t j) -> targeIndex {
+            size_t ind0 = (j>>control<<(control+1))|(j&(poffset-1));
+            ind0 = (ind0 >> targe << (targe+1))|(ind0 &(offset-1))|setbit;
+            size_t ind1 = ind0 + offset;
+            return {ind0, ind1};
+        };
+        
+
+        size_t blockdim = rsize <= 1024 ? rsize : 1024;
+        size_t griddim = rsize / blockdim;
+       
+        apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d,  getind_func, rsize);      
+    }
+    else if(ctrl_num == 2){    
+        targe = op.positions().back();
+        offset = 1ll<<targe;
+        uint psize = op.positions().size();
+        rsize = size>>psize;
+
+        vector<pos_t> posv_sorted = op.positions();
+        std::sort(posv_sorted.begin(), posv_sorted.end());
+        //Copy pos to device
+        checkCudaErrors(cudaMemcpyToSymbol(posv_d, op.positions().data(), psize*sizeof(uint)));
+        checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
+
+        auto getind_func = [offset, psize] __device__ (size_t j)-> targeIndex{
+            size_t ind0 = j;
+            for (pos_t k=0;k < psize;k++)
+            {   
+                pos_t _pos = posv_sorted_d[k];
+                ind0 = (ind0&((1ll<<_pos)-1)) | (ind0>>_pos<<_pos<<1);
+            }
+            for (pos_t k=0;k < psize-1;k++){
+                ind0 |= 1ll<<posv_d[k];
+            }
+
+            size_t ind1 = ind0 + offset;
+            return {ind0, ind1};
+        };
+        size_t blockdim = rsize <= 1024 ? rsize : 1024;
+        size_t griddim = rsize / blockdim;
+        apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d, getind_func, rsize);
+    } 
+}
+
+template<int targe_num>
+__global__ void apply_2to4_targe_gate_kernel(cuDoubleComplex *psi_d,  uint ctrlnum, int psize){
+    constexpr uint matlen = 1<<targe_num;
+    uint block_length = blockDim.x;
+    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+    // Insert zeros
+    for(size_t k=0;k < psize;k++){
+        size_t _pos = posv_sorted_d[k];
+        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
+    }
+    // Set control
+    for (size_t k=0; k < ctrlnum;k++){
+        i |= 1ll<<posv_d[k];
+    }
+
+    cuDoubleComplex psi_d_buffer[matlen];
+    for (int y = 0; y < matlen;++y){
+        psi_d_buffer[y] = {0., 0.};
+        for (int x = 0; x < matlen;++x){
+            psi_d_buffer[y] = cuCadd(psi_d_buffer[y], cuCmul(psi_d[i | mat_mask_d_const[x]], mat_d_const[y*matlen+x]));
+        }
+    }
+    for (int y = 0; y < matlen;++y) psi_d[i | mat_mask_d_const[y]] = psi_d_buffer[y];
+}
+
+template<int targe_num>
+void apply_2to4_targe_gate_gpu_const(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){
+    // uint targe_num = op.targe_num();
+    uint matlen = 1<<targe_num;
+    auto pos = op.positions();
+    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
+    vector<uint> targ_mask(matlen);
+    //create target mask
+    for (size_t m = 0; m < matlen;m++){
+        for (size_t j = 0; j < targe_num; j++){
+            if ((m>>j)&1){
+                auto mask_pos = targs[j];
+                targ_mask[m] |= 1ll<<mask_pos;
+            }
+        }
+    }
+
+    vector<pos_t> posv_sorted = op.positions();
+    uint psize = pos.size();
+    std::sort(posv_sorted.begin(),posv_sorted.end());
+    //Copy pos to device
+    checkCudaErrors(cudaMemcpyToSymbol(posv_d,  pos.data(), psize*sizeof(uint)));
+    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
+
+    //copy mat to const memory
+    auto mat_temp = op.mat();
+    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
+
+    checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, matlen*matlen*sizeof(cuDoubleComplex)));
+    checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(), matlen*sizeof(uint)));
+    size_t rsize = size>>psize;
+    
+    uint max_thread_num = targe_num < 4 ? 1024 : 512;
+    size_t blockdim = rsize <= max_thread_num ? rsize : max_thread_num;
+    size_t griddim = rsize / blockdim;
+    apply_2to4_targe_gate_kernel<targe_num><<<griddim, blockdim>>>(psi_d, op.control_num(), psize);
+}
+
+
+
+// ------------Large target number gate---------------
+
+__global__ void apply_5_targe_gate_kernel_const(cuDoubleComplex *psi_d, uint ctrlnum, int psize, size_t size){
+    uint rsize = size>>psize;
+    uint targnum = psize-ctrlnum;
+    uint matlen = (1<<targnum);
+    uint block_length = blockDim.x;
+    size_t b = blockIdx.x; // < rsize
+    int idx = threadIdx.x;// 
+    int idy = threadIdx.y;// 
+    size_t i = b;
+    // Insert zeros
+    for(size_t k=0;k < psize;k++){
+        size_t _pos = posv_sorted_d[k];
+        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
+    }
+    // Set control
+    for (size_t k=0; k < ctrlnum;k++){
+        i |= 1ll<<posv_d[k];
+    }
+
+    __syncthreads();
+    cuDoubleComplex v;
+    v = cuCmul(psi_d[ i | mat_mask_d_const[idx]], mat_d_const[idy*matlen+idx]);
+    for (int offset = block_length>>1;offset > 0;offset >>=1){
+        v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
+        v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
+    }
+    __syncthreads(); 
+    if (!idx) psi_d[ i | mat_mask_d_const[idy]] = v;
+}
+
+
+void apply_5_targe_gate_gpu_const(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){
+    uint targe_num = op.targe_num();
+    uint matlen = 1<<targe_num;
+    auto pos = op.positions();
+    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
+    vector<uint> targ_mask(matlen);
+    //create target mask
+    for (size_t m = 0; m < matlen;m++){
+        for (size_t j = 0; j < targe_num; j++){
+            if ((m>>j)&1){
+                auto mask_pos = targs[j];
+                targ_mask[m] |= 1ll<<mask_pos;
+            }
+        }
+    }
+
+    vector<pos_t> posv_sorted = op.positions();
+    uint psize = pos.size();
+    std::sort(posv_sorted.begin(),posv_sorted.end());
+    //Copy pos to device
+    checkCudaErrors(cudaMemcpyToSymbol(posv_d,  pos.data(), psize*sizeof(uint)));
+    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
+
+    //copy mat to const memory
+    auto mat_temp = op.mat();
+    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
+
+    checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, matlen*matlen*sizeof(cuDoubleComplex)));
+    checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(), matlen*sizeof(uint)));
+    size_t rsize = size>>psize;
+    uint thread_num = matlen > 32 ? 32 : matlen;
+    dim3 blockdim = dim3(thread_num, thread_num);
+    apply_5_targe_gate_kernel_const<<<rsize, blockdim, thread_num*sizeof(cuDoubleComplex)>>>(psi_d, op.control_num(), psize, size);
+}
+
+
+//For target number 6-10
+__global__ void apply_multi_targe_gate_kernel_shared(cuDoubleComplex *psi_d, uint ctrlnum, cuDoubleComplex *mat_d, uint *mat_mask_d, int psize, size_t size){
+    
+    uint rsize = size>>psize;
+    uint targnum = psize-ctrlnum;
+    uint matlen = (1<<targnum);
+    uint block_length = blockDim.x;
+    size_t b = blockIdx.x; // < rsize
+    int idx = threadIdx.x;// 
+    int idy = threadIdx.y;// 
+    size_t i = b;
+    // Insert zeros
+    for(size_t k=0;k < psize;k++){
+        size_t _pos = posv_sorted_d[k];
+        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
+    }
+    // Set control
+    for (size_t k=0; k < ctrlnum;k++){
+        i |= 1ll<<posv_d[k];
+    }
+
+    __syncthreads();
+    __shared__ cuDoubleComplex local_sum[1024];
+    cuDoubleComplex v;
+    for (int y = idy; y < matlen;y+=blockDim.y){
+        local_sum[y] = {0, 0};
+        for (int x = idx; x < matlen;x+=blockDim.x){
+            v = cuCmul(psi_d[ i | mat_mask_d[x]], mat_d[y*matlen+x]);    
+            __syncthreads(); 
+            for (int offset = block_length>>1;offset > 0;offset >>=1){
+                v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
+                v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
+            }
+            __syncthreads();
+            if (!idx) local_sum[y] = cuCadd(local_sum[y], v);
+        }
+    }
+
+    for (int y = idy; y < matlen;y+=blockDim.y){
+    if (!idx) psi_d[ i | mat_mask_d[y]] = local_sum[y];
+    }
+}
+
+
+void apply_multi_targe_gate_gpu_shared(cuDoubleComplex *psi_d, QuantumOperator &op, cuDoubleComplex *mat_d, uint *mat_mask_d, size_t size){
+    uint targe_num = op.targe_num();
+    uint matlen = 1<<targe_num;
+    auto pos = op.positions();
+    uint psize = pos.size();
+    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
+    vector<uint> targ_mask(matlen);
+    //create target mask
+    for (size_t m = 0; m < matlen;m++){
+        for (size_t j = 0; j < targe_num; j++){
+            if ((m>>j)&1){
+                auto mask_pos = targs[j];
+                targ_mask[m] |= 1ll<<mask_pos;
+            }
+        }
+    }
+
+    vector<pos_t> posv_sorted = pos;
+    std::sort(posv_sorted.begin(),posv_sorted.end());
+    //Copy pos to device
+    checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint)));
+    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
+
+     
+    //copy mat to global memory
+    auto mat_temp = op.mat();
+    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
+    checkCudaErrors(cudaMemcpy(mat_d, mat, matlen*matlen*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(), matlen*sizeof(uint), cudaMemcpyHostToDevice));
+
+    size_t rsize = size>>psize;
+    uint thread_num = matlen > 32 ? 32 : matlen;
+    dim3 blockdim = dim3(thread_num, thread_num);
+
+    apply_multi_targe_gate_kernel_shared<<<rsize, blockdim>>>(psi_d, op.control_num(), mat_d, mat_mask_d, psize, size);
+}
+
+//For target number > 10
+__global__ void apply_multi_targe_gate_kernel_global(cuDoubleComplex *psi_d, cuDoubleComplex *psi_d_copy, uint ctrlnum, cuDoubleComplex *mat_d, uint *mat_mask_d, int psize, size_t size){
+    uint rsize = size>>psize;
+    uint targnum = psize-ctrlnum;
+    uint matlen = (1<<targnum);
+    uint block_length = blockDim.x;
+    size_t b = blockIdx.x; // < rsize
+    int idx = threadIdx.x;// 
+    int idy = threadIdx.y;// 
+    size_t i = b;
+    // Insert zeros
+    for(size_t k=0;k < psize;k++){
+        size_t _pos = posv_sorted_d[k];
+        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
+    }
+    // Set control
+    for (size_t k=0; k < ctrlnum;k++){
+        i |= 1ll<<posv_d[k];
+    }
+
+    __syncthreads();
+
+    cuDoubleComplex v;
+    cuDoubleComplex v_sum;
+    for (int y = idy; y < matlen;y+=blockDim.y){
+        v_sum = {0, 0};
+        for (int x = idx; x < matlen;x+=blockDim.x){
+            v = cuCmul(psi_d_copy[ i | mat_mask_d[x]], mat_d[y*matlen+x]);    
+            __syncthreads(); 
+            for (int offset = block_length>>1;offset > 0;offset >>=1){
+                v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
+                v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
+            }
+            __syncthreads();
+            if (!idx) v_sum = cuCadd(v_sum, v);
+        }
+        if (!idx) psi_d[ i | mat_mask_d[y]] = v_sum;
+    }
+}
+
+void apply_multi_targe_gate_gpu_global(cuDoubleComplex *psi_d, cuDoubleComplex *psi_d_copy, QuantumOperator &op, cuDoubleComplex *mat_d, uint *mat_mask_d, size_t size){
+    uint targe_num = op.targe_num();
+    uint matlen = 1<<targe_num;
+    auto pos = op.positions();
+    uint psize = pos.size();
+    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
+    vector<uint> targ_mask(matlen);
+    //create target mask
+    for (size_t m = 0; m < matlen;m++){
+        for (size_t j = 0; j < targe_num; j++){
+            if ((m>>j)&1){
+                auto mask_pos = targs[j];
+                targ_mask[m] |= 1ll<<mask_pos;
+            }
+        }
+    }
+
+    vector<pos_t> posv_sorted = pos;
+    std::sort(posv_sorted.begin(),posv_sorted.end());
+    //Copy pos to device
+    checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint)));
+    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
+
+     
+    //copy mat to global memory
+    auto mat_temp = op.mat();
+    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
+    checkCudaErrors(cudaMemcpy(mat_d, mat, matlen*matlen*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
+    checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(), matlen*sizeof(uint), cudaMemcpyHostToDevice));
+
+    size_t rsize = size>>psize;
+    uint thread_num = matlen > 32 ? 32 : matlen;
+    dim3 blockdim = dim3(thread_num, thread_num);
+
+    apply_multi_targe_gate_kernel_global<<<rsize, blockdim>>>(psi_d, psi_d_copy, op.control_num(), mat_d, mat_mask_d, psize, size);
+    checkCudaErrors(cudaMemcpy(psi_d_copy, psi_d, size*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice));
+}
\ No newline at end of file
diff --git a/src/qfvm_gpu/cuda_simulator.cuh b/src/qfvm_gpu/cuda_simulator.cuh
new file mode 100644
index 0000000..99ee5e4
--- /dev/null
+++ b/src/qfvm_gpu/cuda_simulator.cuh
@@ -0,0 +1,91 @@
+#pragma once
+#include "cuda_statevector.cuh"
+#include <circuit.hpp>
+#include <types.hpp>
+#include <statevector.hpp>
+#include <ticktock.h>
+#include "apply_gate_gpu.cuh"
+
+void simulate_gpu(Circuit & circuit, CudaStateVector & psi_d){
+    size_t size = psi_d.size();
+    //initialize mat
+    cuDoubleComplex *mat_d;
+    uint *mat_mask_d;
+    CudaStateVector psi_d_copy{};
+    if (circuit.max_targe_num() > 5)
+    {
+        uint max_matlen = 1<<circuit.max_targe_num();
+        checkCudaErrors(cudaMalloc(&mat_d, (max_matlen*max_matlen)*sizeof(cuDoubleComplex)));
+        checkCudaErrors(cudaMalloc(&mat_mask_d, max_matlen*sizeof(cuDoubleComplex)));
+        if (circuit.max_targe_num() > 10){
+            psi_d_copy = psi_d;
+        }
+    }
+
+    //apply_gate
+    for (auto gate : circuit.gates()){
+        uint targnum = gate.targe_num();
+        uint ctrlnum = gate.control_num();
+
+        if (targnum  == 1){
+            if (ctrlnum == 0){ 
+                apply_one_targe_gate_gpu<0>(psi_d.data(), gate, size);
+                }
+            else if (ctrlnum == 1){
+                apply_one_targe_gate_gpu<1>(psi_d.data(), gate, size);
+                }
+            else{
+                 apply_one_targe_gate_gpu<2>(psi_d.data(), gate, size);
+                }
+        }
+        else if(targnum > 1){
+            if (targnum == 2){
+                apply_2to4_targe_gate_gpu_const<2>(psi_d.data(), gate, size);
+            }
+            else if(targnum == 3){
+                apply_2to4_targe_gate_gpu_const<3>(psi_d.data(), gate, size);
+            }
+            else if(targnum == 4){
+                apply_2to4_targe_gate_gpu_const<4>(psi_d.data(), gate, size);
+            }
+            else if (targnum == 5){
+                apply_5_targe_gate_gpu_const(psi_d.data(), gate, size);
+            }else if (targnum > 5 && targnum <= 10){
+                apply_multi_targe_gate_gpu_shared(psi_d.data(), gate, mat_d, mat_mask_d, size);
+            }
+            else{
+                apply_multi_targe_gate_gpu_global(psi_d.data(), psi_d_copy.data(), gate, mat_d, mat_mask_d, size);
+            }
+        }   
+        else{
+            throw "Invalid target number";
+        }
+    }
+
+    //free source
+    if (circuit.max_targe_num() > 5){
+        checkCudaErrors(cudaFree(mat_d));
+        checkCudaErrors(cudaFree(mat_mask_d));
+    }
+}
+
+void simulate_gpu(Circuit & circuit, StateVector<data_t> & state){
+    //initialize psi
+    state.set_num(circuit.qubit_num());
+    size_t size = state.size();
+    CudaStateVector psi_d(state);
+    
+    simulate_gpu(circuit, psi_d);
+    cudaDeviceSynchronize();
+
+    //copy back
+    complex<double>* psi = reinterpret_cast<complex<double>*>(psi_d.data());
+    checkCudaErrors(cudaMemcpy(state.data(), psi, size*sizeof(complex<double>), cudaMemcpyDeviceToHost));
+    psi=nullptr;
+}   
+
+StateVector<double> simulate_gpu(Circuit & circuit){
+    StateVector<double> state(circuit.qubit_num());
+    simulate_gpu(circuit, state);
+    return std::move(state);
+}
\ No newline at end of file
diff --git a/src/qfvm_gpu/cuda_statevector.cuh b/src/qfvm_gpu/cuda_statevector.cuh
new file mode 100644
index 0000000..19e8cb4
--- /dev/null
+++ b/src/qfvm_gpu/cuda_statevector.cuh
@@ -0,0 +1,58 @@
+
+#pragma once
+#include <cuda_runtime.h>
+#include <helper_cuda.h>
+#include <cuComplex.h>
+#include <statevector.hpp>
+
+class CudaStateVector
+{
+    protected:
+        uint num_;
+        size_t size_;
+        cuDoubleComplex *data_;
+
+    public:
+        //construct function
+        CudaStateVector(){checkCudaErrors(cudaMalloc(&data_, 0));}
+        CudaStateVector(CudaStateVector const &other);
+       
+        explicit CudaStateVector(StateVector<double> &sv);
+        ~CudaStateVector() {
+            checkCudaErrors(cudaFree(data_));
+        }
+
+        CudaStateVector &operator=(CudaStateVector const &other)
+        {   
+            num_ = other.num();
+            size_  = other.size();
+            checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex)));
+            checkCudaErrors(cudaMemcpy(data_, other.data(), size_*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice));
+            return *this;
+        }
+
+        cuDoubleComplex* data() const { return data_;}
+        size_t size() const { return size_; }
+        uint num() const { return num_; }
+};
+
+
+CudaStateVector::CudaStateVector(StateVector<double> &sv)
+:
+num_(sv.num()),
+size_(sv.size())
+{   
+    cuDoubleComplex *psi_h = reinterpret_cast<cuDoubleComplex*>(sv.data());
+    checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex)));
+    checkCudaErrors(cudaMemcpy(data_, psi_h, size_*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
+}
+
+CudaStateVector::CudaStateVector(CudaStateVector const &other)
+:
+num_(other.num()), 
+size_(other.size())
+{
+    checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex)));
+    checkCudaErrors(cudaMemcpy(data_, other.data(), size_*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice));
+}
+
diff --git a/src/qfvm_gpu/cuda_utils/CudaTexture.h b/src/qfvm_gpu/cuda_utils/CudaTexture.h
new file mode 100644
index 0000000..0661d0d
--- /dev/null
+++ b/src/qfvm_gpu/cuda_utils/CudaTexture.h
@@ -0,0 +1,39 @@
+#pragma once
+
+
+#include "helper_cuda.h"
+#include <cuda_runtime.h>
+
+
+struct CudaTexture {
+    cudaTextureObject_t tex;
+
+    CudaTexture(CudaTexture const &) = delete;
+    CudaTexture(CudaTexture &&) = default;
+    CudaTexture &operator=(CudaTexture const &) = delete;
+    CudaTexture &operator=(CudaTexture &&) = default;
+
+    template <class T>
+    CudaTexture(T *dataDev, int width, int height) {
+        cudaTextureObject_t tex;
+        cudaResourceDesc resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType = cudaResourceTypePitch2D;
+        resDesc.res.pitch2D.devPtr = dataDev;
+        resDesc.res.pitch2D.width = width;
+        resDesc.res.pitch2D.height = height;
+        resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+        resDesc.res.pitch2D.pitchInBytes = width * sizeof(T);
+        cudaTextureDesc texDesc;
+        memset(&texDesc, 0, sizeof(texDesc));
+        checkCudaErrors(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
+    }
+
+    ~CudaTexture() {
+        checkCudaErrors(cudaDestroyTextureObject(tex));
+    }
+
+    constexpr operator cudaTextureObject_t() const {
+        return tex;
+    }
+};
diff --git a/src/qfvm_gpu/cuda_utils/helper_cuda.h b/src/qfvm_gpu/cuda_utils/helper_cuda.h
new file mode 100644
index 0000000..516dd57
--- /dev/null
+++ b/src/qfvm_gpu/cuda_utils/helper_cuda.h
@@ -0,0 +1,953 @@
+/**
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "helper_string.h"
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note, it is required that your SDK sample to include the proper header
+// files, please refer the CUDA examples for examples of the needed CUDA
+// headers, which may change depending on which CUDA functions are used.
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error) {
+  return cudaGetErrorName(error);
+}
+#endif
+
+#ifdef CUDA_DRIVER_API
+// CUDA Driver API errors
+static const char *_cudaGetErrorEnum(CUresult error) {
+  static char unknown[] = "<unknown>";
+  const char *ret = NULL;
+  cuGetErrorName(error, &ret);
+  return ret ? ret : unknown;
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors
+static const char *_cudaGetErrorEnum(cublasStatus_t error) {
+  switch (error) {
+    case CUBLAS_STATUS_SUCCESS:
+      return "CUBLAS_STATUS_SUCCESS";
+
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "CUBLAS_STATUS_ALLOC_FAILED";
+
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "CUBLAS_STATUS_INVALID_VALUE";
+
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "CUBLAS_STATUS_MAPPING_ERROR";
+
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+      return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+    case CUBLAS_STATUS_LICENSE_ERROR:
+      return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors
+static const char *_cudaGetErrorEnum(cufftResult error) {
+  switch (error) {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
+
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
+
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
+
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
+
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
+
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
+
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
+
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
+
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
+
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+
+    case CUFFT_INCOMPLETE_PARAMETER_LIST:
+      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+    case CUFFT_INVALID_DEVICE:
+      return "CUFFT_INVALID_DEVICE";
+
+    case CUFFT_PARSE_ERROR:
+      return "CUFFT_PARSE_ERROR";
+
+    case CUFFT_NO_WORKSPACE:
+      return "CUFFT_NO_WORKSPACE";
+
+    case CUFFT_NOT_IMPLEMENTED:
+      return "CUFFT_NOT_IMPLEMENTED";
+
+    case CUFFT_LICENSE_ERROR:
+      return "CUFFT_LICENSE_ERROR";
+
+    case CUFFT_NOT_SUPPORTED:
+      return "CUFFT_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors
+static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
+  switch (error) {
+    case CUSPARSE_STATUS_SUCCESS:
+      return "CUSPARSE_STATUS_SUCCESS";
+
+    case CUSPARSE_STATUS_NOT_INITIALIZED:
+      return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+    case CUSPARSE_STATUS_ALLOC_FAILED:
+      return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+    case CUSPARSE_STATUS_INVALID_VALUE:
+      return "CUSPARSE_STATUS_INVALID_VALUE";
+
+    case CUSPARSE_STATUS_ARCH_MISMATCH:
+      return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+    case CUSPARSE_STATUS_MAPPING_ERROR:
+      return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+    case CUSPARSE_STATUS_EXECUTION_FAILED:
+      return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+    case CUSPARSE_STATUS_INTERNAL_ERROR:
+      return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors
+static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
+  switch (error) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return "CUSOLVER_STATUS_SUCCESS";
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return "CUSOLVER_STATUS_ALLOC_FAILED";
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return "CUSOLVER_STATUS_INVALID_VALUE";
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED ";
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(curandStatus_t error) {
+  switch (error) {
+    case CURAND_STATUS_SUCCESS:
+      return "CURAND_STATUS_SUCCESS";
+
+    case CURAND_STATUS_VERSION_MISMATCH:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+
+    case CURAND_STATUS_NOT_INITIALIZED:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+
+    case CURAND_STATUS_TYPE_ERROR:
+      return "CURAND_STATUS_TYPE_ERROR";
+
+    case CURAND_STATUS_OUT_OF_RANGE:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+    case CURAND_STATUS_ARCH_MISMATCH:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+
+    case CURAND_STATUS_INTERNAL_ERROR:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NVJPEGAPI
+// nvJPEG API errors
+static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
+  switch (error) {
+    case NVJPEG_STATUS_SUCCESS:
+      return "NVJPEG_STATUS_SUCCESS";
+
+    case NVJPEG_STATUS_NOT_INITIALIZED:
+      return "NVJPEG_STATUS_NOT_INITIALIZED";
+
+    case NVJPEG_STATUS_INVALID_PARAMETER:
+      return "NVJPEG_STATUS_INVALID_PARAMETER";
+
+    case NVJPEG_STATUS_BAD_JPEG:
+      return "NVJPEG_STATUS_BAD_JPEG";
+
+    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+      return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+
+    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+      return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+
+    case NVJPEG_STATUS_EXECUTION_FAILED:
+      return "NVJPEG_STATUS_EXECUTION_FAILED";
+
+    case NVJPEG_STATUS_ARCH_MISMATCH:
+      return "NVJPEG_STATUS_ARCH_MISMATCH";
+
+    case NVJPEG_STATUS_INTERNAL_ERROR:
+      return "NVJPEG_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char *_cudaGetErrorEnum(NppStatus error) {
+  switch (error) {
+    case NPP_NOT_SUPPORTED_MODE_ERROR:
+      return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_RESIZE_NO_OPERATION_ERROR:
+      return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_BAD_ARG_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFF_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECT_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUAD_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEM_ALLOC_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_INPUT:
+      return "NPP_INVALID_INPUT";
+
+    case NPP_POINTER_ERROR:
+      return "NPP_POINTER_ERROR";
+
+    case NPP_WARNING:
+      return "NPP_WARNING";
+
+    case NPP_ODD_ROI_WARNING:
+      return "NPP_ODD_ROI_WARNING";
+#else
+
+    // These are for CUDA 5.5 or higher
+    case NPP_BAD_ARGUMENT_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFFICIENT_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECTANGLE_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUADRANGLE_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEMORY_ALLOCATION_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_HOST_POINTER_ERROR:
+      return "NPP_INVALID_HOST_POINTER_ERROR";
+
+    case NPP_INVALID_DEVICE_POINTER_ERROR:
+      return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_TEXTURE_BIND_ERROR:
+      return "NPP_TEXTURE_BIND_ERROR";
+
+    case NPP_WRONG_INTERSECTION_ROI_ERROR:
+      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+    case NPP_NOT_EVEN_STEP_ERROR:
+      return "NPP_NOT_EVEN_STEP_ERROR";
+
+    case NPP_INTERPOLATION_ERROR:
+      return "NPP_INTERPOLATION_ERROR";
+
+    case NPP_RESIZE_FACTOR_ERROR:
+      return "NPP_RESIZE_FACTOR_ERROR";
+
+    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_MEMFREE_ERR:
+      return "NPP_MEMFREE_ERR";
+
+    case NPP_MEMSET_ERR:
+      return "NPP_MEMSET_ERR";
+
+    case NPP_MEMCPY_ERR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERR:
+      return "NPP_MIRROR_FLIP_ERR";
+#else
+
+    case NPP_MEMFREE_ERROR:
+      return "NPP_MEMFREE_ERROR";
+
+    case NPP_MEMSET_ERROR:
+      return "NPP_MEMSET_ERROR";
+
+    case NPP_MEMCPY_ERROR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERROR:
+      return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+    case NPP_ALIGNMENT_ERROR:
+      return "NPP_ALIGNMENT_ERROR";
+
+    case NPP_STEP_ERROR:
+      return "NPP_STEP_ERROR";
+
+    case NPP_SIZE_ERROR:
+      return "NPP_SIZE_ERROR";
+
+    case NPP_NULL_POINTER_ERROR:
+      return "NPP_NULL_POINTER_ERROR";
+
+    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+    case NPP_NOT_IMPLEMENTED_ERROR:
+      return "NPP_NOT_IMPLEMENTED_ERROR";
+
+    case NPP_ERROR:
+      return "NPP_ERROR";
+
+    case NPP_SUCCESS:
+      return "NPP_SUCCESS";
+
+    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+    case NPP_MISALIGNED_DST_ROI_WARNING:
+      return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+    case NPP_DOUBLE_SIZE_WARNING:
+      return "NPP_DOUBLE_SIZE_WARNING";
+
+    case NPP_WRONG_INTERSECTION_ROI_WARNING:
+      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+    /* These are 6.0 or higher */
+    case NPP_LUT_PALETTE_BITSIZE_ERROR:
+      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_QUALITY_INDEX_ERROR:
+      return "NPP_QUALITY_INDEX_ERROR";
+
+    case NPP_CHANNEL_ORDER_ERROR:
+      return "NPP_CHANNEL_ORDER_ERROR";
+
+    case NPP_ZERO_MASK_VALUE_ERROR:
+      return "NPP_ZERO_MASK_VALUE_ERROR";
+
+    case NPP_NUMBER_OF_CHANNELS_ERROR:
+      return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+    case NPP_COI_ERROR:
+      return "NPP_COI_ERROR";
+
+    case NPP_DIVISOR_ERROR:
+      return "NPP_DIVISOR_ERROR";
+
+    case NPP_CHANNEL_ERROR:
+      return "NPP_CHANNEL_ERROR";
+
+    case NPP_STRIDE_ERROR:
+      return "NPP_STRIDE_ERROR";
+
+    case NPP_ANCHOR_ERROR:
+      return "NPP_ANCHOR_ERROR";
+
+    case NPP_MASK_SIZE_ERROR:
+      return "NPP_MASK_SIZE_ERROR";
+
+    case NPP_MOMENT_00_ZERO_ERROR:
+      return "NPP_MOMENT_00_ZERO_ERROR";
+
+    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+    case NPP_THRESHOLD_ERROR:
+      return "NPP_THRESHOLD_ERROR";
+
+    case NPP_CONTEXT_MATCH_ERROR:
+      return "NPP_CONTEXT_MATCH_ERROR";
+
+    case NPP_FFT_FLAG_ERROR:
+      return "NPP_FFT_FLAG_ERROR";
+
+    case NPP_FFT_ORDER_ERROR:
+      return "NPP_FFT_ORDER_ERROR";
+
+    case NPP_SCALE_RANGE_ERROR:
+      return "NPP_SCALE_RANGE_ERROR";
+
+    case NPP_DATA_TYPE_ERROR:
+      return "NPP_DATA_TYPE_ERROR";
+
+    case NPP_OUT_OFF_RANGE_ERROR:
+      return "NPP_OUT_OFF_RANGE_ERROR";
+
+    case NPP_DIVIDE_BY_ZERO_ERROR:
+      return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+    case NPP_RANGE_ERROR:
+      return "NPP_RANGE_ERROR";
+
+    case NPP_NO_MEMORY_ERROR:
+      return "NPP_NO_MEMORY_ERROR";
+
+    case NPP_ERROR_RESERVED:
+      return "NPP_ERROR_RESERVED";
+
+    case NPP_NO_OPERATION_WARNING:
+      return "NPP_NO_OPERATION_WARNING";
+
+    case NPP_DIVIDE_BY_ZERO_WARNING:
+      return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+    /* These are 7.0 or higher */
+    case NPP_OVERFLOW_ERROR:
+      return "NPP_OVERFLOW_ERROR";
+
+    case NPP_CORRUPTED_DATA_ERROR:
+      return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+  }
+
+  return "<unknown>";
+}
+#endif
+
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+  if (result) {
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// This will only print the proper error string when calling cudaGetLastError
+// but not exit program incase error detected.
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+  }
+}
+#endif
+
+#ifndef MAX
+#define MAX(a, b) (a > b ? a : b)
+#endif
+
+// Float To Int conversion
+inline int ftoi(float value) {
+  return (value >= 0 ? static_cast<int>(value + 0.5)
+                     : static_cast<int>(value - 0.5));
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM
+  typedef struct {
+    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
+    // and m = SM minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60,  64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70,  64},
+      {0x72,  64},
+      {0x75,  64},
+      {0x80,  64},
+      {0x86, 128},
+      {0x87, 128},
+      {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1) {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      "  Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+inline const char* _ConvertSMVer2ArchName(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the GPU Arch name)
+  typedef struct {
+    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
+    // and m = SM minor version
+    const char* name;
+  } sSMtoArchName;
+
+  sSMtoArchName nGpuArchNameSM[] = {
+      {0x30, "Kepler"},
+      {0x32, "Kepler"},
+      {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},
+      {0x52, "Maxwell"},
+      {0x53, "Maxwell"},
+      {0x60, "Pascal"},
+      {0x61, "Pascal"},
+      {0x62, "Pascal"},
+      {0x70, "Volta"},
+      {0x72, "Xavier"},
+      {0x75, "Turing"},
+      {0x80, "Ampere"},
+      {0x86, "Ampere"},
+      {0x87, "Ampere"},
+      {-1, "Graphics Device"}};
+
+  int index = 0;
+
+  while (nGpuArchNameSM[index].SM != -1) {
+    if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchNameSM[index].name;
+    }
+
+    index++;
+  }
+
+  // If we don't find the values, we default use the previous one
+  // to run properly
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+      "  Default to use %s\n",
+      major, minor, nGpuArchNameSM[index - 1].name);
+  return nGpuArchNameSM[index - 1].name;
+}
+  // end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization
+inline int gpuDeviceInit(int devID) {
+  int device_count;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuDeviceInit() CUDA error: "
+            "no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (devID < 0) {
+    devID = 0;
+  }
+
+  if (devID > device_count - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            device_count);
+    fprintf(stderr,
+            ">> gpuDeviceInit (-device=%d) is not a valid"
+            " GPU device. <<\n",
+            devID);
+    fprintf(stderr, "\n");
+    return -devID;
+  }
+
+  int computeMode = -1, major = 0, minor = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+  if (computeMode == cudaComputeModeProhibited) {
+    fprintf(stderr,
+            "Error: device is running in <Compute Mode "
+            "Prohibited>, no threads can use cudaSetDevice().\n");
+    return -1;
+  }
+
+  if (major < 1) {
+    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  checkCudaErrors(cudaSetDevice(devID));
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor));
+
+  return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS)
+inline int gpuGetMaxGflopsDeviceId() {
+  int current_device = 0, sm_per_multiproc = 0;
+  int max_perf_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  uint64_t max_compute_perf = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the best CUDA capable GPU device
+  current_device = 0;
+
+  while (current_device < device_count) {
+    int computeMode = -1, major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+
+    // If this GPU is not running on Compute Mode prohibited,
+    // then we can add it to the list
+    if (computeMode != cudaComputeModeProhibited) {
+      if (major == 9999 && minor == 9999) {
+        sm_per_multiproc = 1;
+      } else {
+        sm_per_multiproc =
+            _ConvertSMVer2Cores(major,  minor);
+      }
+      int multiProcessorCount = 0, clockRate = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device));
+      cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device);
+      if (result != cudaSuccess) {
+        // If cudaDevAttrClockRate attribute is not supported we
+        // set clockRate as 1, to consider GPU with most SMs and CUDA Cores.
+        if(result == cudaErrorInvalidValue) {
+          clockRate = 1;
+        }
+        else {
+          fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result));
+          exit(EXIT_FAILURE);
+        }
+      }
+      uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
+
+      if (compute_perf > max_compute_perf) {
+        max_compute_perf = compute_perf;
+        max_perf_device = current_device;
+      }
+    } else {
+      devices_prohibited++;
+    }
+
+    ++current_device;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return max_perf_device;
+}
+
+// Initialization code to find the best CUDA Device
+inline int findCudaDevice(int argc, const char **argv) {
+  int devID = 0;
+
+  // If the command-line has a device number specified, use it
+  if (checkCmdLineFlag(argc, argv, "device")) {
+    devID = getCmdLineArgumentInt(argc, argv, "device=");
+
+    if (devID < 0) {
+      printf("Invalid command line parameter\n ");
+      exit(EXIT_FAILURE);
+    } else {
+      devID = gpuDeviceInit(devID);
+
+      if (devID < 0) {
+        printf("exiting...\n");
+        exit(EXIT_FAILURE);
+      }
+    }
+  } else {
+    // Otherwise pick the device with highest Gflops/s
+    devID = gpuGetMaxGflopsDeviceId();
+    checkCudaErrors(cudaSetDevice(devID));
+    int major = 0, minor = 0; 
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           devID, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+  }
+
+  return devID;
+}
+
+inline int findIntegratedGPU() {
+  int current_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the integrated GPU which is compute capable
+  while (current_device < device_count) {
+    int computeMode = -1, integrated = -1;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device));
+    // If GPU is integrated and is not running on Compute Mode prohibited,
+    // then cuda can map to GLES resource
+    if (integrated && (computeMode != cudaComputeModeProhibited)) {
+      checkCudaErrors(cudaSetDevice(current_device));
+
+      int major = 0, minor = 0; 
+      checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
+      checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+             current_device, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+      return current_device;
+    } else {
+      devices_prohibited++;
+    }
+
+    current_device++;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "CUDA error:"
+            " No GLES-CUDA Interop capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilities(int major_version, int minor_version) {
+  int dev;
+  int major = 0, minor = 0;
+
+  checkCudaErrors(cudaGetDevice(&dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+
+  if ((major > major_version) ||
+      (major == major_version &&
+       minor >= minor_version)) {
+    printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
+           _ConvertSMVer2ArchName(major, minor), major, minor);
+    return true;
+  } else {
+    printf(
+        "  No GPU device was found that can support "
+        "CUDA compute capability %d.%d.\n",
+        major_version, minor_version);
+    return false;
+  }
+}
+#endif
+
+  // end of CUDA Helper Functions
+
+#endif  // COMMON_HELPER_CUDA_H_
diff --git a/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp b/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp
new file mode 100644
index 0000000..59fdbc3
--- /dev/null
+++ b/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#define HANDLE_ERROR(x)                                                        \
+{   const auto err = x;                                                        \
+    if (err != CUSTATEVEC_STATUS_SUCCESS ) {                                   \
+        printf("Error: %s in line %d\n",                                       \
+               custatevecGetErrorString(err), __LINE__); return err; }         \
+};
+
+#define HANDLE_CUDA_ERROR(x)                                                   \
+{   const auto err = x;                                                        \
+    if (err != cudaSuccess ) {                                                 \
+        printf("Error: %s in line %d\n",                                       \
+               cudaGetErrorString(err), __LINE__); return err; }               \
+};
+
+bool almost_equal(cuDoubleComplex x, cuDoubleComplex y) {
+    const double eps = 1.0e-5;
+    const cuDoubleComplex diff = cuCsub(x, y);
+    return (cuCabs(diff) < eps);
+}
+
+bool almost_equal(double x, double y) {
+    const double eps = 1.0e-5;
+    const double diff = x - y;
+    return (abs(diff) < eps);
+}
diff --git a/src/qfvm_gpu/cuda_utils/helper_string.h b/src/qfvm_gpu/cuda_utils/helper_string.h
new file mode 100644
index 0000000..77864b8
--- /dev/null
+++ b/src/qfvm_gpu/cuda_utils/helper_string.h
@@ -0,0 +1,683 @@
+/**
+ * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef COMMON_HELPER_STRING_H_
+#define COMMON_HELPER_STRING_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else  // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+inline int stringRemoveDelimiter(char delimiter, const char *string) {
+  int string_start = 0;
+
+  while (string[string_start] == delimiter) {
+    string_start++;
+  }
+
+  if (string_start >= static_cast<int>(strlen(string) - 1)) {
+    return 0;
+  }
+
+  return string_start;
+}
+
+inline int getFileExtension(char *filename, char **extension) {
+  int string_length = static_cast<int>(strlen(filename));
+
+  while (filename[string_length--] != '.') {
+    if (string_length == 0) break;
+  }
+
+  if (string_length > 0) string_length += 2;
+
+  if (string_length == 0)
+    *extension = NULL;
+  else
+    *extension = &filename[string_length];
+
+  return string_length;
+}
+
+inline bool checkCmdLineFlag(const int argc, const char **argv,
+                             const char *string_ref) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+
+      const char *equal_pos = strchr(string_argv, '=');
+      int argv_length = static_cast<int>(
+          equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (length == argv_length &&
+          !STRNCASECMP(string_argv, string_ref, length)) {
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  return bFound;
+}
+
+// This function wraps the CUDA Driver API into a template function
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv,
+                                    const char *string_ref, T *value) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          *value = (T)atoi(&string_argv[length + auto_inc]);
+        }
+
+        bFound = true;
+        i = argc;
+      }
+    }
+  }
+
+  return bFound;
+}
+
+inline int getCmdLineArgumentInt(const int argc, const char **argv,
+                                 const char *string_ref) {
+  bool bFound = false;
+  int value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = atoi(&string_argv[length + auto_inc]);
+        } else {
+          value = 0;
+        }
+
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (bFound) {
+    return value;
+  } else {
+    return 0;
+  }
+}
+
+inline float getCmdLineArgumentFloat(const int argc, const char **argv,
+                                     const char *string_ref) {
+  bool bFound = false;
+  float value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = static_cast<float>(atof(&string_argv[length + auto_inc]));
+        } else {
+          value = 0.f;
+        }
+
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (bFound) {
+    return value;
+  } else {
+    return 0;
+  }
+}
+
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref,
+                                     char **string_retval) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      char *string_argv = const_cast<char*>(&argv[i][string_start]);
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        *string_retval = &string_argv[length + 1];
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (!bFound) {
+    *string_retval = NULL;
+  }
+
+  return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename,
+                             const char *executable_path) {
+  // <executable_name> defines a variable that is replaced with the name of the
+  // executable
+
+  // Typical relative search paths to locate needed companion files (e.g. sample
+  // input data, or JIT source files) The origin for the relative search may be
+  // the .exe file, a .bat file launching an .exe, a browser .exe launching the
+  // .exe or .bat, etc
+  const char *searchPath[] = {
+      "./",  // same dir
+      "./<executable_name>_data_files/",
+      "./common/",                      // "/common/" subdir
+      "./common/data/",                 // "/common/data/" subdir
+      "./data/",                        // "/data/" subdir
+      "./src/",                         // "/src/" subdir
+      "./src/<executable_name>/data/",  // "/src/<executable_name>/data/" subdir
+      "./inc/",                         // "/inc/" subdir
+      "./0_Simple/",                    // "/0_Simple/" subdir
+      "./1_Utilities/",                 // "/1_Utilities/" subdir
+      "./2_Graphics/",                  // "/2_Graphics/" subdir
+      "./3_Imaging/",                   // "/3_Imaging/" subdir
+      "./4_Finance/",                   // "/4_Finance/" subdir
+      "./5_Simulations/",               // "/5_Simulations/" subdir
+      "./6_Advanced/",                  // "/6_Advanced/" subdir
+      "./7_CUDALibraries/",             // "/7_CUDALibraries/" subdir
+      "./8_Android/",                   // "/8_Android/" subdir
+      "./samples/",                     // "/samples/" subdir
+
+      "./0_Simple/<executable_name>/data/",  // "/0_Simple/<executable_name>/data/"
+                                             // subdir
+      "./1_Utilities/<executable_name>/data/",  // "/1_Utilities/<executable_name>/data/"
+                                                // subdir
+      "./2_Graphics/<executable_name>/data/",  // "/2_Graphics/<executable_name>/data/"
+                                               // subdir
+      "./3_Imaging/<executable_name>/data/",  // "/3_Imaging/<executable_name>/data/"
+                                              // subdir
+      "./4_Finance/<executable_name>/data/",  // "/4_Finance/<executable_name>/data/"
+                                              // subdir
+      "./5_Simulations/<executable_name>/data/",  // "/5_Simulations/<executable_name>/data/"
+                                                  // subdir
+      "./6_Advanced/<executable_name>/data/",  // "/6_Advanced/<executable_name>/data/"
+                                               // subdir
+      "./7_CUDALibraries/<executable_name>/",  // "/7_CUDALibraries/<executable_name>/"
+                                               // subdir
+      "./7_CUDALibraries/<executable_name>/data/",  // "/7_CUDALibraries/<executable_name>/data/"
+                                                    // subdir
+
+      "../",              // up 1 in tree
+      "../common/",       // up 1 in tree, "/common/" subdir
+      "../common/data/",  // up 1 in tree, "/common/data/" subdir
+      "../data/",         // up 1 in tree, "/data/" subdir
+      "../src/",          // up 1 in tree, "/src/" subdir
+      "../inc/",          // up 1 in tree, "/inc/" subdir
+
+      "../0_Simple/<executable_name>/data/",  // up 1 in tree,
+                                              // "/0_Simple/<executable_name>/"
+                                              // subdir
+      "../1_Utilities/<executable_name>/data/",  // up 1 in tree,
+                                                 // "/1_Utilities/<executable_name>/"
+                                                 // subdir
+      "../2_Graphics/<executable_name>/data/",  // up 1 in tree,
+                                                // "/2_Graphics/<executable_name>/"
+                                                // subdir
+      "../3_Imaging/<executable_name>/data/",  // up 1 in tree,
+                                               // "/3_Imaging/<executable_name>/"
+                                               // subdir
+      "../4_Finance/<executable_name>/data/",  // up 1 in tree,
+                                               // "/4_Finance/<executable_name>/"
+                                               // subdir
+      "../5_Simulations/<executable_name>/data/",  // up 1 in tree,
+                                                   // "/5_Simulations/<executable_name>/"
+                                                   // subdir
+      "../6_Advanced/<executable_name>/data/",  // up 1 in tree,
+                                                // "/6_Advanced/<executable_name>/"
+                                                // subdir
+      "../7_CUDALibraries/<executable_name>/data/",  // up 1 in tree,
+                                                     // "/7_CUDALibraries/<executable_name>/"
+                                                     // subdir
+      "../8_Android/<executable_name>/data/",  // up 1 in tree,
+                                               // "/8_Android/<executable_name>/"
+                                               // subdir
+      "../samples/<executable_name>/data/",  // up 1 in tree,
+                                             // "/samples/<executable_name>/"
+                                             // subdir
+      "../../",                              // up 2 in tree
+      "../../common/",                       // up 2 in tree, "/common/" subdir
+      "../../common/data/",  // up 2 in tree, "/common/data/" subdir
+      "../../data/",         // up 2 in tree, "/data/" subdir
+      "../../src/",          // up 2 in tree, "/src/" subdir
+      "../../inc/",          // up 2 in tree, "/inc/" subdir
+      "../../sandbox/<executable_name>/data/",  // up 2 in tree,
+                                                // "/sandbox/<executable_name>/"
+                                                // subdir
+      "../../0_Simple/<executable_name>/data/",  // up 2 in tree,
+                                                 // "/0_Simple/<executable_name>/"
+                                                 // subdir
+      "../../1_Utilities/<executable_name>/data/",  // up 2 in tree,
+                                                    // "/1_Utilities/<executable_name>/"
+                                                    // subdir
+      "../../2_Graphics/<executable_name>/data/",  // up 2 in tree,
+                                                   // "/2_Graphics/<executable_name>/"
+                                                   // subdir
+      "../../3_Imaging/<executable_name>/data/",  // up 2 in tree,
+                                                  // "/3_Imaging/<executable_name>/"
+                                                  // subdir
+      "../../4_Finance/<executable_name>/data/",  // up 2 in tree,
+                                                  // "/4_Finance/<executable_name>/"
+                                                  // subdir
+      "../../5_Simulations/<executable_name>/data/",  // up 2 in tree,
+                                                      // "/5_Simulations/<executable_name>/"
+                                                      // subdir
+      "../../6_Advanced/<executable_name>/data/",  // up 2 in tree,
+                                                   // "/6_Advanced/<executable_name>/"
+                                                   // subdir
+      "../../7_CUDALibraries/<executable_name>/data/",  // up 2 in tree,
+                                                        // "/7_CUDALibraries/<executable_name>/"
+                                                        // subdir
+      "../../8_Android/<executable_name>/data/",  // up 2 in tree,
+                                                  // "/8_Android/<executable_name>/"
+                                                  // subdir
+      "../../samples/<executable_name>/data/",  // up 2 in tree,
+                                                // "/samples/<executable_name>/"
+                                                // subdir
+      "../../../",                              // up 3 in tree
+      "../../../src/<executable_name>/",        // up 3 in tree,
+                                          // "/src/<executable_name>/" subdir
+      "../../../src/<executable_name>/data/",  // up 3 in tree,
+                                               // "/src/<executable_name>/data/"
+                                               // subdir
+      "../../../src/<executable_name>/src/",   // up 3 in tree,
+                                               // "/src/<executable_name>/src/"
+                                               // subdir
+      "../../../src/<executable_name>/inc/",   // up 3 in tree,
+                                               // "/src/<executable_name>/inc/"
+                                               // subdir
+      "../../../sandbox/<executable_name>/",   // up 3 in tree,
+                                               // "/sandbox/<executable_name>/"
+                                               // subdir
+      "../../../sandbox/<executable_name>/data/",  // up 3 in tree,
+                                                   // "/sandbox/<executable_name>/data/"
+                                                   // subdir
+      "../../../sandbox/<executable_name>/src/",  // up 3 in tree,
+                                                  // "/sandbox/<executable_name>/src/"
+                                                  // subdir
+      "../../../sandbox/<executable_name>/inc/",  // up 3 in tree,
+                                                  // "/sandbox/<executable_name>/inc/"
+                                                  // subdir
+      "../../../0_Simple/<executable_name>/data/",  // up 3 in tree,
+                                                    // "/0_Simple/<executable_name>/"
+                                                    // subdir
+      "../../../1_Utilities/<executable_name>/data/",  // up 3 in tree,
+                                                       // "/1_Utilities/<executable_name>/"
+                                                       // subdir
+      "../../../2_Graphics/<executable_name>/data/",  // up 3 in tree,
+                                                      // "/2_Graphics/<executable_name>/"
+                                                      // subdir
+      "../../../3_Imaging/<executable_name>/data/",  // up 3 in tree,
+                                                     // "/3_Imaging/<executable_name>/"
+                                                     // subdir
+      "../../../4_Finance/<executable_name>/data/",  // up 3 in tree,
+                                                     // "/4_Finance/<executable_name>/"
+                                                     // subdir
+      "../../../5_Simulations/<executable_name>/data/",  // up 3 in tree,
+                                                         // "/5_Simulations/<executable_name>/"
+                                                         // subdir
+      "../../../6_Advanced/<executable_name>/data/",  // up 3 in tree,
+                                                      // "/6_Advanced/<executable_name>/"
+                                                      // subdir
+      "../../../7_CUDALibraries/<executable_name>/data/",  // up 3 in tree,
+                                                           // "/7_CUDALibraries/<executable_name>/"
+                                                           // subdir
+      "../../../8_Android/<executable_name>/data/",  // up 3 in tree,
+                                                     // "/8_Android/<executable_name>/"
+                                                     // subdir
+      "../../../0_Simple/<executable_name>/",  // up 3 in tree,
+                                               // "/0_Simple/<executable_name>/"
+                                               // subdir
+      "../../../1_Utilities/<executable_name>/",  // up 3 in tree,
+                                                  // "/1_Utilities/<executable_name>/"
+                                                  // subdir
+      "../../../2_Graphics/<executable_name>/",  // up 3 in tree,
+                                                 // "/2_Graphics/<executable_name>/"
+                                                 // subdir
+      "../../../3_Imaging/<executable_name>/",  // up 3 in tree,
+                                                // "/3_Imaging/<executable_name>/"
+                                                // subdir
+      "../../../4_Finance/<executable_name>/",  // up 3 in tree,
+                                                // "/4_Finance/<executable_name>/"
+                                                // subdir
+      "../../../5_Simulations/<executable_name>/",  // up 3 in tree,
+                                                    // "/5_Simulations/<executable_name>/"
+                                                    // subdir
+      "../../../6_Advanced/<executable_name>/",  // up 3 in tree,
+                                                 // "/6_Advanced/<executable_name>/"
+                                                 // subdir
+      "../../../7_CUDALibraries/<executable_name>/",  // up 3 in tree,
+                                                      // "/7_CUDALibraries/<executable_name>/"
+                                                      // subdir
+      "../../../8_Android/<executable_name>/",  // up 3 in tree,
+                                                // "/8_Android/<executable_name>/"
+                                                // subdir
+      "../../../samples/<executable_name>/data/",  // up 3 in tree,
+                                                   // "/samples/<executable_name>/"
+                                                   // subdir
+      "../../../common/",       // up 3 in tree, "../../../common/" subdir
+      "../../../common/data/",  // up 3 in tree, "../../../common/data/" subdir
+      "../../../data/",         // up 3 in tree, "../../../data/" subdir
+      "../../../../",           // up 4 in tree
+      "../../../../src/<executable_name>/",  // up 4 in tree,
+                                             // "/src/<executable_name>/" subdir
+      "../../../../src/<executable_name>/data/",  // up 4 in tree,
+                                                  // "/src/<executable_name>/data/"
+                                                  // subdir
+      "../../../../src/<executable_name>/src/",  // up 4 in tree,
+                                                 // "/src/<executable_name>/src/"
+                                                 // subdir
+      "../../../../src/<executable_name>/inc/",  // up 4 in tree,
+                                                 // "/src/<executable_name>/inc/"
+                                                 // subdir
+      "../../../../sandbox/<executable_name>/",  // up 4 in tree,
+                                                 // "/sandbox/<executable_name>/"
+                                                 // subdir
+      "../../../../sandbox/<executable_name>/data/",  // up 4 in tree,
+                                                      // "/sandbox/<executable_name>/data/"
+                                                      // subdir
+      "../../../../sandbox/<executable_name>/src/",  // up 4 in tree,
+                                                     // "/sandbox/<executable_name>/src/"
+                                                     // subdir
+      "../../../../sandbox/<executable_name>/inc/",  // up 4 in tree,
+                                                     // "/sandbox/<executable_name>/inc/"
+                                                     // subdir
+      "../../../../0_Simple/<executable_name>/data/",  // up 4 in tree,
+                                                       // "/0_Simple/<executable_name>/"
+                                                       // subdir
+      "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree,
+                                                          // "/1_Utilities/<executable_name>/"
+                                                          // subdir
+      "../../../../2_Graphics/<executable_name>/data/",  // up 4 in tree,
+                                                         // "/2_Graphics/<executable_name>/"
+                                                         // subdir
+      "../../../../3_Imaging/<executable_name>/data/",  // up 4 in tree,
+                                                        // "/3_Imaging/<executable_name>/"
+                                                        // subdir
+      "../../../../4_Finance/<executable_name>/data/",  // up 4 in tree,
+                                                        // "/4_Finance/<executable_name>/"
+                                                        // subdir
+      "../../../../5_Simulations/<executable_name>/data/",  // up 4 in tree,
+                                                            // "/5_Simulations/<executable_name>/"
+                                                            // subdir
+      "../../../../6_Advanced/<executable_name>/data/",  // up 4 in tree,
+                                                         // "/6_Advanced/<executable_name>/"
+                                                         // subdir
+      "../../../../7_CUDALibraries/<executable_name>/data/",  // up 4 in tree,
+                                                              // "/7_CUDALibraries/<executable_name>/"
+                                                              // subdir
+      "../../../../8_Android/<executable_name>/data/",  // up 4 in tree,
+                                                        // "/8_Android/<executable_name>/"
+                                                        // subdir
+      "../../../../0_Simple/<executable_name>/",  // up 4 in tree,
+                                                  // "/0_Simple/<executable_name>/"
+                                                  // subdir
+      "../../../../1_Utilities/<executable_name>/",  // up 4 in tree,
+                                                     // "/1_Utilities/<executable_name>/"
+                                                     // subdir
+      "../../../../2_Graphics/<executable_name>/",  // up 4 in tree,
+                                                    // "/2_Graphics/<executable_name>/"
+                                                    // subdir
+      "../../../../3_Imaging/<executable_name>/",  // up 4 in tree,
+                                                   // "/3_Imaging/<executable_name>/"
+                                                   // subdir
+      "../../../../4_Finance/<executable_name>/",  // up 4 in tree,
+                                                   // "/4_Finance/<executable_name>/"
+                                                   // subdir
+      "../../../../5_Simulations/<executable_name>/",  // up 4 in tree,
+                                                       // "/5_Simulations/<executable_name>/"
+                                                       // subdir
+      "../../../../6_Advanced/<executable_name>/",  // up 4 in tree,
+                                                    // "/6_Advanced/<executable_name>/"
+                                                    // subdir
+      "../../../../7_CUDALibraries/<executable_name>/",  // up 4 in tree,
+                                                         // "/7_CUDALibraries/<executable_name>/"
+                                                         // subdir
+      "../../../../8_Android/<executable_name>/",  // up 4 in tree,
+                                                   // "/8_Android/<executable_name>/"
+                                                   // subdir
+      "../../../../samples/<executable_name>/data/",  // up 4 in tree,
+                                                      // "/samples/<executable_name>/"
+                                                      // subdir
+      "../../../../common/",       // up 4 in tree, "../../../common/" subdir
+      "../../../../common/data/",  // up 4 in tree, "../../../common/data/"
+                                   // subdir
+      "../../../../data/",         // up 4 in tree, "../../../data/" subdir
+      "../../../../../",           // up 5 in tree
+      "../../../../../src/<executable_name>/",  // up 5 in tree,
+                                                // "/src/<executable_name>/"
+                                                // subdir
+      "../../../../../src/<executable_name>/data/",  // up 5 in tree,
+                                                     // "/src/<executable_name>/data/"
+                                                     // subdir
+      "../../../../../src/<executable_name>/src/",  // up 5 in tree,
+                                                    // "/src/<executable_name>/src/"
+                                                    // subdir
+      "../../../../../src/<executable_name>/inc/",  // up 5 in tree,
+                                                    // "/src/<executable_name>/inc/"
+                                                    // subdir
+      "../../../../../sandbox/<executable_name>/",  // up 5 in tree,
+                                                    // "/sandbox/<executable_name>/"
+                                                    // subdir
+      "../../../../../sandbox/<executable_name>/data/",  // up 5 in tree,
+                                                         // "/sandbox/<executable_name>/data/"
+                                                         // subdir
+      "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree,
+                                                        // "/sandbox/<executable_name>/src/"
+                                                        // subdir
+      "../../../../../sandbox/<executable_name>/inc/",  // up 5 in tree,
+                                                        // "/sandbox/<executable_name>/inc/"
+                                                        // subdir
+      "../../../../../0_Simple/<executable_name>/data/",  // up 5 in tree,
+                                                          // "/0_Simple/<executable_name>/"
+                                                          // subdir
+      "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree,
+                                                             // "/1_Utilities/<executable_name>/"
+                                                             // subdir
+      "../../../../../2_Graphics/<executable_name>/data/",  // up 5 in tree,
+                                                            // "/2_Graphics/<executable_name>/"
+                                                            // subdir
+      "../../../../../3_Imaging/<executable_name>/data/",  // up 5 in tree,
+                                                           // "/3_Imaging/<executable_name>/"
+                                                           // subdir
+      "../../../../../4_Finance/<executable_name>/data/",  // up 5 in tree,
+                                                           // "/4_Finance/<executable_name>/"
+                                                           // subdir
+      "../../../../../5_Simulations/<executable_name>/data/",  // up 5 in tree,
+                                                               // "/5_Simulations/<executable_name>/"
+                                                               // subdir
+      "../../../../../6_Advanced/<executable_name>/data/",  // up 5 in tree,
+                                                            // "/6_Advanced/<executable_name>/"
+                                                            // subdir
+      "../../../../../7_CUDALibraries/<executable_name>/data/",  // up 5 in
+                                                                 // tree,
+                                                                 // "/7_CUDALibraries/<executable_name>/"
+                                                                 // subdir
+      "../../../../../8_Android/<executable_name>/data/",  // up 5 in tree,
+                                                           // "/8_Android/<executable_name>/"
+                                                           // subdir
+      "../../../../../samples/<executable_name>/data/",  // up 5 in tree,
+                                                         // "/samples/<executable_name>/"
+                                                         // subdir
+      "../../../../../common/",       // up 5 in tree, "../../../common/" subdir
+      "../../../../../common/data/",  // up 5 in tree, "../../../common/data/"
+                                      // subdir
+  };
+
+  // Extract the executable name
+  std::string executable_name;
+
+  if (executable_path != 0) {
+    executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // Windows path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    if (executable_name.rfind(".exe") != std::string::npos) {
+      // we strip .exe, only if the .exe is found
+      executable_name.resize(executable_name.size() - 4);
+    }
+
+#else
+    // Linux & OSX path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('/');
+    executable_name.erase(0, delimiter_pos + 1);
+#endif
+  }
+
+  // Loop over all search paths and return the first hit
+  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+    std::string path(searchPath[i]);
+    size_t executable_name_pos = path.find("<executable_name>");
+
+    // If there is executable_name variable in the searchPath
+    // replace it with the value
+    if (executable_name_pos != std::string::npos) {
+      if (executable_path != 0) {
+        path.replace(executable_name_pos, strlen("<executable_name>"),
+                     executable_name);
+      } else {
+        // Skip this path entry if no executable argument is given
+        continue;
+      }
+    }
+
+#ifdef _DEBUG
+    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+    // Test if the file exists
+    path.append(filename);
+    FILE *fp;
+    FOPEN(fp, path.c_str(), "rb");
+
+    if (fp != NULL) {
+      fclose(fp);
+      // File found
+      // returning an allocated array here for backwards compatibility reasons
+      char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
+      STRCPY(file_path, path.length() + 1, path.c_str());
+      return file_path;
+    }
+
+    if (fp) {
+      fclose(fp);
+    }
+  }
+
+  // File not found
+  return 0;
+}
+
+#endif  // COMMON_HELPER_STRING_H_
diff --git a/src/qfvm_gpu/cuda_utils/ticktock.h b/src/qfvm_gpu/cuda_utils/ticktock.h
new file mode 100644
index 0000000..1adb8a9
--- /dev/null
+++ b/src/qfvm_gpu/cuda_utils/ticktock.h
@@ -0,0 +1,9 @@
+#pragma once
+
+//#include <chrono>
+//#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
+//#define TOCK(x) std::cout << #x ": " << std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - bench_##x).count() << "s" << std::endl;
+
+#include <tbb/tick_count.h>
+#define TICK(x) auto bench_##x = tbb::tick_count::now();
+#define TOCK(x) std::cout << #x ": " << (tbb::tick_count::now() - bench_##x).seconds() << "s" << std::endl;
diff --git a/src/qfvm_gpu/custate_simu.cuh b/src/qfvm_gpu/custate_simu.cuh
new file mode 100644
index 0000000..6456c7e
--- /dev/null
+++ b/src/qfvm_gpu/custate_simu.cuh
@@ -0,0 +1,35 @@
+#pragma once
+#include "cuda_statevector.cuh"
+#include <circuit.hpp>
+#include <types.hpp>
+#include <statevector.hpp>
+#include "apply_gate_custate.cuh"
+
+void simulate_custate(Circuit & circuit, CudaStateVector & psi_d){
+    size_t size = psi_d.size();
+    int n = psi_d.num();
+    for (auto gate : circuit.gates()){
+        apply_gate_custate(psi_d.data(), gate, n);
+    }
+}
+
+void simulate_custate(Circuit & circuit, StateVector<data_t> & state){
+    //initialize psi
+    state.set_num(circuit.qubit_num());
+    size_t size = state.size();
+    CudaStateVector psi_d(state);
+    
+    simulate_custate(circuit, psi_d);
+    cudaDeviceSynchronize();
+
+    //copy back
+    complex<double>* psi = reinterpret_cast<complex<double>*>(psi_d.data());
+    checkCudaErrors(cudaMemcpy(state.data(), psi, size*sizeof(complex<double>), cudaMemcpyDeviceToHost));
+    psi=nullptr;
+}   
+
+StateVector<double> simulate_custate(Circuit & circuit){
+    StateVector<double> state(circuit.qubit_num());
+    simulate_custate(circuit, state);
+    return std::move(state);
+}
\ No newline at end of file

From 3baa9683899965df1bb95f3e1c5cb8bd8351b220 Mon Sep 17 00:00:00 2001
From: chensgit169 <weichen191@mails.ucas.ac.cn>
Date: Thu, 30 Nov 2023 16:11:43 +0800
Subject: [PATCH 4/5] fix: use lower case of instruction name

---
 quafu/visualisation/circuitPlot.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/quafu/visualisation/circuitPlot.py b/quafu/visualisation/circuitPlot.py
index 4866ce7..dcc5eac 100644
--- a/quafu/visualisation/circuitPlot.py
+++ b/quafu/visualisation/circuitPlot.py
@@ -187,10 +187,9 @@ def __call__(self,
             plt.show()
 
     def _process_ins(self, ins: Instruction, append: bool = True):
-        name = ins.name
-        assert name in Instruction.ins_classes, 'If this should occur, please report a bug.'
+        name = ins.name.lower()
+        assert name in Instruction.ins_classes, 'Name: %s not registered, if this should occur, please report a bug.' % name
 
-        name = name.lower()
         _which = slice(np.min(ins.pos), np.max(ins.pos) + 1)
         depth = np.max(self.dorders[_which])
         paras = ins.paras

From bc7ea5f8550999d1529ff0b019bf7b8276d0ca9d Mon Sep 17 00:00:00 2001
From: chensgit169 <weichen191@mails.ucas.ac.cn>
Date: Thu, 30 Nov 2023 16:21:32 +0800
Subject: [PATCH 5/5] chore: add licence

---
 quafu/elements/oracle.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/quafu/elements/oracle.py b/quafu/elements/oracle.py
index 2ce6e47..b49ebdf 100644
--- a/quafu/elements/oracle.py
+++ b/quafu/elements/oracle.py
@@ -1,3 +1,17 @@
+#  (C) Copyright 2023 Beijing Academy of Quantum Information Sciences
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
 from abc import ABCMeta
 from quafu.elements import QuantumGate, Instruction
 from typing import Dict, Iterable, List