diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000..69898c6
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,2 @@
+DerivePointerAlignment: false
+PointerAlignment: Left
diff --git a/.github/workflows/commitlint.yml b/.github/workflows/commitlint.yml
index daf956c..47daed2 100644
--- a/.github/workflows/commitlint.yml
+++ b/.github/workflows/commitlint.yml
@@ -2,8 +2,8 @@ name: Lint Commit Messages
 on: [push, pull_request]
 
 jobs:
-  commitlint:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: wagoid/commitlint-github-action@v5
+    commitlint:
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v3
+        -   uses: wagoid/commitlint-github-action@v5
diff --git a/.github/workflows/pre-commit.yaml b/.github/workflows/pre-commit.yaml
new file mode 100644
index 0000000..2eee254
--- /dev/null
+++ b/.github/workflows/pre-commit.yaml
@@ -0,0 +1,16 @@
+name: pre-commit
+
+on:
+    pull_request:
+    push:
+        branches: [master]  # the other workflows in this repo (unittest.yml, wheels.yml) target `master`
+
+jobs:
+    pre-commit:
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v3
+        -   uses: actions/setup-python@v3
+            with:
+                python-version: '3.10'
+        -   uses: pre-commit/action@v3.0.0
diff --git a/.github/workflows/unittest.yml b/.github/workflows/unittest.yml
index 197b729..105b787 100644
--- a/.github/workflows/unittest.yml
+++ b/.github/workflows/unittest.yml
@@ -1,28 +1,28 @@
 name: Unit test
 on:
-  push:
-  pull_request:
-    branches: ['master']
+    push:
+    pull_request:
+        branches: [master]
 jobs:
-  unnittest:
-    name: Unit test
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v3
-      - uses: actions/setup-python@v4
-        name: Install Python
-        with:
-          python-version: '3.10'
+    unnittest:
+        name: Unit test
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v3
+        -   uses: actions/setup-python@v4
+            name: Install Python
+            with:
+                python-version: '3.10'
 
-      - name: Install dependency
-        run: python -m pip install -r requirements.txt && python -m pip install pytest
+        -   name: Install dependency
+            run: python -m pip install -r requirements.txt && python -m pip install pytest
 
       # TODO(zhaoyilun): Build seperate package for pyquafu-torch
-      - name: Install torch
-        run: python -m pip install torch torchvision torchaudio
+        -   name: Install torch
+            run: python -m pip install torch torchvision torchaudio
 
-      - name: Install pyquafu
-        run: python -m pip install .
+        -   name: Install pyquafu
+            run: python -m pip install .
 
-      - name: Run unit tests
-        run: pytest tests/
+        -   name: Run unit tests
+            run: pytest tests/
diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
index c1c2a30..032a0a9 100644
--- a/.github/workflows/wheels.yml
+++ b/.github/workflows/wheels.yml
@@ -1,59 +1,59 @@
 name: Build
 
 on:
-  push:
-    branches:
-      - "master"
-    tags:
-      - "v*"
-  pull_request:
-  pull_request_review:
-    types: [submitted, edited]
-  workflow_dispatch:
+    push:
+        branches:
+        -   master
+        tags:
+        -   v*
+    pull_request:
+    pull_request_review:
+        types: [submitted, edited]
+    workflow_dispatch:
 
 
 
 jobs:
-  build_wheels:
-    name: Build python wheels
-    strategy:
-      matrix:
-        os-arch: ["manylinux_x86_64", "win_amd64", "macosx_x86_64", "macosx_arm64"]
-        python-version: ["3.10"]
-        cibw-python: ["cp38", "cp39", "cp310","cp311"]
-        include:
-          - os-arch: "manylinux_x86_64"
-            os: "ubuntu-20.04"
-          - os-arch: "win_amd64"
-            os: "windows-2019"
-          - os-arch: "macosx_x86_64"
-            os: "macos-11"
-          - os-arch: "macosx_arm64"
-            os: "macos-11"
-    runs-on: ${{ matrix.os }}
-
-    env:
-      CIBW_BUILD: ${{ matrix.cibw-python }}-${{ matrix.os-arch }}
-      PYTHON: ${{ matrix.python-version }}
-      TWINE_USERNAME: "__token__"
-
-    steps:
-      - uses: actions/checkout@v3
+    build_wheels:
+        name: Build python wheels
+        strategy:
+            matrix:
+                os-arch: [manylinux_x86_64, win_amd64, macosx_x86_64, macosx_arm64]
+                python-version: ['3.10']
+                cibw-python: [cp38, cp39, cp310, cp311]
+                include:
+                -   os-arch: manylinux_x86_64
+                    os: ubuntu-20.04
+                -   os-arch: win_amd64
+                    os: windows-2019
+                -   os-arch: macosx_x86_64
+                    os: macos-13
+                -   os-arch: macosx_arm64
+                    os: macos-13
+        runs-on: ${{ matrix.os }}
 
-      # Used to host cibuildwheel
-      - uses: actions/setup-python@v3
-        with:
-          python-version: ${{ matrix.python-version }}
-          
-      - name: Install dependence
-        run: python -m pip install pybind11 cibuildwheel scikit-build twine pytest
-         
-      - name: Build wheels
-        run: python -m cibuildwheel --output-dir dist
-
-      - name: Publish package
-        run: python -m twine upload dist/*.whl
-        if: ${{ contains(github.ref, '/tags/') }}
         env:
-          TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
-          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
+            CIBW_BUILD: ${{ matrix.cibw-python }}-${{ matrix.os-arch }}
+            PYTHON: ${{ matrix.python-version }}
+            TWINE_USERNAME: __token__
+
+        steps:
+        -   uses: actions/checkout@v3
+
+        # Used to host cibuildwheel
+        -   uses: actions/setup-python@v3
+            with:
+                python-version: ${{ matrix.python-version }}
+
+        -   name: Install dependence
+            run: python -m pip install pybind11 cibuildwheel scikit-build twine pytest
+
+        -   name: Build wheels
+            run: python -m cibuildwheel --output-dir dist
+
+        -   name: Publish package
+            run: python -m twine upload dist/*.whl
+            if: ${{ contains(github.ref, '/tags/') }}
+            env:
+                TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
+                TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
diff --git a/.gitignore b/.gitignore
index 2c417ea..56c3f40 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,5 @@ src/site
 test
 thirdparty
 wheelhouse
+*.zhaoyilun
+*.png
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..eaf346c
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,53 @@
+repos:
+-   repo: https://github.com/Lucas-C/pre-commit-hooks.git
+    rev: v1.5.4
+    hooks:
+    -   id: remove-crlf
+        files: ^(?!.*(third_party|book)).*$  # every path containing neither `third_party` nor `book`
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+    -   id: check-added-large-files
+        args:
+        -   --maxkb=20480
+    -   id: check-merge-conflict
+    -   id: check-symlinks
+    -   id: detect-private-key
+        files: ^(?!.*(third_party|book)).*$  # every path containing neither `third_party` nor `book`
+    -   id: end-of-file-fixer
+    -   id: trailing-whitespace
+    -   id: requirements-txt-fixer
+    -   id: sort-simple-yaml
+-   repo: https://github.com/pylint-dev/pylint
+    rev: v3.0.0a6
+    hooks:
+    -   id: pylint
+        args:
+        -   --disable=all
+        -   --load-plugins=docstring_checker
+        -   --enable=doc-string-one-line,doc-string-end-with,doc-string-with-all-args,doc-string-triple-quotes,doc-string-missing,doc-string-indent-error,doc-string-with-returns,doc-string-with-raises
+-   repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks.git
+    rev: v2.10.0
+    hooks:
+    -   id: pretty-format-yaml
+        args: [--autofix, --indent, '4']
+-   repo: https://github.com/hadialqattan/pycln
+    rev: v2.2.2
+    hooks:
+    -   id: pycln
+-   repo: https://github.com/pre-commit/mirrors-clang-format
+    rev: v15.0.7
+    hooks:
+    -   id: clang-format
+        args: [-style=file]
+# Using this mirror lets us use mypyc-compiled black, which is about 2x faster
+-   repo: https://github.com/psf/black-pre-commit-mirror
+    rev: 23.12.0
+    hooks:
+    -   id: black
+-   repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+    -   id: isort
+        name: isort (python)
+        args: [--profile, black, --filter-files]
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ee1910a..ab2d4e6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,8 +9,9 @@ if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release)
 endif()
 
-set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CUDA_ARCHITECTURES 70;75;80;90)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 if(SKBUILD)
 
 execute_process(
@@ -46,7 +47,7 @@ include(ExternalProject)
 ExternalProject_Add(Eigen3
 		PREFIX             ${EIGEN3_ROOT}
 		GIT_REPOSITORY     https://gitlab.com/libeigen/eigen.git
-		GIT_TAG            3.3.9
+		GIT_TAG            3.4
 
 		CONFIGURE_COMMAND ""
 		BUILD_COMMAND ""
@@ -56,7 +57,6 @@ ExternalProject_Add(Eigen3
 )
 list (APPEND PRJ_INCLUDE_DIRS ${EIGEN3_INCLUDE_DIR})
 
-
 find_package(pybind11 CONFIG)
 list (APPEND PRJ_INCLUDE_DIRS ${PYBIND11_INCLUDE_DIR})
 
@@ -74,7 +74,7 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_HOST_SYSTEM_PROCESSOR
 	endif()
 endif()
 
-list (APPEND PRJ_INCLUDE_DIRS src/qfvm)
+list (APPEND PRJ_INCLUDE_DIRS src/qfvm src/qfvm_clifford)
 pybind11_add_module(${PROJECT_NAME} MODULE src/${PROJECT_NAME}/${PROJECT_NAME}.cpp)
 add_dependencies(${PROJECT_NAME} Eigen3) #must add dependence for ninja
 target_compile_options(${PROJECT_NAME} PUBLIC ${PRJ_COMPILE_OPTIONS})
diff --git a/LICENSE b/LICENSE
index 753842b..2915c0b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -198,4 +198,4 @@
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
-   limitations under the License.
\ No newline at end of file
+   limitations under the License.
diff --git a/examples/quantum_rl/misc/utils.py b/examples/quantum_rl/misc/utils.py
index 2e88173..caf6d22 100644
--- a/examples/quantum_rl/misc/utils.py
+++ b/examples/quantum_rl/misc/utils.py
@@ -7,6 +7,7 @@
 import numpy as np
 import tensorflow as tf
 from models.quantum_models import generate_circuit, get_model_circuit_params
+
 from quafu import QuantumCircuit as quafuQC
 from quafu import Task, User
 
@@ -14,12 +15,12 @@
 def create_exp_dir(path, scripts_to_save=None):
     if not os.path.exists(path):
         os.mkdir(path)
-    print('Experiment dir : {}'.format(path))
+    print("Experiment dir : {}".format(path))
 
     if scripts_to_save is not None:
-        os.mkdir(os.path.join(path, 'scripts'))
+        os.mkdir(os.path.join(path, "scripts"))
         for script in scripts_to_save:
-            dst_file = os.path.join(path, 'scripts', os.path.basename(script))
+            dst_file = os.path.join(path, "scripts", os.path.basename(script))
             shutil.copyfile(script, dst_file)
 
 
@@ -32,7 +33,7 @@ def get_res_exp(res):
     for k, v in prob.items():
         count = 0
         for i in range(len(k)):
-            if k[i] == '1':
+            if k[i] == "1":
                 count += 1
         if count % 2 == 0:
             sumexp += v
@@ -46,28 +47,28 @@ def get_quafu_exp(circuit, n_qubits, backend_quafu, shots):
     Execute circuits on quafu cloud platform and return the expectation.
     """
     # convert Cirq circuts to qasm
-    openqasm = circuit.to_qasm(header='')
-    openqasm = re.sub('//.*\n', '', openqasm)
+    openqasm = circuit.to_qasm(header="")
+    openqasm = re.sub("//.*\n", "", openqasm)
     openqasm = "".join([s for s in openqasm.splitlines(True) if s.strip()])
-    
+
     # fill in with your token, register on website http://quafu.baqis.ac.cn/
     user = User()
     user.save_apitoken(" ")
-    
+
     # initialize to Quafu circuits
     q = quafuQC(n_qubits)
     q.from_openqasm(openqasm)
-    
+
     # create the task
     task = Task()
-      
+
     task.config(backend_quafu, shots, compile=True, priority=3)
     task_id = task.send(q, wait=True).taskid
-    print('task_id:', task_id)
-    
+    print("task_id:", task_id)
+
     # retrieve the result of completed tasks and compute expectations
     task_status = task.retrieve(task_id).task_status
-    if task_status == 'Completed':
+    if task_status == "Completed":
         task = Task()
         res = task.retrieve(task_id)
         OB = get_res_exp(res)
@@ -78,24 +79,24 @@ def get_compiled_gates_depth(circuit, n_qubits, backend_quafu, shots):
     """
     Get the gates and layered circuits of compiled circuits.
     """
-    openqasm = circuit.to_qasm(header='')
-    openqasm = re.sub('//.*\n', '', openqasm)
+    openqasm = circuit.to_qasm(header="")
+    openqasm = re.sub("//.*\n", "", openqasm)
     openqasm = "".join([s for s in openqasm.splitlines(True) if s.strip()])
-    
+
     user = User()
     user.save_apitoken(" ")
-    
+
     q = quafuQC(n_qubits)
     q.from_openqasm(openqasm)
-    
+
     task = Task()
-    
+
     task.config(backend_quafu, shots, compile=True)
     task_id = task.send(q, wait=True).taskid
-    print('task_id:', task_id)
-    
+    print("task_id:", task_id)
+
     task_status = task.retrieve(task_id).task_status
-    if task_status == 'Completed':
+    if task_status == "Completed":
         task = Task()
         res = task.retrieve(task_id)
         gates = res.transpiled_circuit.gates
@@ -107,10 +108,15 @@ class Alternating_(tf.keras.layers.Layer):
     """
     Load observable weights of pre-trained models.
     """
+
     def __init__(self, obsw):
         super(Alternating_, self).__init__()
         self.w = tf.Variable(
-            initial_value=tf.constant(obsw), dtype="float32", trainable=True, name="obsw")
+            initial_value=tf.constant(obsw),
+            dtype="float32",
+            trainable=True,
+            name="obsw",
+        )
 
     def call(self, inputs):
         return tf.matmul(inputs, self.w)
@@ -120,10 +126,14 @@ def get_obs_policy(obsw, beta):
     """
     Output the final policy.
     """
-    process = tf.keras.Sequential([ Alternating_(obsw),
-                                    tf.keras.layers.Lambda(lambda x: x * beta),
-                                    tf.keras.layers.Softmax()
-                                ], name="obs_policy")
+    process = tf.keras.Sequential(
+        [
+            Alternating_(obsw),
+            tf.keras.layers.Lambda(lambda x: x * beta),
+            tf.keras.layers.Softmax(),
+        ],
+        name="obs_policy",
+    )
     return process
 
 
@@ -131,11 +141,23 @@ def get_height(position):
     """
     Get the height of position in MountainCar-v0.
     """
-    return np.sin(3 * position)*.45+.55
+    return np.sin(3 * position) * 0.45 + 0.55
 
 
-def gather_episodes(state_bounds, n_actions, model, n_episodes, env_name, beta, backend, backend_quafu='ScQ-P10', shots=1000, 
-                    n_qubits=4, qubits=None, genotype=None):
+def gather_episodes(
+    state_bounds,
+    n_actions,
+    model,
+    n_episodes,
+    env_name,
+    beta,
+    backend,
+    backend_quafu="ScQ-P10",
+    shots=1000,
+    n_qubits=4,
+    qubits=None,
+    genotype=None,
+):
     """
     Interact with environment, you can choose the backend between `cirq` simulator and `quafu` cloud platform.
     """
@@ -149,43 +171,47 @@ def gather_episodes(state_bounds, n_actions, model, n_episodes, env_name, beta,
 
     while not all(done):
         unfinished_ids = [i for i in range(n_episodes) if not done[i]]
-        normalized_states = [s/state_bounds for i, s in enumerate(states) if not done[i]]
+        normalized_states = [
+            s / state_bounds for i, s in enumerate(states) if not done[i]
+        ]
         # height = [get_height(s[0]) for i, s in enumerate(states) if not done[i]]
 
         for i, state in zip(unfinished_ids, normalized_states):
-            trajectories[i]['states'].append(state)
+            trajectories[i]["states"].append(state)
 
         # Compute policy for all unfinished envs in parallel
         states = tf.convert_to_tensor(normalized_states)
 
-        if backend == 'cirq':
+        if backend == "cirq":
             action_probs = model([states])
-        elif backend == 'quafu':
+        elif backend == "quafu":
             newtheta, newlamda = get_model_circuit_params(qubits, genotype, model)
-            circuit, _, _ = generate_circuit(qubits, genotype, newtheta, newlamda, states.numpy()[0])
+            circuit, _, _ = generate_circuit(
+                qubits, genotype, newtheta, newlamda, states.numpy()[0]
+            )
             taskid, expectation = get_quafu_exp(circuit, n_qubits, backend_quafu, shots)
             tasklist.append(taskid)
             # print('gather_episodes_exp:', expectation)
 
-            obsw = model.get_layer('observables-policy').get_weights()[0]
+            obsw = model.get_layer("observables-policy").get_weights()[0]
             obspolicy = get_obs_policy(obsw, beta)
             action_probs = obspolicy(expectation)
         else:
-            print('This backend is not supported now.')
+            print("This backend is not supported now.")
 
         # Store action and transition all environments to the next state
         states = [None for i in range(n_episodes)]
         for i, policy in zip(unfinished_ids, action_probs.numpy()):
-            trajectories[i]['action_probs'].append(policy)
+            trajectories[i]["action_probs"].append(policy)
             action = np.random.choice(n_actions, p=policy)
             states[i], reward, done[i], _ = envs[i].step(action)
-            trajectories[i]['actions'].append(action)
+            trajectories[i]["actions"].append(action)
             if env_name == "CartPole-v1":
-                trajectories[i]['rewards'].append(reward)
+                trajectories[i]["rewards"].append(reward)
             elif env_name == "MountainCar-v0":
-                trajectories[i]['rewards'].append(reward + get_height(states[i][0]))
+                trajectories[i]["rewards"].append(reward + get_height(states[i][0]))
             else:
-                print('This environment is not supported now.')
+                print("This environment is not supported now.")
 
     return tasklist, trajectories
 
@@ -206,5 +232,3 @@ def compute_returns(rewards_history, gamma):
     returns = returns.tolist()
 
     return returns
-
-
diff --git a/examples/quantum_rl/models/quantum_genotypes.py b/examples/quantum_rl/models/quantum_genotypes.py
index 746424f..ad611d6 100644
--- a/examples/quantum_rl/models/quantum_genotypes.py
+++ b/examples/quantum_rl/models/quantum_genotypes.py
@@ -1,130 +1,96 @@
 from collections import namedtuple
 
-Genotype = namedtuple('Genotype', 'measure vpqc dpqc entangle')
+Genotype = namedtuple("Genotype", "measure vpqc dpqc entangle")
 
-PRIMITIVES = [
-    'measurement',
-    'variationalPQC',
-    'dataencodingPQC',
-    'entanglement'
-]
+PRIMITIVES = ["measurement", "variationalPQC", "dataencodingPQC", "entanglement"]
 
 NSGANet_id10 = Genotype(
-    measure=[
-        ('measurement', 12)
-    ], 
+    measure=[("measurement", 12)],
     vpqc=[
-        ('variationalPQC', 1), 
-        ('variationalPQC', 2), 
-        ('variationalPQC', 4), 
-        ('variationalPQC', 5), 
-        ('variationalPQC', 7), 
-        ('variationalPQC', 10)
-    ], 
-    dpqc=[
-        ('dataencodingPQC', 3), 
-        ('dataencodingPQC', 9), 
-        ('dataencodingPQC', 11)
-    ], 
-    entangle=[
-        ('entanglement', 0), 
-        ('entanglement', 6), 
-        ('entanglement', 8)
-    ]
+        ("variationalPQC", 1),
+        ("variationalPQC", 2),
+        ("variationalPQC", 4),
+        ("variationalPQC", 5),
+        ("variationalPQC", 7),
+        ("variationalPQC", 10),
+    ],
+    dpqc=[("dataencodingPQC", 3), ("dataencodingPQC", 9), ("dataencodingPQC", 11)],
+    entangle=[("entanglement", 0), ("entanglement", 6), ("entanglement", 8)],
 )
 
 NSGANet_id21 = Genotype(
-    measure=[
-        ('measurement', 16)
-    ], 
+    measure=[("measurement", 16)],
     vpqc=[
-        ('variationalPQC', 1), 
-        ('variationalPQC', 2), 
-        ('variationalPQC', 5), 
-        ('variationalPQC', 6), 
-        ('variationalPQC', 11)
-    ], 
+        ("variationalPQC", 1),
+        ("variationalPQC", 2),
+        ("variationalPQC", 5),
+        ("variationalPQC", 6),
+        ("variationalPQC", 11),
+    ],
     dpqc=[
-        ('dataencodingPQC', 0), 
-        ('dataencodingPQC', 3), 
-        ('dataencodingPQC', 4), 
-        ('dataencodingPQC', 7), 
-        ('dataencodingPQC', 9), 
-        ('dataencodingPQC', 10), 
-        ('dataencodingPQC', 13)
-    ], 
+        ("dataencodingPQC", 0),
+        ("dataencodingPQC", 3),
+        ("dataencodingPQC", 4),
+        ("dataencodingPQC", 7),
+        ("dataencodingPQC", 9),
+        ("dataencodingPQC", 10),
+        ("dataencodingPQC", 13),
+    ],
     entangle=[
-        ('entanglement', 8), 
-        ('entanglement', 12), 
-        ('entanglement', 14), 
-        ('entanglement', 15)
-    ]
+        ("entanglement", 8),
+        ("entanglement", 12),
+        ("entanglement", 14),
+        ("entanglement", 15),
+    ],
 )
 
 NSGANet_id97 = Genotype(
-    measure=[
-        ('measurement', 5)
-    ], 
-    vpqc=[
-        ('variationalPQC', 1)
-    ], 
-    dpqc=[
-        ('dataencodingPQC', 0), 
-        ('dataencodingPQC', 3), 
-        ('dataencodingPQC', 4)
-    ], 
-    entangle=[
-        ('entanglement', 2)
-    ]
+    measure=[("measurement", 5)],
+    vpqc=[("variationalPQC", 1)],
+    dpqc=[("dataencodingPQC", 0), ("dataencodingPQC", 3), ("dataencodingPQC", 4)],
+    entangle=[("entanglement", 2)],
 )
 
 Layer5_CP = Genotype(
-    measure=[
-        ('measurement', 15)
-    ], 
+    measure=[("measurement", 15)],
     vpqc=[
-        ('variationalPQC', 0),
-        ('variationalPQC', 3), 
-        ('variationalPQC', 6), 
-        ('variationalPQC', 9), 
-        ('variationalPQC', 12)
-    ], 
+        ("variationalPQC", 0),
+        ("variationalPQC", 3),
+        ("variationalPQC", 6),
+        ("variationalPQC", 9),
+        ("variationalPQC", 12),
+    ],
     dpqc=[
-        ('dataencodingPQC', 2), 
-        ('dataencodingPQC', 5), 
-        ('dataencodingPQC', 8),
-        ('dataencodingPQC', 11), 
-        ('dataencodingPQC', 14)
-    ], 
+        ("dataencodingPQC", 2),
+        ("dataencodingPQC", 5),
+        ("dataencodingPQC", 8),
+        ("dataencodingPQC", 11),
+        ("dataencodingPQC", 14),
+    ],
     entangle=[
-        ('entanglement', 1), 
-        ('entanglement', 4),
-        ('entanglement', 7), 
-        ('entanglement', 10),
-        ('entanglement', 13)
-    ]
+        ("entanglement", 1),
+        ("entanglement", 4),
+        ("entanglement", 7),
+        ("entanglement", 10),
+        ("entanglement", 13),
+    ],
 )
 
 Eqas_PQC = Genotype(
-    measure=[
-        ('measurement', 12)
-    ], 
-    vpqc=[
-        ('variationalPQC', 5), 
-        ('variationalPQC', 7)
-    ], 
+    measure=[("measurement", 12)],
+    vpqc=[("variationalPQC", 5), ("variationalPQC", 7)],
     dpqc=[
-        ('dataencodingPQC', 2), 
-        ('dataencodingPQC', 6), 
-        ('dataencodingPQC', 9),
-        ('dataencodingPQC', 11)
-    ], 
+        ("dataencodingPQC", 2),
+        ("dataencodingPQC", 6),
+        ("dataencodingPQC", 9),
+        ("dataencodingPQC", 11),
+    ],
     entangle=[
-        ('entanglement', 0), 
-        ('entanglement', 1),
-        ('entanglement', 3), 
-        ('entanglement', 4),
-        ('entanglement', 8),
-        ('entanglement', 10)
-    ]
-)
\ No newline at end of file
+        ("entanglement", 0),
+        ("entanglement", 1),
+        ("entanglement", 3),
+        ("entanglement", 4),
+        ("entanglement", 8),
+        ("entanglement", 10),
+    ],
+)
diff --git a/examples/quantum_rl/models/quantum_models.py b/examples/quantum_rl/models/quantum_models.py
index 5e90b52..d35a4cc 100644
--- a/examples/quantum_rl/models/quantum_models.py
+++ b/examples/quantum_rl/models/quantum_models.py
@@ -12,7 +12,7 @@ def generate_circuit(qubits, genotype, newtheta=None, newlamda=None, state=None)
     op_dpqc, pos_dpqc = zip(*genotype.dpqc)
     op_entangle, pos_entangle = zip(*genotype.entangle)
     op_measure, pos_measure = zip(*genotype.measure)
-    
+
     dict = {}
     for name, pos in zip(op_vpqc, pos_vpqc):
         dict[pos] = name
@@ -30,26 +30,26 @@ def generate_circuit(qubits, genotype, newtheta=None, newlamda=None, state=None)
     p_count = 0
     i_count = 0
     for i in range(length):
-        if dict[i] == 'variationalPQC':
+        if dict[i] == "variationalPQC":
             cir, pa = OPS[dict[i]](qubits, p_count, newtheta)
             circuit += cir
             params += pa
             p_count += 1
-        elif dict[i] == 'dataencodingPQC':
+        elif dict[i] == "dataencodingPQC":
             cir, inp = OPS[dict[i]](qubits, i, i_count, newlamda, state)
             circuit += cir
             inputs += inp
-            i_count += 1            
-        elif dict[i] == 'entanglement':
+            i_count += 1
+        elif dict[i] == "entanglement":
             cir = OPS[dict[i]](qubits)
             circuit += cir
-        elif dict[i] == 'measurement':
+        elif dict[i] == "measurement":
             pass
         else:
-            raise NameError('Unknown quantum genotype operation')
+            raise NameError("Unknown quantum genotype operation")
 
     # Last varitional layer
-    cir, pa = OPS['variationalPQC'](qubits, len(pos_vpqc), newtheta)
+    cir, pa = OPS["variationalPQC"](qubits, len(pos_vpqc), newtheta)
     circuit += cir
     params += pa
 
@@ -58,7 +58,7 @@ def generate_circuit(qubits, genotype, newtheta=None, newlamda=None, state=None)
 
 def get_model_circuit_params(qubits, genotype, model):
     """Get parameters from trained model"""
-    theta, lamda = model.get_layer('nsganet_PQC').get_weights()
+    theta, lamda = model.get_layer("nsganet_PQC").get_weights()
     theta = theta[0]
 
     _, theta_symbols, input_symbols = generate_circuit(qubits, genotype)
@@ -71,14 +71,17 @@ def get_model_circuit_params(qubits, genotype, model):
     newlamda = []
     for i in range(len(theta)):
         newtheta.append(theta[newindex[i]])
-    for i in range(len(theta), len(theta)+len(lamda)):
-        newlamda.append(lamda[newindex[i]-len(theta)])
+    for i in range(len(theta), len(theta) + len(lamda)):
+        newlamda.append(lamda[newindex[i] - len(theta)])
     return newtheta, newlamda
 
 
 class NSGANetPQC(tf.keras.layers.Layer):
     """Define NSGANet PQC based on keras layer"""
-    def __init__(self, qubits, genotype, observables, activation="linear", name="nsganet_PQC"):
+
+    def __init__(
+        self, qubits, genotype, observables, activation="linear", name="nsganet_PQC"
+    ):
         super(NSGANetPQC, self).__init__(name=name)
         self.n_qubits = len(qubits)
         _, pos_dpqc = zip(*genotype.dpqc)
@@ -89,7 +92,8 @@ def __init__(self, qubits, genotype, observables, activation="linear", name="nsg
         theta_init = tf.random_uniform_initializer(minval=0.0, maxval=np.pi)
         self.theta = tf.Variable(
             initial_value=theta_init(shape=(1, len(theta_symbols)), dtype="float32"),
-            trainable=True, name="thetas"
+            trainable=True,
+            name="thetas",
         )
 
         lmbd_init = tf.ones(shape=(self.n_qubits * self.n_layers,))
@@ -103,7 +107,7 @@ def __init__(self, qubits, genotype, observables, activation="linear", name="nsg
 
         self.activation = activation
         self.empty_circuit = tfq.convert_to_tensor([cirq.Circuit()])
-        self.computation_layer = tfq.layers.ControlledPQC(circuit, observables)        
+        self.computation_layer = tfq.layers.ControlledPQC(circuit, observables)
 
     def call(self, inputs):
         # inputs[0] = encoding data for the state.
@@ -122,16 +126,29 @@ def call(self, inputs):
 
 class Alternating(tf.keras.layers.Layer):
     """Apply action-specific weights."""
+
     def __init__(self, output_dim, env):
         super(Alternating, self).__init__()
         if env == "CartPole-v1":
             self.w = tf.Variable(
-                initial_value=tf.constant([[(-1.)**i for i in range(output_dim)]]), dtype="float32",
-                trainable=True, name="obs-weights")
+                initial_value=tf.constant([[(-1.0) ** i for i in range(output_dim)]]),
+                dtype="float32",
+                trainable=True,
+                name="obs-weights",
+            )
         elif env == "MountainCar-v0":
             self.w = tf.Variable(
-                initial_value=tf.constant([[(-1.)**i for i in range(output_dim)], [(-1.)**i for i in range(output_dim)],
-                [(-1.)**i for i in range(output_dim)]]), dtype="float32", trainable=True, name="obs-weights")
+                initial_value=tf.constant(
+                    [
+                        [(-1.0) ** i for i in range(output_dim)],
+                        [(-1.0) ** i for i in range(output_dim)],
+                        [(-1.0) ** i for i in range(output_dim)],
+                    ]
+                ),
+                dtype="float32",
+                trainable=True,
+                name="obs-weights",
+            )
 
     def call(self, inputs):
         return tf.matmul(inputs, self.w)
@@ -139,20 +156,19 @@ def call(self, inputs):
 
 def generate_model_policy(qubits, genotype, n_actions, beta, observables, env):
     """Generate a Keras model for a NSGANet PQC policy."""
-    input_tensor = tf.keras.Input(shape=(len(qubits), ), dtype=tf.dtypes.float32, name='input')
+    input_tensor = tf.keras.Input(
+        shape=(len(qubits),), dtype=tf.dtypes.float32, name="input"
+    )
     nsganet_pqc = NSGANetPQC(qubits, genotype, observables)([input_tensor])
-    process = tf.keras.Sequential([
-        Alternating(n_actions, env),
-        tf.keras.layers.Lambda(lambda x: x * beta),
-        tf.keras.layers.Softmax()
-    ], name="observables-policy")
+    process = tf.keras.Sequential(
+        [
+            Alternating(n_actions, env),
+            tf.keras.layers.Lambda(lambda x: x * beta),
+            tf.keras.layers.Softmax(),
+        ],
+        name="observables-policy",
+    )
     policy = process(nsganet_pqc)
     model = tf.keras.Model(inputs=[input_tensor], outputs=policy)
 
     return model
-
-
-
-
-
-
diff --git a/examples/quantum_rl/models/quantum_operations.py b/examples/quantum_rl/models/quantum_operations.py
index b504c8a..75bfd56 100644
--- a/examples/quantum_rl/models/quantum_operations.py
+++ b/examples/quantum_rl/models/quantum_operations.py
@@ -4,9 +4,13 @@
 
 # Organize components defined below to quantum operations
 OPS = {
-    'variationalPQC': lambda qubits, position, params: generate_vpqc(qubits, position, params),
-    'dataencodingPQC': lambda qubits, position, count, params, state: generate_dpqc(qubits, position, count, params, state),
-    'entanglement': lambda qubits: generate_entangle(qubits)
+    "variationalPQC": lambda qubits, position, params: generate_vpqc(
+        qubits, position, params
+    ),
+    "dataencodingPQC": lambda qubits, position, count, params, state: generate_dpqc(
+        qubits, position, count, params, state
+    ),
+    "entanglement": lambda qubits: generate_entangle(qubits),
 }
 
 
@@ -15,9 +19,11 @@ def one_qubit_rotation(qubit, symbols):
     Return Cirq gates that apply a rotation of the bloch sphere about the X,
     Y and Z axis, specified by the values in `symbols`.
     """
-    return [cirq.rx(symbols[0])(qubit),
-            cirq.ry(symbols[1])(qubit),
-            cirq.rz(symbols[2])(qubit)]
+    return [
+        cirq.rx(symbols[0])(qubit),
+        cirq.ry(symbols[1])(qubit),
+        cirq.rz(symbols[2])(qubit),
+    ]
 
 
 def entangling_layer(qubits):
@@ -35,18 +41,22 @@ def generate_vpqc(qubits, position, params=None):
     # Number of qubits
     n_qubits = len(qubits)
 
-    # Sympy symbols or load parameters for variational angles 
+    # Sympy symbols or load parameters for variational angles
     if params == None:
-        params = sympy.symbols(f'theta({3*position*n_qubits}:{3*(position+1)*n_qubits})')
+        params = sympy.symbols(
+            f"theta({3*position*n_qubits}:{3*(position+1)*n_qubits})"
+        )
     else:
-        params = params[3*position*n_qubits:3*(position+1)*n_qubits]
+        params = params[3 * position * n_qubits : 3 * (position + 1) * n_qubits]
     params = np.asarray(params).reshape((n_qubits, 3))
 
     # Define circuit
     circuit = cirq.Circuit()
 
     # Variational layer
-    circuit += cirq.Circuit(one_qubit_rotation(q, params[i]) for i, q in enumerate(qubits))
+    circuit += cirq.Circuit(
+        one_qubit_rotation(q, params[i]) for i, q in enumerate(qubits)
+    )
 
     return circuit, list(params.flat)
 
@@ -58,9 +68,9 @@ def generate_dpqc(qubits, position, count, params=None, state=None):
 
     # Sympy symbols or load parameters for encoding angles
     if params == None:
-        inputs = sympy.symbols(f'x{position}'+f'_(0:{n_qubits})')
+        inputs = sympy.symbols(f"x{position}_(0:{n_qubits})")
     else:
-        inputs = params[count*n_qubits:(count+1)*n_qubits]
+        inputs = params[count * n_qubits : (count + 1) * n_qubits]
         for i in range(len(state)):
             inputs[i] *= state[i]
     inputs = np.asarray(inputs).reshape((n_qubits))
@@ -78,19 +88,7 @@ def generate_entangle(qubits):
     """Prepare a entangle circuit on `qubits`."""
     # Define circuit
     circuit = cirq.Circuit()
-    
+
     circuit += entangling_layer(qubits)
 
     return circuit
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/examples/quantum_rl/search/nsganet.py b/examples/quantum_rl/search/nsganet.py
index 8c72df6..69552ad 100644
--- a/examples/quantum_rl/search/nsganet.py
+++ b/examples/quantum_rl/search/nsganet.py
@@ -6,8 +6,7 @@
 from pymoo.operators.crossover.point_crossover import PointCrossover
 from pymoo.operators.mutation.polynomial_mutation import PolynomialMutation
 from pymoo.operators.sampling.random_sampling import RandomSampling
-from pymoo.operators.selection.tournament_selection import (
-    TournamentSelection, compare)
+from pymoo.operators.selection.tournament_selection import TournamentSelection, compare
 from pymoo.util.display import disp_multi_objective
 from pymoo.util.dominator import Dominator
 from pymoo.util.non_dominated_sorting import NonDominatedSorting
@@ -20,12 +19,11 @@
 
 
 class NSGANet(GeneticAlgorithm):
-
     def __init__(self, **kwargs):
-        kwargs['individual'] = Individual(rank=np.inf, crowding=-1)
+        kwargs["individual"] = Individual(rank=np.inf, crowding=-1)
         super().__init__(**kwargs)
 
-        self.tournament_type = 'comp_by_dom_and_crowding'
+        self.tournament_type = "comp_by_dom_and_crowding"
         self.func_display_attrs = disp_multi_objective
 
 
@@ -42,34 +40,46 @@ def binary_tournament(pop, P, algorithm, **kwargs):
     S = np.full(P.shape[0], np.nan)
 
     for i in range(P.shape[0]):
-
         a, b = P[i, 0], P[i, 1]
 
         # if at least one solution is infeasible
         if pop[a].CV > 0.0 or pop[b].CV > 0.0:
-            S[i] = compare(a, pop[a].CV, b, pop[b].CV, method='smaller_is_better', return_random_if_equal=True)
+            S[i] = compare(
+                a,
+                pop[a].CV,
+                b,
+                pop[b].CV,
+                method="smaller_is_better",
+                return_random_if_equal=True,
+            )
 
         # both solutions are feasible
         else:
-
-            if tournament_type == 'comp_by_dom_and_crowding':
+            if tournament_type == "comp_by_dom_and_crowding":
                 rel = Dominator.get_relation(pop[a].F, pop[b].F)
                 if rel == 1:
                     S[i] = a
                 elif rel == -1:
                     S[i] = b
 
-            elif tournament_type == 'comp_by_rank_and_crowding':
-                S[i] = compare(a, pop[a].rank, b, pop[b].rank,
-                               method='smaller_is_better')
+            elif tournament_type == "comp_by_rank_and_crowding":
+                S[i] = compare(
+                    a, pop[a].rank, b, pop[b].rank, method="smaller_is_better"
+                )
 
             else:
                 raise Exception("Unknown tournament type.")
 
             # if rank or domination relation didn't make a decision compare by crowding
             if np.isnan(S[i]):
-                S[i] = compare(a, pop[a].get("crowding"), b, pop[b].get("crowding"),
-                               method='larger_is_better', return_random_if_equal=True)
+                S[i] = compare(
+                    a,
+                    pop[a].get("crowding"),
+                    b,
+                    pop[b].get("crowding"),
+                    method="larger_is_better",
+                    return_random_if_equal=True,
+                )
 
     return S[:, None].astype(np.int)
 
@@ -80,12 +90,10 @@ def binary_tournament(pop, P, algorithm, **kwargs):
 
 
 class RankAndCrowdingSurvival(Survival):
-
     def __init__(self) -> None:
         super().__init__(True)
 
     def _do(self, pop, n_survive, D=None, **kwargs):
-
         # get the objective space values and objects
         F = pop.get("F")
 
@@ -96,7 +104,6 @@ def _do(self, pop, n_survive, D=None, **kwargs):
         fronts = NonDominatedSorting().do(F, n_stop_if_ranked=n_survive)
 
         for k, front in enumerate(fronts):
-
             # calculate the crowding distance of the front
             crowding_of_front = calc_crowding_distance(F[front, :])
 
@@ -107,8 +114,10 @@ def _do(self, pop, n_survive, D=None, **kwargs):
 
             # current front sorted by crowding distance if splitting
             if len(survivors) + len(front) > n_survive:
-                I = randomized_argsort(crowding_of_front, order='descending', method='numpy')
-                I = I[:(n_survive - len(survivors))]
+                I = randomized_argsort(
+                    crowding_of_front, order="descending", method="numpy"
+                )
+                I = I[: (n_survive - len(survivors))]
 
             # otherwise take the whole front unsorted
             else:
@@ -121,7 +130,7 @@ def _do(self, pop, n_survive, D=None, **kwargs):
 
 
 def calc_crowding_distance(F):
-    infinity = 1e+14
+    infinity = 1e14
 
     n_points = F.shape[0]
     n_obj = F.shape[1]
@@ -129,16 +138,16 @@ def calc_crowding_distance(F):
     if n_points <= 2:
         return np.full(n_points, infinity)
     else:
-
         # sort each column and get index
-        I = np.argsort(F, axis=0, kind='mergesort')
+        I = np.argsort(F, axis=0, kind="mergesort")
 
         # now really sort the whole array
         F = F[I, np.arange(n_obj)]
 
         # get the distance to the last element in sorted list and replace zeros with actual values
-        dist = np.concatenate([F, np.full((1, n_obj), np.inf)]) \
-               - np.concatenate([np.full((1, n_obj), -np.inf), F])
+        upper_padded = np.concatenate([F, np.full((1, n_obj), np.inf)])
+        lower_padded = np.concatenate([np.full((1, n_obj), -np.inf), F])
+        dist = upper_padded - lower_padded
 
         index_dist_is_zero = np.where(dist == 0)
 
@@ -161,7 +170,13 @@ def calc_crowding_distance(F):
 
         # sum up the distance to next and last and norm by objectives - also reorder from sorted list
         J = np.argsort(I, axis=0)
-        crowding = np.sum(dist_to_last[J, np.arange(n_obj)] + dist_to_next[J, np.arange(n_obj)], axis=1) / n_obj
+        crowding = (
+            np.sum(
+                dist_to_last[J, np.arange(n_obj)] + dist_to_next[J, np.arange(n_obj)],
+                axis=1,
+            )
+            / n_obj
+        )
 
     # replace infinity with a large number
     crowding[np.isinf(crowding)] = infinity
@@ -175,14 +190,15 @@ def calc_crowding_distance(F):
 
 
 def nsganet(
-        pop_size=100,
-        sampling=RandomSampling(var_type=np.int),
-        selection=TournamentSelection(func_comp=binary_tournament),
-        crossover=PointCrossover(n_points=2),
-        mutation=PolynomialMutation(eta=3, var_type=np.int),
-        eliminate_duplicates=True,
-        n_offsprings=None,
-        **kwargs):
+    pop_size=100,
+    sampling=RandomSampling(var_type=int),  # np.int alias was removed in NumPy 1.24
+    selection=TournamentSelection(func_comp=binary_tournament),
+    crossover=PointCrossover(n_points=2),
+    mutation=PolynomialMutation(eta=3, var_type=int),
+    eliminate_duplicates=True,
+    n_offsprings=None,
+    **kwargs
+):
     """
 
     Parameters
@@ -203,15 +219,17 @@ def nsganet(
 
     """
 
-    return NSGANet(pop_size=pop_size,
-                   sampling=sampling,
-                   selection=selection,
-                   crossover=crossover,
-                   mutation=mutation,
-                   survival=RankAndCrowdingSurvival(),
-                   eliminate_duplicates=eliminate_duplicates,
-                   n_offsprings=n_offsprings,
-                   **kwargs)
+    return NSGANet(
+        pop_size=pop_size,
+        sampling=sampling,
+        selection=selection,
+        crossover=crossover,
+        mutation=mutation,
+        survival=RankAndCrowdingSurvival(),
+        eliminate_duplicates=eliminate_duplicates,
+        n_offsprings=n_offsprings,
+        **kwargs
+    )
 
 
 parse_doc_string(nsganet)
diff --git a/examples/quantum_rl/search/quantum_encoding.py b/examples/quantum_rl/search/quantum_encoding.py
index a37c3fd..a7eba4a 100644
--- a/examples/quantum_rl/search/quantum_encoding.py
+++ b/examples/quantum_rl/search/quantum_encoding.py
@@ -1,17 +1,11 @@
 from collections import namedtuple
-from hashlib import new
 
 import numpy as np
 
-Genotype = namedtuple('Genotype', 'measure vpqc dpqc entangle')
+Genotype = namedtuple("Genotype", "measure vpqc dpqc entangle")
 
 # what you want to search should be defined here and in quantum_operations
-PRIMITIVES = [
-    'measurement',
-    'variationalPQC',
-    'dataencodingPQC',
-    'entanglement'
-]
+PRIMITIVES = ["measurement", "variationalPQC", "dataencodingPQC", "entanglement"]
 
 
 def convert2arch(bit_string):
@@ -22,19 +16,19 @@ def convert2arch(bit_string):
     entangle = []
     new_bit = 0
     for i in range(len(bit_string)):
-        if PRIMITIVES[bit_string[i]] == 'variationalPQC':
+        if PRIMITIVES[bit_string[i]] == "variationalPQC":
             vpqc.append((PRIMITIVES[bit_string[i]], i))
-        elif PRIMITIVES[bit_string[i]] == 'dataencodingPQC':
+        elif PRIMITIVES[bit_string[i]] == "dataencodingPQC":
             dpqc.append((PRIMITIVES[bit_string[i]], i))
-        elif PRIMITIVES[bit_string[i]] == 'entanglement':
+        elif PRIMITIVES[bit_string[i]] == "entanglement":
             entangle.append((PRIMITIVES[bit_string[i]], i))
-        elif PRIMITIVES[bit_string[i]] == 'measurement':
+        elif PRIMITIVES[bit_string[i]] == "measurement":
             measure.append((PRIMITIVES[bit_string[i]], i))
             if vpqc != [] and dpqc != [] and entangle != []:
-                new_bit = bit_string[0:(i+1)]
+                new_bit = bit_string[0 : (i + 1)]
                 break
         else:
-            raise NameError('Unknown quantum architecture.')
+            raise NameError("Unknown quantum architecture.")
     if type(new_bit) == int:
         new_bit = bit_string
     new_bit = list(new_bit)
@@ -50,17 +44,16 @@ def convert2arch(bit_string):
         dpqc = []
         entangle = []
         for i in range(len(new_bit)):
-            if PRIMITIVES[new_bit[i]] == 'variationalPQC':
+            if PRIMITIVES[new_bit[i]] == "variationalPQC":
                 vpqc.append((PRIMITIVES[new_bit[i]], i))
-            elif PRIMITIVES[new_bit[i]] == 'dataencodingPQC':
+            elif PRIMITIVES[new_bit[i]] == "dataencodingPQC":
                 dpqc.append((PRIMITIVES[new_bit[i]], i))
-            elif PRIMITIVES[new_bit[i]] == 'entanglement':
+            elif PRIMITIVES[new_bit[i]] == "entanglement":
                 entangle.append((PRIMITIVES[new_bit[i]], i))
-            elif PRIMITIVES[new_bit[i]] == 'measurement':
+            elif PRIMITIVES[new_bit[i]] == "measurement":
                 measure.append((PRIMITIVES[new_bit[i]], i))
             else:
-                raise NameError('Unknown quantum architecture.')
+                raise NameError("Unknown quantum architecture.")
         g = Genotype(measure=measure, vpqc=vpqc, dpqc=dpqc, entangle=entangle)
-    
-    return new_bit, g
 
+    return new_bit, g
diff --git a/examples/quantum_rl/search/quantum_evolution_search.py b/examples/quantum_rl/search/quantum_evolution_search.py
index df5d7ee..d9a685d 100644
--- a/examples/quantum_rl/search/quantum_evolution_search.py
+++ b/examples/quantum_rl/search/quantum_evolution_search.py
@@ -4,7 +4,7 @@
 import os
 import sys
 
-sys.path.insert(0, ' ')
+sys.path.insert(0, " ")
 import time
 from functools import reduce
 
@@ -17,35 +17,67 @@
 from search import quantum_encoding, quantum_train_search
 
 parser = argparse.ArgumentParser("Multi-objetive Genetic Algorithm for quantum NAS")
-parser.add_argument('--save', type=str, default='quantumGA', help='experiment name')
-parser.add_argument('--n_var', type=int, default=30, help='the maximum length of architecture')
-parser.add_argument('--pop_size', type=int, default=10, help='population size of networks')
-parser.add_argument('--n_gens', type=int, default=10, help='number of generation')
-parser.add_argument('--n_offspring', type=int, default=10, help='number of offspring created per generation')
-parser.add_argument('--n_episodes', type=int, default=300, help='number of episodes to train during architecture search')
+parser.add_argument("--save", type=str, default="quantumGA", help="experiment name")
+parser.add_argument(
+    "--n_var", type=int, default=30, help="the maximum length of architecture"
+)
+parser.add_argument(
+    "--pop_size", type=int, default=10, help="population size of networks"
+)
+parser.add_argument("--n_gens", type=int, default=10, help="number of generation")
+parser.add_argument(
+    "--n_offspring",
+    type=int,
+    default=10,
+    help="number of offspring created per generation",
+)
+parser.add_argument(
+    "--n_episodes",
+    type=int,
+    default=300,
+    help="number of episodes to train during architecture search",
+)
 
 args = parser.parse_args(args=[])
-args.save = 'search-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
+args.save = "search-{}-{}".format(args.save, time.strftime("%Y%m%d-%H%M%S"))
 create_exp_dir(args.save)
 
-log_format = '%(asctime)s %(message)s'
-logging.basicConfig(stream=sys.stdout, level=logging.INFO,
-                    format=log_format, datefmt='%m/%d %I:%M:%S %p')
-fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
+log_format = "%(asctime)s %(message)s"
+logging.basicConfig(
+    stream=sys.stdout,
+    level=logging.INFO,
+    format=log_format,
+    datefmt="%m/%d %I:%M:%S %p",
+)
+fh = logging.FileHandler(os.path.join(args.save, "log.txt"))
 fh.setFormatter(logging.Formatter(log_format))
 logging.getLogger().addHandler(fh)
 
 pop_hist = []  # keep track of every evaluated architecture
 
+
 # ---------------------------------------------------------------------------------------------------------
 # Define your NAS Problem
 # ---------------------------------------------------------------------------------------------------------
 class NAS(Problem):
     """Define the multi-objetive problem of quantum architecture search.
-       The first aim is to maximize the perfoemance of the task and the second need is control the number of entanglements."""
+    The first aim is to maximize the performance of the task and the second is to control the number of entanglements.
+    """
+
     # first define the NAS problem (inherit from pymop)
-    def __init__(self, qubits, n_actions, observables, n_var=30, n_obj=2, n_constr=0, 
-                 lb=None, ub=None, n_episodes=300, save_dir=None):
+    def __init__(
+        self,
+        qubits,
+        n_actions,
+        observables,
+        n_var=30,
+        n_obj=2,
+        n_constr=0,
+        lb=None,
+        ub=None,
+        n_episodes=300,
+        save_dir=None,
+    ):
         super().__init__(n_var=n_var, n_obj=n_obj, n_constr=n_constr, type_var=np.int)
         self.xl = lb
         self.xu = ub
@@ -62,32 +94,36 @@ def _evaluate(self, x, out, *args, **kwargs):
         objs = np.full((x.shape[0], self.n_obj), np.nan)
         for i in range(x.shape[0]):
             arch_id = self._n_evaluated + 1
-            print('\n')
-            logging.info('Network id = {}'.format(arch_id))
+            print("\n")
+            logging.info("Network id = {}".format(arch_id))
 
             bit_string = x[i, :]
             nb, _ = quantum_encoding.convert2arch(bit_string)
             if list(nb) not in self.nb_list:
                 self.nb_list.append(list(nb))
-                performance = quantum_train_search.main(bit_string, qubits=self.qubits, n_actions=self.n_actions,
-                                                        observables=self.observables,
-                                                        n_episodes=self.n_episodes,
-                                                        save='arch_{}'.format(arch_id),
-                                                        expr_root=self._save_dir)
-            
+                performance = quantum_train_search.main(
+                    bit_string,
+                    qubits=self.qubits,
+                    n_actions=self.n_actions,
+                    observables=self.observables,
+                    n_episodes=self.n_episodes,
+                    save="arch_{}".format(arch_id),
+                    expr_root=self._save_dir,
+                )
 
                 # all objectives assume to be MINIMIZED !!!!!
                 objs[i, 0] = -np.mean(performance)
-                objs[i, 1] = np.sum(nb==3)
+                objs[i, 1] = np.sum(nb == 3)
             else:
-                objs[i, 0] = objs[i-1, 0]
-                objs[i, 1] = objs[i-1, 1]
+                objs[i, 0] = objs[i - 1, 0]
+                objs[i, 1] = objs[i - 1, 1]
             self._n_evaluated += 1
 
         out["F"] = objs
         # if your NAS problem has constraints, use the following line to set constraints
         # out["G"] = np.column_stack([g1, g2, g3, g4, g5, g6]) in case 6 constraints
 
+
 # ---------------------------------------------------------------------------------------------------------
 # Define what statistics to print or save for each generation
 # ---------------------------------------------------------------------------------------------------------
@@ -99,12 +135,26 @@ def do_every_generations(algorithm):
 
     # report generation info to files
     logging.info("generation = {}".format(gen))
-    logging.info("population collected rewards: best = {}, mean = {}, "
-                 "median = {}, worst = {}, best_pos = {}".format(-np.min(pop_obj[:, 0]), -np.mean(pop_obj[:, 0]),
-                                                  -np.median(pop_obj[:, 0]), -np.max(pop_obj[:, 0]), np.where(pop_obj[:, 0]==np.min(pop_obj[:, 0]))))
-    logging.info("population entangle number: best = {}, mean = {}, "
-                 "median = {}, worst = {}, best_pos = {}".format(np.min(pop_obj[:, 1]), np.mean(pop_obj[:, 1]),
-                                                  np.median(pop_obj[:, 1]), np.max(pop_obj[:, 1]), np.where(pop_obj[:, 1]==np.min(pop_obj[:, 1]))))
+    logging.info(
+        "population collected rewards: best = {}, mean = {}, "
+        "median = {}, worst = {}, best_pos = {}".format(
+            -np.min(pop_obj[:, 0]),
+            -np.mean(pop_obj[:, 0]),
+            -np.median(pop_obj[:, 0]),
+            -np.max(pop_obj[:, 0]),
+            np.where(pop_obj[:, 0] == np.min(pop_obj[:, 0])),
+        )
+    )
+    logging.info(
+        "population entangle number: best = {}, mean = {}, "
+        "median = {}, worst = {}, best_pos = {}".format(
+            np.min(pop_obj[:, 1]),
+            np.mean(pop_obj[:, 1]),
+            np.median(pop_obj[:, 1]),
+            np.max(pop_obj[:, 1]),
+            np.where(pop_obj[:, 1] == np.min(pop_obj[:, 1])),
+        )
+    )
 
 
 def main(qubits, n_actions, observables):
@@ -117,26 +167,36 @@ def main(qubits, n_actions, observables):
     lb = np.zeros(args.n_var)
     ub = np.ones(args.n_var) * 3
 
-    problem = NAS(qubits, n_actions, observables, lb=lb, ub=ub, n_var=args.n_var,
-                  n_episodes=args.n_episodes, save_dir=args.save)
+    problem = NAS(
+        qubits,
+        n_actions,
+        observables,
+        lb=lb,
+        ub=ub,
+        n_var=args.n_var,
+        n_episodes=args.n_episodes,
+        save_dir=args.save,
+    )
 
     # configure the nsga-net method
-    method = engine.nsganet(pop_size=args.pop_size,
-                            n_offsprings=args.n_offspring,
-                            eliminate_duplicates=True)
+    method = engine.nsganet(
+        pop_size=args.pop_size, n_offsprings=args.n_offspring, eliminate_duplicates=True
+    )
 
-    res = minimize(problem,
-                   method,
-                   callback=do_every_generations,
-                   termination=('n_gen', args.n_gens))
+    res = minimize(
+        problem,
+        method,
+        callback=do_every_generations,
+        termination=("n_gen", args.n_gens),
+    )
 
     return
 
 
 if __name__ == "__main__":
-    n_qubits = 4 # Dimension of the state vectors in CartPole
-    n_actions = 2 # Number of actions in CartPole
+    n_qubits = 4  # Dimension of the state vectors in CartPole
+    n_actions = 2  # Number of actions in CartPole
     qubits = cirq.GridQubit.rect(1, n_qubits)
     ops = [cirq.Z(q) for q in qubits]
-    observables = [reduce((lambda x, y: x * y), ops)] # Z_0*Z_1*Z_2*Z_3
-    main(qubits, n_actions, observables)
\ No newline at end of file
+    observables = [reduce((lambda x, y: x * y), ops)]  # Z_0*Z_1*Z_2*Z_3
+    main(qubits, n_actions, observables)
diff --git a/examples/quantum_rl/search/quantum_train_search.py b/examples/quantum_rl/search/quantum_train_search.py
index 10a983c..5861ef6 100644
--- a/examples/quantum_rl/search/quantum_train_search.py
+++ b/examples/quantum_rl/search/quantum_train_search.py
@@ -1,8 +1,6 @@
 import logging
 import os
 import sys
-import time
-from functools import reduce
 
 import numpy as np
 import tensorflow as tf
@@ -11,24 +9,43 @@
 from search import quantum_encoding
 
 
-def main(bit_string, qubits, n_actions, observables, n_episodes = 1000, batch_size = 10, gamma = 1, beta = 1.0,
-         state_bounds = np.array([2.4, 2.5, 0.21, 2.5]), env_name = "CartPole-v1", save='quantum', expr_root='search',
-         lr_in = 0.1, lr_var = 0.01, lr_out = 0.1, backend = 'cirq'):
+def main(
+    bit_string,
+    qubits,
+    n_actions,
+    observables,
+    n_episodes=1000,
+    batch_size=10,
+    gamma=1,
+    beta=1.0,
+    state_bounds=np.array([2.4, 2.5, 0.21, 2.5]),
+    env_name="CartPole-v1",
+    save="quantum",
+    expr_root="search",
+    lr_in=0.1,
+    lr_var=0.01,
+    lr_out=0.1,
+    backend="cirq",
+):
     """
     Main training process in multi-objective search.
     """
-    save_pth = os.path.join(expr_root, '{}'.format(save))
+    save_pth = os.path.join(expr_root, "{}".format(save))
     create_exp_dir(save_pth)
-    log_format = '%(asctime)s %(message)s'
-    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
-                        format=log_format, datefmt='%m/%d %I:%M:%S %p')
-    fh = logging.FileHandler(os.path.join(save_pth, 'log.txt'))
+    log_format = "%(asctime)s %(message)s"
+    logging.basicConfig(
+        stream=sys.stdout,
+        level=logging.INFO,
+        format=log_format,
+        datefmt="%m/%d %I:%M:%S %p",
+    )
+    fh = logging.FileHandler(os.path.join(save_pth, "log.txt"))
     fh.setFormatter(logging.Formatter(log_format))
     logging.getLogger().addHandler(fh)
 
     nb, genotype = quantum_encoding.convert2arch(bit_string)
     model = Network(qubits, genotype, n_actions, beta, observables, env_name)
-    
+
     logging.info("Genome = %s", nb)
     logging.info("Architecture = %s", genotype)
 
@@ -52,20 +69,26 @@ def reinforce_update(states, actions, returns, model):
             log_probs = tf.math.log(p_actions)
             loss = tf.math.reduce_sum(-log_probs * returns) / batch_size
         grads = tape.gradient(loss, model.trainable_variables)
-        for optimizer, w in zip([optimizer_in, optimizer_var, optimizer_out], [w_in, w_var, w_out]):
+        for optimizer, w in zip(
+            [optimizer_in, optimizer_var, optimizer_out], [w_in, w_var, w_out]
+        ):
             optimizer.apply_gradients([(grads[w], model.trainable_variables[w])])
 
     # Start training the agent
     episode_reward_history = []
     for batch in range(n_episodes // batch_size):
         # Gather episodes
-        _, episodes = gather_episodes(state_bounds, n_actions, model, batch_size, env_name, beta, backend)
+        _, episodes = gather_episodes(
+            state_bounds, n_actions, model, batch_size, env_name, beta, backend
+        )
 
         # Group states, actions and returns in numpy arrays
-        states = np.concatenate([ep['states'] for ep in episodes])
-        actions = np.concatenate([ep['actions'] for ep in episodes])
-        rewards = [ep['rewards'] for ep in episodes]
-        returns = np.concatenate([compute_returns(ep_rwds, gamma) for ep_rwds in rewards])
+        states = np.concatenate([ep["states"] for ep in episodes])
+        actions = np.concatenate([ep["actions"] for ep in episodes])
+        rewards = [ep["rewards"] for ep in episodes]
+        returns = np.concatenate(
+            [compute_returns(ep_rwds, gamma) for ep_rwds in rewards]
+        )
         returns = np.array(returns, dtype=np.float32)
 
         id_action_pairs = np.array([[i, a] for i, a in enumerate(actions)])
@@ -79,9 +102,9 @@ def reinforce_update(states, actions, returns, model):
 
         avg_rewards = np.mean(episode_reward_history[-10:])
 
-        logging.info('Finished episode: %f', (batch + 1) * batch_size)
-        logging.info('Average rewards: %f', avg_rewards)
-    
+        logging.info("Finished episode: %f", (batch + 1) * batch_size)
+        logging.info("Average rewards: %f", avg_rewards)
+
         if avg_rewards >= 500.0 and env_name == "CartPole-v1":
             break
         elif avg_rewards >= -110 and env_name == "MountainCar-v0":
diff --git a/examples/quantum_rl/validation/quantum_test.py b/examples/quantum_rl/validation/quantum_test.py
index df1c0ca..ce176a7 100644
--- a/examples/quantum_rl/validation/quantum_test.py
+++ b/examples/quantum_rl/validation/quantum_test.py
@@ -3,7 +3,7 @@
 import os
 import sys
 
-sys.path.insert(0, ' ')
+sys.path.insert(0, " ")
 import time
 from functools import reduce
 
@@ -15,47 +15,76 @@
 from misc.utils import create_exp_dir, gather_episodes
 from models.quantum_models import generate_model_policy as Network
 
-parser = argparse.ArgumentParser('Quantum RL Inference')
-parser.add_argument('--save', type=str, default='qEXP_quafu', help='experiment name')
-parser.add_argument('--batch_size', type=int, default=1, help='batch size')
-parser.add_argument('--infer_episodes', type=int, default=100, help='the number of infer episodes')
-parser.add_argument('--gamma', type=float, default=1.0, help='discount parameter')
-parser.add_argument('--env_name', type=str, default="CartPole-v1", help='environment name')
-parser.add_argument('--state_bounds', type=np.array, default=np.array([2.4, 2.5, 0.21, 2.5]), help='state bounds')
-parser.add_argument('--n_qubits', type=int, default=4, help='the number of qubits')
-parser.add_argument('--n_actions', type=int, default=2, help='the number of actions')
-parser.add_argument('--arch', type=str, default='NSGANet_id10', help='which architecture to use')
-parser.add_argument('--model_path', type=str, default='./weights/train_p10/weights_id10_quafu_94.h5', help='path of pretrained model')
-parser.add_argument('--beta', type=float, default=1.0, help='output parameter')
-parser.add_argument('--backend', type=str, default='quafu', help='choose cirq simulator or quafu cloud platform')
-parser.add_argument('--shots', type=int, default=1000, help='the number of sampling')
-parser.add_argument('--backend_quafu', type=str, default='ScQ-P10', help='which quafu backend to use')
+parser = argparse.ArgumentParser("Quantum RL Inference")
+parser.add_argument("--save", type=str, default="qEXP_quafu", help="experiment name")
+parser.add_argument("--batch_size", type=int, default=1, help="batch size")
+parser.add_argument(
+    "--infer_episodes", type=int, default=100, help="the number of infer episodes"
+)
+parser.add_argument("--gamma", type=float, default=1.0, help="discount parameter")
+parser.add_argument(
+    "--env_name", type=str, default="CartPole-v1", help="environment name"
+)
+parser.add_argument(
+    "--state_bounds",
+    type=lambda s: np.array(s.split(","), dtype=float),
+    default=np.array([2.4, 2.5, 0.21, 2.5]),
+    help="state bounds as comma-separated floats, e.g. 2.4,2.5,0.21,2.5",
+)
+parser.add_argument("--n_qubits", type=int, default=4, help="the number of qubits")
+parser.add_argument("--n_actions", type=int, default=2, help="the number of actions")
+parser.add_argument(
+    "--arch", type=str, default="NSGANet_id10", help="which architecture to use"
+)
+parser.add_argument(
+    "--model_path",
+    type=str,
+    default="./weights/train_p10/weights_id10_quafu_94.h5",
+    help="path of pretrained model",
+)
+parser.add_argument("--beta", type=float, default=1.0, help="output parameter")
+parser.add_argument(
+    "--backend",
+    type=str,
+    default="quafu",
+    help="choose cirq simulator or quafu cloud platform",
+)
+parser.add_argument("--shots", type=int, default=1000, help="the number of sampling")
+parser.add_argument(
+    "--backend_quafu", type=str, default="ScQ-P10", help="which quafu backend to use"
+)
 
 args = parser.parse_args(args=[])
-args.save = 'infer-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
+args.save = "infer-{}-{}".format(args.save, time.strftime("%Y%m%d-%H%M%S"))
 create_exp_dir(args.save)
 
-log_format = '%(asctime)s %(message)s'
-logging.basicConfig(stream=sys.stdout, level=logging.INFO,
-                    format=log_format, datefmt='%m/%d %I:%M:%S %p')
-fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
+log_format = "%(asctime)s %(message)s"
+logging.basicConfig(
+    stream=sys.stdout,
+    level=logging.INFO,
+    format=log_format,
+    datefmt="%m/%d %I:%M:%S %p",
+)
+fh = logging.FileHandler(os.path.join(args.save, "log.txt"))
 fh.setFormatter(logging.Formatter(log_format))
 logging.getLogger().addHandler(fh)
 
 qubits = cirq.GridQubit.rect(1, args.n_qubits)
 genotype = eval("genotypes.%s" % args.arch)
 ops = [cirq.Z(q) for q in qubits]
-observables = [reduce((lambda x, y: x * y), ops)] # Z_0*Z_1*Z_2*Z_3
+observables = [reduce((lambda x, y: x * y), ops)]  # Z_0*Z_1*Z_2*Z_3
 
 
 def main():
     logging.info("args = %s", args)
 
-    model = Network(qubits, genotype, args.n_actions, args.beta, observables, args.env_name)
+    model = Network(
+        qubits, genotype, args.n_actions, args.beta, observables, args.env_name
+    )
 
     model.load_weights(args.model_path)
-    
-    # inference 
+
+    # inference
     valid_reward = infer(model)
 
 
@@ -63,15 +92,27 @@ def infer(model):
     episode_reward_history = []
     for batch in range(args.infer_episodes // args.batch_size):
         # Gather episodes
-        tasklist, episodes = gather_episodes(args.state_bounds, args.n_actions, model, args.batch_size, 
-                                              args.env_name, args.beta, args.backend, args.backend_quafu, args.shots, args.n_qubits, qubits, genotype)
+        tasklist, episodes = gather_episodes(
+            args.state_bounds,
+            args.n_actions,
+            model,
+            args.batch_size,
+            args.env_name,
+            args.beta,
+            args.backend,
+            args.backend_quafu,
+            args.shots,
+            args.n_qubits,
+            qubits,
+            genotype,
+        )
         logging.info(tasklist)
         logging.info(episodes)
 
         # Group states, actions and returns in numpy arrays
-        states = np.concatenate([ep['states'] for ep in episodes])
-        actions = np.concatenate([ep['actions'] for ep in episodes])
-        rewards = [ep['rewards'] for ep in episodes]
+        states = np.concatenate([ep["states"] for ep in episodes])
+        actions = np.concatenate([ep["actions"] for ep in episodes])
+        rewards = [ep["rewards"] for ep in episodes]
 
         # Store collected rewards
         for ep_rwds in rewards:
@@ -79,13 +120,13 @@ def infer(model):
 
         # avg_rewards = np.mean(episode_reward_history[-10:])
 
-        logging.info('valid finished episode: %f', (batch + 1) * args.batch_size)
-        logging.info('valid average rewards: %f', episode_reward_history[-1])
-    
+        logging.info("valid finished episode: %f", (batch + 1) * args.batch_size)
+        logging.info("valid average rewards: %f", episode_reward_history[-1])
+
         if episode_reward_history[-1] >= 200.0:
             break
     return episode_reward_history
 
 
-if __name__ == '__main__':
-    main()
\ No newline at end of file
+if __name__ == "__main__":
+    main()
diff --git a/examples/quantum_rl/validation/quantum_train.py b/examples/quantum_rl/validation/quantum_train.py
index 726f786..7250875 100644
--- a/examples/quantum_rl/validation/quantum_train.py
+++ b/examples/quantum_rl/validation/quantum_train.py
@@ -4,7 +4,7 @@
 import os
 import sys
 
-sys.path.insert(0, ' ')
+sys.path.insert(0, " ")
 import time
 from functools import reduce
 
@@ -16,35 +16,70 @@
 from misc.utils import compute_returns, create_exp_dir, gather_episodes
 from models.quantum_models import generate_model_policy as Network
 
-parser = argparse.ArgumentParser('Quantum RL Training')
-parser.add_argument('--save', type=str, default='qEXP-quafu18_6', help='experiment name')
-parser.add_argument('--batch_size', type=int, default=1, help='batch size')
-parser.add_argument('--n_episodes', type=int, default=100, help='the number of episodes')
+parser = argparse.ArgumentParser("Quantum RL Training")
+parser.add_argument(
+    "--save", type=str, default="qEXP-quafu18_6", help="experiment name"
+)
+parser.add_argument("--batch_size", type=int, default=1, help="batch size")
+parser.add_argument(
+    "--n_episodes", type=int, default=100, help="the number of episodes"
+)
 # parser.add_argument('--infer_episodes', type=int, default=5, help='the number of infer episodes')
-parser.add_argument('--gamma', type=float, default=1.0, help='discount parameter')
-parser.add_argument('--env_name', type=str, default="CartPole-v1", help='environment name')
-parser.add_argument('--state_bounds', type=np.array, default=np.array([2.4, 2.5, 0.21, 2.5]), help='state bounds')
-parser.add_argument('--n_qubits', type=int, default=4, help='the number of qubits')
-parser.add_argument('--n_actions', type=int, default=2, help='the number of actions')
-parser.add_argument('--arch', type=str, default='NSGANet_id10', help='which architecture to use')
-parser.add_argument('--epochs', type=int, default=1, help='num of training epochs')
-parser.add_argument('--lr_in', type=float, default=0.1, help='learning rate of input parameter')
-parser.add_argument('--lr_var', type=float, default=0.01, help='learning rate of variational parameter')
-parser.add_argument('--lr_out', type=float, default=0.1, help='learning rate of output parameter')
-parser.add_argument('--beta', type=float, default=1.0, help='output parameter')
-parser.add_argument('--model_path', type=str, default='./weights/train_p18/weights_id10_quafu_86.h5', help='path of pretrained model')
-parser.add_argument('--backend', type=str, default='quafu', help='choose cirq simulator or quafu cloud platform')
-parser.add_argument('--shots', type=int, default=1000, help='the number of sampling')
-parser.add_argument('--backend_quafu', type=str, default='ScQ-P10', help='which quafu backend to use')
+parser.add_argument("--gamma", type=float, default=1.0, help="discount parameter")
+parser.add_argument(
+    "--env_name", type=str, default="CartPole-v1", help="environment name"
+)
+parser.add_argument(
+    "--state_bounds",
+    type=np.array,
+    default=np.array([2.4, 2.5, 0.21, 2.5]),
+    help="state bounds",
+)
+parser.add_argument("--n_qubits", type=int, default=4, help="the number of qubits")
+parser.add_argument("--n_actions", type=int, default=2, help="the number of actions")
+parser.add_argument(
+    "--arch", type=str, default="NSGANet_id10", help="which architecture to use"
+)
+parser.add_argument("--epochs", type=int, default=1, help="num of training epochs")
+parser.add_argument(
+    "--lr_in", type=float, default=0.1, help="learning rate of input parameter"
+)
+parser.add_argument(
+    "--lr_var", type=float, default=0.01, help="learning rate of variational parameter"
+)
+parser.add_argument(
+    "--lr_out", type=float, default=0.1, help="learning rate of output parameter"
+)
+parser.add_argument("--beta", type=float, default=1.0, help="output parameter")
+parser.add_argument(
+    "--model_path",
+    type=str,
+    default="./weights/train_p18/weights_id10_quafu_86.h5",
+    help="path of pretrained model",
+)
+parser.add_argument(
+    "--backend",
+    type=str,
+    default="quafu",
+    help="choose cirq simulator or quafu cloud platform",
+)
+parser.add_argument("--shots", type=int, default=1000, help="the number of sampling")
+parser.add_argument(
+    "--backend_quafu", type=str, default="ScQ-P10", help="which quafu backend to use"
+)
 
 args = parser.parse_args(args=[])
-args.save = 'train-{}-{}'.format(args.save, time.strftime("%Y%m%d-%H%M%S"))
+args.save = "train-{}-{}".format(args.save, time.strftime("%Y%m%d-%H%M%S"))
 create_exp_dir(args.save)
 
-log_format = '%(asctime)s %(message)s'
-logging.basicConfig(stream=sys.stdout, level=logging.INFO,
-                    format=log_format, datefmt='%m/%d %I:%M:%S %p')
-fh = logging.FileHandler(os.path.join(args.save, 'log.txt'))
+log_format = "%(asctime)s %(message)s"
+logging.basicConfig(
+    stream=sys.stdout,
+    level=logging.INFO,
+    format=log_format,
+    datefmt="%m/%d %I:%M:%S %p",
+)
+fh = logging.FileHandler(os.path.join(args.save, "log.txt"))
 fh.setFormatter(logging.Formatter(log_format))
 logging.getLogger().addHandler(fh)
 
@@ -52,7 +87,9 @@
 def main(qubits, genotype, observables):
     logging.info("args = %s", args)
 
-    model = Network(qubits, genotype, args.n_actions, args.beta, observables, args.env_name)
+    model = Network(
+        qubits, genotype, args.n_actions, args.beta, observables, args.env_name
+    )
     model.load_weights(args.model_path)
 
     n_epochs = args.epochs
@@ -66,9 +103,11 @@ def main(qubits, genotype, observables):
 
     # best_reward = 0
     for epoch in range(n_epochs):
-        logging.info('epoch %d', epoch)
+        logging.info("epoch %d", epoch)
 
-        reward_his = train(model, optimizer_in, optimizer_var, optimizer_out, w_in, w_var, w_out)
+        reward_his = train(
+            model, optimizer_in, optimizer_var, optimizer_out, w_in, w_var, w_out
+        )
         # valid_reward = infer(model)
 
         # if np.mean(valid_reward) >= best_reward:
@@ -78,7 +117,6 @@ def main(qubits, genotype, observables):
 
 # Training
 def train(model, optimizer_in, optimizer_var, optimizer_out, w_in, w_var, w_out):
-
     @tf.function
     def reinforce_update(states, actions, returns, logits2, model):
         states = tf.convert_to_tensor(states)
@@ -96,24 +134,40 @@ def reinforce_update(states, actions, returns, logits2, model):
             log_probs = tf.math.log(p_actions)
             loss = tf.math.reduce_sum(-log_probs * returns) / args.batch_size
         grads = tape.gradient(loss, model.trainable_variables)
-        for optimizer, w in zip([optimizer_in, optimizer_var, optimizer_out], [w_in, w_var, w_out]):
+        for optimizer, w in zip(
+            [optimizer_in, optimizer_var, optimizer_out], [w_in, w_var, w_out]
+        ):
             optimizer.apply_gradients([(grads[w], model.trainable_variables[w])])
 
     episode_reward_history = []
     best_reward = 0
     for batch in range(args.n_episodes // args.batch_size):
         # Gather episodes
-        tasklist, episodes = gather_episodes(args.state_bounds, args.n_actions, model, args.batch_size, 
-                                              args.env_name, args.beta, args.backend, args.backend_quafu, args.shots, args.n_qubits, qubits, genotype)
+        tasklist, episodes = gather_episodes(
+            args.state_bounds,
+            args.n_actions,
+            model,
+            args.batch_size,
+            args.env_name,
+            args.beta,
+            args.backend,
+            args.backend_quafu,
+            args.shots,
+            args.n_qubits,
+            qubits,
+            genotype,
+        )
         logging.info(tasklist)
         logging.info(episodes)
 
         # Group states, actions and returns in numpy arrays
-        states = np.concatenate([ep['states'] for ep in episodes])
-        actions = np.concatenate([ep['actions'] for ep in episodes])
-        logits = np.concatenate([ep['action_probs'] for ep in episodes])
-        rewards = [ep['rewards'] for ep in episodes]
-        returns = np.concatenate([compute_returns(ep_rwds, args.gamma) for ep_rwds in rewards])
+        states = np.concatenate([ep["states"] for ep in episodes])
+        actions = np.concatenate([ep["actions"] for ep in episodes])
+        logits = np.concatenate([ep["action_probs"] for ep in episodes])
+        rewards = [ep["rewards"] for ep in episodes]
+        returns = np.concatenate(
+            [compute_returns(ep_rwds, args.gamma) for ep_rwds in rewards]
+        )
         returns = np.array(returns, dtype=np.float32)
 
         id_action_pairs = np.array([[i, a] for i, a in enumerate(actions)])
@@ -121,23 +175,26 @@ def reinforce_update(states, actions, returns, logits2, model):
         # Store collected rewards
         for ep_rwds in rewards:
             episode_reward_history.append(np.sum(ep_rwds))
-        
 
         if episode_reward_history[-1] >= best_reward:
             best_reward = episode_reward_history[-1]
-            model.save_weights(os.path.join(args.save, 'weights_id10_quafu_{}.h5'.format(int(best_reward))))
+            model.save_weights(
+                os.path.join(
+                    args.save, "weights_id10_quafu_{}.h5".format(int(best_reward))
+                )
+            )
 
         # Update model parameters.
         reinforce_update(states, id_action_pairs, returns, logits, model)
 
         avg_rewards = np.mean(episode_reward_history[-5:])
 
-        logging.info('train finished episode: %f', (batch + 1) * args.batch_size)
-        logging.info('train average rewards: %f', episode_reward_history[-1])
-        logging.info('train moving average rewards: %f', avg_rewards)
+        logging.info("train finished episode: %f", (batch + 1) * args.batch_size)
+        logging.info("train average rewards: %f", episode_reward_history[-1])
+        logging.info("train moving average rewards: %f", avg_rewards)
+
+        model.save_weights(os.path.join(args.save, "weights_id10_quafu_latest.h5"))
 
-        model.save_weights(os.path.join(args.save, 'weights_id10_quafu_latest.h5'))
-    
         if avg_rewards >= 100.0:
             break
     return episode_reward_history
@@ -160,16 +217,15 @@ def reinforce_update(states, actions, returns, logits2, model):
 
 #         logging.info('valid finished episode: %f', (batch + 1) * args.batch_size)
 #         logging.info('valid average rewards: %f', avg_rewards)
-    
+
 #         if avg_rewards >= 500.0:
 #             break
 #     return episode_reward_history
 
-
-if __name__ == '__main__':
+if __name__ == "__main__":
     qubits = cirq.GridQubit.rect(1, args.n_qubits)
     genotype = eval("genotypes.%s" % args.arch)
     ops = [cirq.Z(q) for q in qubits]
-    observables = [reduce((lambda x, y: x * y), ops)] # Z_0*Z_1*Z_2*Z_3
+    observables = [reduce((lambda x, y: x * y), ops)]  # Z_0*Z_1*Z_2*Z_3
 
     main(qubits, genotype, observables)
diff --git a/examples/quantum_rl/visualization/plot_gif.py b/examples/quantum_rl/visualization/plot_gif.py
index d12bf47..97221c2 100644
--- a/examples/quantum_rl/visualization/plot_gif.py
+++ b/examples/quantum_rl/visualization/plot_gif.py
@@ -2,7 +2,7 @@
 import argparse
 import sys
 
-sys.path.insert(0, ' ')
+sys.path.insert(0, " ")
 from functools import reduce
 
 import cirq
@@ -16,27 +16,45 @@
 from models.quantum_models import get_model_circuit_params
 from PIL import Image
 
-parser = argparse.ArgumentParser('Plot gif of pre-trained quantum models on quafu cloud platform')
-parser.add_argument('--env_name', type=str, default='CartPole-v1', help='environment name')
-parser.add_argument('--state_bounds', type=np.array, default=np.array([2.4, 2.5, 0.21, 2.5]), help='state bounds')
-parser.add_argument('--n_qubits', type=int, default=4, help='the number of qubits')
-parser.add_argument('--n_actions', type=int, default=2, help='the number of actions')
-parser.add_argument('--arch', type=str, default='NSGANet_id10', help='which architecture to use')
-parser.add_argument('--shots', type=int, default=1000, help='the number of sampling')
-parser.add_argument('--backend_quafu', type=str, default='ScQ-P10', help='which quafu backend to use')
-parser.add_argument('--beta', type=float, default=1.0, help='output parameter')
-parser.add_argument('--model_path', type=str, default='./weights/train_p10/weights_id10_quafu_132.h5', help='path of pretrained model')
+parser = argparse.ArgumentParser(
+    "Plot gif of pre-trained quantum models on quafu cloud platform"
+)
+parser.add_argument(
+    "--env_name", type=str, default="CartPole-v1", help="environment name"
+)
+parser.add_argument(
+    "--state_bounds",
+    type=np.array,
+    default=np.array([2.4, 2.5, 0.21, 2.5]),
+    help="state bounds",
+)
+parser.add_argument("--n_qubits", type=int, default=4, help="the number of qubits")
+parser.add_argument("--n_actions", type=int, default=2, help="the number of actions")
+parser.add_argument(
+    "--arch", type=str, default="NSGANet_id10", help="which architecture to use"
+)
+parser.add_argument("--shots", type=int, default=1000, help="the number of sampling")
+parser.add_argument(
+    "--backend_quafu", type=str, default="ScQ-P10", help="which quafu backend to use"
+)
+parser.add_argument("--beta", type=float, default=1.0, help="output parameter")
+parser.add_argument(
+    "--model_path",
+    type=str,
+    default="./weights/train_p10/weights_id10_quafu_132.h5",
+    help="path of pretrained model",
+)
 args = parser.parse_args(args=[])
 
-
 if __name__ == "__main__":
     qubits = cirq.GridQubit.rect(1, args.n_qubits)
     genotype = eval("genotypes.%s" % args.arch)
     ops = [cirq.Z(q) for q in qubits]
-    observables = [reduce((lambda x, y: x * y), ops)] # Z_0*Z_1*Z_2*Z_3
-    model = Network(qubits, genotype, args.n_actions, args.beta, observables, args.env_name)
+    observables = [reduce((lambda x, y: x * y), ops)]  # Z_0*Z_1*Z_2*Z_3
+    model = Network(
+        qubits, genotype, args.n_actions, args.beta, observables, args.env_name
+    )
     model.load_weights(args.model_path)
-    
 
     for epoch in range(20):
         env = gym.make(args.env_name, render_mode="rgb_array")
@@ -44,27 +62,38 @@
         frames = []
         for epi in range(100):
             im = Image.fromarray(env.render())
-            frames.append(im)  
-    
+            frames.append(im)
+
             # get PQC model parameters and expectations
-            stateb = state/args.state_bounds
+            stateb = state / args.state_bounds
             newtheta, newlamda = get_model_circuit_params(qubits, genotype, model)
-            circuit, _, _ = generate_circuit(qubits, genotype, newtheta, newlamda, stateb)
-            _, expectation = get_quafu_exp(circuit, args.n_qubits, args.backend_quafu, args.shots)
-    
+            circuit, _, _ = generate_circuit(
+                qubits, genotype, newtheta, newlamda, stateb
+            )
+            _, expectation = get_quafu_exp(
+                circuit, args.n_qubits, args.backend_quafu, args.shots
+            )
+
             # get policy model parameters
-            obsw = model.get_layer('observables-policy').get_weights()[0]
+            obsw = model.get_layer("observables-policy").get_weights()[0]
             obspolicy = get_obs_policy(obsw, args.beta)
             policy = obspolicy(expectation)
-            print('policy:', policy)
-    
+            print("policy:", policy)
+
             # choose actions and make a step
             action = np.random.choice(args.n_actions, p=policy.numpy()[0])
             state, reward, terminated, truncated, _ = env.step(action)
             if terminated or truncated:
-                print(epi+1)
+                print(epi + 1)
                 break
         env.close()
 
         # save gif to your path
-        frames[1].save('./visualization/gif/test_{}.gif'.format(epoch), save_all=True, append_images=frames[2:], optimize=False, duration=40, loop=0)
\ No newline at end of file
+        frames[1].save(
+            "./visualization/gif/test_{}.gif".format(epoch),
+            save_all=True,
+            append_images=frames[2:],
+            optimize=False,
+            duration=40,
+            loop=0,
+        )
diff --git a/pyproject.toml b/pyproject.toml
index 6679926..1b10e0d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,6 +18,7 @@ archs = ["x86_64"]
 
 
 [tool.cibuildwheel.macos]
+environment = {MACOSX_DEPLOYMENT_TARGET = "13.6"}
 archs = ["x86_64", "arm64"]
 
 repair-wheel-command = [
diff --git a/quafu/__init__.py b/quafu/__init__.py
index d90e236..a7d6736 100644
--- a/quafu/__init__.py
+++ b/quafu/__init__.py
@@ -1,9 +1,9 @@
 from .circuits.quantum_circuit import QuantumCircuit
 from .circuits.quantum_register import QuantumRegister, Qubit
 from .results.results import ExecResult, SimuResult
+from .simulators.simulator import simulate
 from .tasks.tasks import Task
 from .users.userapi import User
-from .simulators.simulator import simulate
 
 __all__ = [
     "QuantumCircuit",
diff --git a/quafu/algorithms/__init__.py b/quafu/algorithms/__init__.py
index a099dbb..2a33ee5 100644
--- a/quafu/algorithms/__init__.py
+++ b/quafu/algorithms/__init__.py
@@ -1,8 +1,8 @@
 """Algorithm module"""
 
-from .hamiltonian import Hamiltonian
-from .ansatz import QAOAAnsatz, AlterLayeredAnsatz, QuantumNeuralNetwork
+from .ansatz import AlterLayeredAnsatz, QAOAAnsatz, QuantumNeuralNetwork
 from .estimator import Estimator
-from .templates.angle import AngleEmbedding
+from .hamiltonian import Hamiltonian
 from .templates.amplitude import AmplitudeEmbedding
+from .templates.angle import AngleEmbedding
 from .templates.basic_entangle import BasicEntangleLayers
diff --git a/quafu/algorithms/ansatz.py b/quafu/algorithms/ansatz.py
index bd7d835..41e64f8 100644
--- a/quafu/algorithms/ansatz.py
+++ b/quafu/algorithms/ansatz.py
@@ -11,16 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Ansatz circuits for VQA"""
 from abc import ABC, abstractmethod
 from typing import Any, List
-import numpy as np
 
+import numpy as np
 from quafu.circuits.quantum_circuit import QuantumCircuit
 from quafu.synthesis.evolution import ProductFormula
+
 from .hamiltonian import Hamiltonian
 from .interface_provider import InterfaceProvider
+from .templates import AngleEmbedding
 
 
 class Ansatz(QuantumCircuit, ABC):
@@ -155,6 +156,10 @@ def __init__(self, num_qubits: int, layers: List[Any], interface="torch"):
         self._weights = np.empty((1, 1))
         super().__init__(num_qubits)
 
+    def __call__(self, features):
+        """Compute outputs of QNN given input features"""
+        return self._transformer.execute(self, features)
+
     def _build(self):
         """Essentially initialize weights using transformer"""
         for layer in self._layers:
diff --git a/quafu/algorithms/estimator.py b/quafu/algorithms/estimator.py
index cdba8c4..e94ee5c 100644
--- a/quafu/algorithms/estimator.py
+++ b/quafu/algorithms/estimator.py
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Pre-build wrapper to calculate expectation value"""
-import numpy as np
-
 from typing import List, Optional
-from quafu import QuantumCircuit
+
+import numpy as np
+from quafu.algorithms.hamiltonian import Hamiltonian
 from quafu.simulators.simulator import simulate
 from quafu.tasks.tasks import Task
-from quafu.algorithms.hamiltonian import Hamiltonian
+
+from quafu import QuantumCircuit
 
 
 def execute_circuit(circ: QuantumCircuit, observables: Hamiltonian):
diff --git a/quafu/algorithms/gradients/__init__.py b/quafu/algorithms/gradients/__init__.py
index b3789a1..de792ee 100644
--- a/quafu/algorithms/gradients/__init__.py
+++ b/quafu/algorithms/gradients/__init__.py
@@ -13,4 +13,4 @@
 # limitations under the License.
 
 from .param_shift import ParamShift
-from .vjp import run_circ, compute_vjp, jacobian
+from .vjp import compute_vjp, jacobian, run_circ
diff --git a/quafu/algorithms/gradients/param_shift.py b/quafu/algorithms/gradients/param_shift.py
index 884e307..c725188 100644
--- a/quafu/algorithms/gradients/param_shift.py
+++ b/quafu/algorithms/gradients/param_shift.py
@@ -14,6 +14,7 @@
 """Quafu parameter shift"""
 
 from typing import List
+
 import numpy as np
 
 from ..estimator import Estimator
diff --git a/quafu/algorithms/gradients/vjp.py b/quafu/algorithms/gradients/vjp.py
index ea57380..aa314e1 100644
--- a/quafu/algorithms/gradients/vjp.py
+++ b/quafu/algorithms/gradients/vjp.py
@@ -12,13 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 from typing import List, Optional
+
 import numpy as np
-from quafu import QuantumCircuit
-from quafu.algorithms import Hamiltonian
 from quafu.algorithms.estimator import Estimator
 from quafu.algorithms.gradients import ParamShift
+from quafu.algorithms.hamiltonian import Hamiltonian
+
+from quafu import QuantumCircuit
 
 
 def _generate_expval_z(num_qubits: int):
diff --git a/quafu/algorithms/hamiltonian.py b/quafu/algorithms/hamiltonian.py
index 6742acf..a4e1402 100644
--- a/quafu/algorithms/hamiltonian.py
+++ b/quafu/algorithms/hamiltonian.py
@@ -11,16 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Quafu Hamiltonian class"""
 
 from __future__ import annotations
 
 from collections.abc import Iterable
+
 import numpy as np
+from quafu.elements.matrices import IdMatrix, XMatrix, YMatrix, ZMatrix
 from quafu.exceptions import QuafuError
-from quafu.elements.matrices import XMatrix, YMatrix, ZMatrix, IdMatrix
-
 
 PAULI_MAT = {"I": IdMatrix, "X": XMatrix, "Y": YMatrix, "Z": ZMatrix}
 
diff --git a/quafu/algorithms/interface/torch.py b/quafu/algorithms/interface/torch.py
index f27a0bb..50e2b5f 100644
--- a/quafu/algorithms/interface/torch.py
+++ b/quafu/algorithms/interface/torch.py
@@ -11,21 +11,48 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """quafu PyTorch quantum layer"""
 
-import torch
 import numpy as np
+import torch
+
 from quafu import QuantumCircuit
+
 from ..gradients import compute_vjp, jacobian, run_circ
 
 
+# TODO(zhaoyilun): implement an ABC for transformers
 class TorchTransformer:
     @staticmethod
     def init_weights(shape):
         """Return torch gradient tensor with specified shape"""
         return torch.randn(*shape, requires_grad=True, dtype=torch.double)
 
+    # TODO(zhaoyilun): document the expected run_fn / grad_fn signatures
+    @staticmethod
+    def execute(
+        circ: QuantumCircuit,
+        parameters: torch.Tensor,
+        run_fn=run_circ,
+        grad_fn=None,
+        method="internal",
+    ):
+        """Run ``circ`` through ``ExecuteCircuits.apply`` with the chosen input.
+
+        Args:
+            circ: circuit to run; ``circ.weights`` is the input if "internal".
+            parameters: tensor used as the input when ``method="external"``.
+            run_fn, grad_fn: forwarded to ``ExecuteCircuits`` inside ``kwargs``.
+        """
+        # ``method`` must be "external" or "internal"; anything else raises below.
+        kwargs = {"circ": circ, "run_fn": run_fn, "grad_fn": grad_fn}
+
+        if method == "external":
+            return ExecuteCircuits.apply(parameters, kwargs)
+        if method == "internal":
+            return ExecuteCircuits.apply(circ.weights, kwargs)
+        raise NotImplementedError(f"Unsupported execution method: {method}")
+
 
 class ExecuteCircuits(torch.autograd.Function):
     """Parameters are input from previous layers"""
@@ -51,29 +78,3 @@ def backward(ctx, grad_out):
         vjp = compute_vjp(jac, grad_out.numpy())
         vjp = torch.from_numpy(vjp)
         return vjp, None
-
-
-# TODO(zhaoyilun): doc
-def execute(
-    circ: QuantumCircuit,
-    parameters: torch.Tensor,
-    run_fn=run_circ,
-    grad_fn=None,
-    method="internal",
-):
-    """execute.
-
-    Args:
-        circ:
-        run_fn:
-        grad_fn:
-    """
-
-    kwargs = {"circ": circ, "run_fn": run_fn, "grad_fn": grad_fn}
-
-    if method == "external":
-        return ExecuteCircuits.apply(parameters, kwargs)
-    elif method == "internal":
-        return ExecuteCircuits.apply(circ.weights, kwargs)
-    else:
-        raise NotImplementedError(f"Unsupported execution method: {method}")
diff --git a/quafu/algorithms/templates/__init__.py b/quafu/algorithms/templates/__init__.py
index e69de29..0e1b849 100644
--- a/quafu/algorithms/templates/__init__.py
+++ b/quafu/algorithms/templates/__init__.py
@@ -0,0 +1,16 @@
+# (C) Copyright 2023 Beijing Academy of Quantum Information Sciences
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .angle import AngleEmbedding
+from .basic_entangle import BasicEntangleLayers
diff --git a/quafu/algorithms/templates/amplitude.py b/quafu/algorithms/templates/amplitude.py
index c72ff18..e36b550 100644
--- a/quafu/algorithms/templates/amplitude.py
+++ b/quafu/algorithms/templates/amplitude.py
@@ -11,11 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Amplitude Embedding by a decomposition into gates"""
-from quafu.circuits import QuantumCircuit
-import quafu.elements.element_gates as qeg
 import numpy as np
+import quafu.elements.element_gates as qeg
+from quafu.circuits import QuantumCircuit
 
 
 class AmplitudeEmbedding:
@@ -38,7 +37,7 @@ def __iter__(self):
 
     def __getitem__(self, index):
         return self.gate_list[index]
-    
+
     def _preprocess(self, state, num_qubits, pad_with, normalize):
         batched = np.ndim(state) > 1
         ##TODO(qtzhuang): If state are batched, additional processing is required
@@ -53,10 +52,12 @@ def _preprocess(self, state, num_qubits, pad_with, normalize):
 
             # check shape
             if len(shape) != 1:
-                raise ValueError(f"state must be a one-dimensional tensor; got shape {shape}.")
+                raise ValueError(
+                    f"state must be a one-dimensional tensor; got shape {shape}."
+                )
 
             n_state = shape[0]
-            dim = 2 ** num_qubits
+            dim = 2**num_qubits
             if pad_with is None and n_state != dim:
                 raise ValueError(
                     f"The length of state should be {dim}; got length {n_state}.Please check num_qubits "
@@ -89,10 +90,13 @@ def _preprocess(self, state, num_qubits, pad_with, normalize):
                     )
             new_state_batch.append(feature_set)
 
-        return np.stack(new_state_batch).astype(np.complex128) if batched else new_state_batch[0].astype(np.complex128)
-
-    def _build(self):  
+        return (
+            np.stack(new_state_batch).astype(np.complex128)
+            if batched
+            else new_state_batch[0].astype(np.complex128)
+        )
 
+    def _build(self):
         a = np.abs(self.state)
         omega = np.angle(self.state)
         # change order of qubits, since original code was written for IBM machines
@@ -104,7 +108,9 @@ def _build(self):
             alpha_y_k = _get_alpha_y(a, len(qubits_reverse), k)
             control = qubits_reverse[k:]
             target = qubits_reverse[k - 1]
-            gate_list.extend(_apply_uniform_rotation_dagger(qeg.RYGate, alpha_y_k, control, target))
+            gate_list.extend(
+                _apply_uniform_rotation_dagger(qeg.RYGate, alpha_y_k, control, target)
+            )
 
         # If necessary, apply inverse z rotation cascade to prepare correct phases of amplitudes
         if not np.allclose(omega, 0):
@@ -114,11 +120,14 @@ def _build(self):
                 target = qubits_reverse[k - 1]
                 if len(alpha_z_k) > 0:
                     gate_list.extend(
-                        _apply_uniform_rotation_dagger(qeg.RZGate, alpha_z_k, control, target)
+                        _apply_uniform_rotation_dagger(
+                            qeg.RZGate, alpha_z_k, control, target
+                        )
                     )
 
         return gate_list
-    
+
+
 ## MottonenStatePreparation related functions.
 def gray_code(rank):
     """Generates the Gray code of given rank.
@@ -145,18 +154,19 @@ def gray_code_recurse(g, rank):
 
     return g
 
+
 def _matrix_M_entry(row, col):
-        
-        # (col >> 1) ^ col is the Gray code of col
-        b_and_g = row & ((col >> 1) ^ col)
-        sum_of_ones = 0
-        while b_and_g > 0:
-            if b_and_g & 0b1:
-                sum_of_ones += 1
+    # (col >> 1) ^ col is the Gray code of col
+    b_and_g = row & ((col >> 1) ^ col)
+    sum_of_ones = 0
+    while b_and_g > 0:
+        if b_and_g & 0b1:
+            sum_of_ones += 1
+
+        b_and_g = b_and_g >> 1
 
-            b_and_g = b_and_g >> 1
+    return (-1) ** sum_of_ones
 
-        return (-1) ** sum_of_ones
 
 def compute_theta(alpha):
     ln = alpha.shape[-1]
@@ -173,7 +183,6 @@ def compute_theta(alpha):
 
 
 def _apply_uniform_rotation_dagger(gate, alpha, control_wires, target_wire):
-
     gate_list = []
     theta = compute_theta(alpha)
 
@@ -181,7 +190,7 @@ def _apply_uniform_rotation_dagger(gate, alpha, control_wires, target_wire):
 
     if gray_code_rank == 0:
         if np.all(theta[..., 0] != 0.0):
-            gate_list.append(gate(pos = target_wire, paras = theta[0]))
+            gate_list.append(gate(pos=target_wire, paras=theta[0]))
         return gate_list
 
     code = gray_code(gray_code_rank)
@@ -198,8 +207,8 @@ def _apply_uniform_rotation_dagger(gate, alpha, control_wires, target_wire):
         gate_list.append(qeg.CXGate(control_wires[control_index], target_wire))
     return gate_list
 
-def _get_alpha_z(omega, n, k):
 
+def _get_alpha_z(omega, n, k):
     indices1 = [
         [(2 * j - 1) * 2 ** (k - 1) + l - 1 for l in range(1, 2 ** (k - 1) + 1)]
         for j in range(1, 2 ** (n - k) + 1)
@@ -215,8 +224,8 @@ def _get_alpha_z(omega, n, k):
 
     return np.sum(diff, axis=-1)
 
-def _get_alpha_y(a, n, k):
 
+def _get_alpha_y(a, n, k):
     indices_numerator = [
         [(2 * (j + 1) - 1) * 2 ** (k - 1) + l for l in range(2 ** (k - 1))]
         for j in range(2 ** (n - k))
@@ -224,7 +233,9 @@ def _get_alpha_y(a, n, k):
     numerator = np.take(a, indices=indices_numerator, axis=-1)
     numerator = np.sum(np.abs(numerator) ** 2, axis=-1)
 
-    indices_denominator = [[j * 2**k + l for l in range(2**k)] for j in range(2 ** (n - k))]
+    indices_denominator = [
+        [j * 2**k + l for l in range(2**k)] for j in range(2 ** (n - k))
+    ]
     denominator = np.take(a, indices=indices_denominator, axis=-1)
     denominator = np.sum(np.abs(denominator) ** 2, axis=-1)
 
diff --git a/quafu/algorithms/templates/angle.py b/quafu/algorithms/templates/angle.py
index 6818264..33299c6 100644
--- a/quafu/algorithms/templates/angle.py
+++ b/quafu/algorithms/templates/angle.py
@@ -11,11 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Angel Embedding in Quantum Data embedding"""
-from quafu.circuits import QuantumCircuit
-import quafu.elements.element_gates as qeg
 import numpy as np
+import quafu.elements.element_gates as qeg
+from quafu.circuits import QuantumCircuit
 
 ROT = {"X": qeg.RXGate, "Y": qeg.RYGate, "Z": qeg.RZGate}
 
@@ -38,7 +37,6 @@ def __init__(self, features, num_qubits: int, rotation="X"):
         self.features = features
         self.num_qubits = num_qubits
         self.op = ROT[rotation]
-
         """Build the embedding circuit and get the gate_list"""
         self.gate_list = self._build()
 
diff --git a/quafu/algorithms/templates/basic_entangle.py b/quafu/algorithms/templates/basic_entangle.py
index 1ed3252..b141ab3 100644
--- a/quafu/algorithms/templates/basic_entangle.py
+++ b/quafu/algorithms/templates/basic_entangle.py
@@ -11,11 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Layers consisting of one-parameter single-qubit rotations on each qubit, followed by a closed chain or ring of CNOT gates"""
-from quafu.circuits import QuantumCircuit
-import quafu.elements.element_gates as qeg
 import numpy as np
+import quafu.elements.element_gates as qeg
+from quafu.circuits import QuantumCircuit
 
 ROT = {"X": qeg.RXGate, "Y": qeg.RYGate, "Z": qeg.RZGate}
 
@@ -52,7 +51,6 @@ def __init__(self, weights, num_qubits, rotation="X"):
         self.weights = weights
         self.num_qubits = num_qubits
         self.op = ROT[rotation]
-
         """Build the quantum basic_entangle layer and get the gate_list"""
         self.gate_list = self._build()
 
diff --git a/quafu/backends/backends.py b/quafu/backends/backends.py
index a39bed8..3d4fb21 100644
--- a/quafu/backends/backends.py
+++ b/quafu/backends/backends.py
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import requests
 import json
 import re
+
+import matplotlib.pyplot as plt
 import networkx as nx
 import numpy as np
-import matplotlib.pyplot as plt
-
+import requests
 from quafu.users.userapi import User
 
 
diff --git a/quafu/benchmark/adder.py b/quafu/benchmark/adder.py
index 6bac7b9..6c0c946 100644
--- a/quafu/benchmark/adder.py
+++ b/quafu/benchmark/adder.py
@@ -1,11 +1,9 @@
-from quafu.circuits.quantum_circuit import QuantumCircuit
 import matplotlib.pyplot as plt
+from quafu.circuits.quantum_circuit import QuantumCircuit
 
-"""
-// quantum ripple-carry adder from Cuccaro et al, quant-ph/0410184
-OPENQASM 2.0;
-include "qelib1.inc";
-"""
+# // quantum ripple-carry adder from Cuccaro et al, quant-ph/0410184
+# OPENQASM 2.0;
+# include "qelib1.inc";
 
 n = 10
 qc = QuantumCircuit(n)
@@ -76,7 +74,6 @@ def creg(_i, name):
 qc.x(qreg(0, "a"))
 for i in range(4):
     qc.x(qreg(i, "b"))
-
 """
 // add a to b, storing result in b
 majority cin[0],b[0],a[0];
@@ -97,7 +94,6 @@ def creg(_i, name):
 unmaj(qreg(2, "a"), qreg(3, "b"), qreg(3, "a"))
 unmaj(qreg(1, "a"), qreg(2, "b"), qreg(2, "a"))
 unmaj(qreg(0, "a"), qreg(1, "b"), qreg(1, "a"))
-
 """
 measure b[0] -> ans[0];
 measure b[1] -> ans[1];
diff --git a/quafu/benchmark/deutsch_jozsa.py b/quafu/benchmark/deutsch_jozsa.py
index 0f3b4ae..4fce01f 100644
--- a/quafu/benchmark/deutsch_jozsa.py
+++ b/quafu/benchmark/deutsch_jozsa.py
@@ -1,7 +1,7 @@
 import random
+
 import matplotlib.pyplot as plt
 import numpy as np
-
 from quafu.circuits.quantum_circuit import QuantumCircuit
 from quafu.visualisation.circuitPlot import CircuitPlotManager
 
diff --git a/quafu/benchmark/unitary_test.py b/quafu/benchmark/unitary_test.py
index 7b91e8e..b2d3e65 100644
--- a/quafu/benchmark/unitary_test.py
+++ b/quafu/benchmark/unitary_test.py
@@ -1,4 +1,5 @@
 from scipy.stats import unitary_group
+
 from quafu import QuantumCircuit
 
 nqubit = 5
diff --git a/quafu/benchmark/variational/brickwall_circuit.py b/quafu/benchmark/variational/brickwall_circuit.py
index d18f082..786c439 100644
--- a/quafu/benchmark/variational/brickwall_circuit.py
+++ b/quafu/benchmark/variational/brickwall_circuit.py
@@ -1,5 +1,5 @@
-from quafu.circuits.quantum_circuit import QuantumCircuit
 from numpy import random
+from quafu.circuits.quantum_circuit import QuantumCircuit
 
 qubit_num = 6
 n_layers = 2
diff --git a/quafu/benchmark/variational/ladder_circuit.py b/quafu/benchmark/variational/ladder_circuit.py
index e882e66..44b3ceb 100644
--- a/quafu/benchmark/variational/ladder_circuit.py
+++ b/quafu/benchmark/variational/ladder_circuit.py
@@ -1,6 +1,5 @@
-from quafu.circuits.quantum_circuit import QuantumCircuit
 from numpy import random
-
+from quafu.circuits.quantum_circuit import QuantumCircuit
 
 # number of qubits, number of layers
 bit_num, n_layers = 4, 2
diff --git a/quafu/circuits/__init__.py b/quafu/circuits/__init__.py
index 5696904..0772fef 100644
--- a/quafu/circuits/__init__.py
+++ b/quafu/circuits/__init__.py
@@ -1,5 +1,5 @@
 """Quafu quantum circuits"""
 
+from .classical_register import ClassicalRegister
 from .quantum_circuit import QuantumCircuit
 from .quantum_register import QuantumRegister
-from .classical_register import ClassicalRegister 
diff --git a/quafu/circuits/classical_register.py b/quafu/circuits/classical_register.py
index d42b5d2..59fa098 100644
--- a/quafu/circuits/classical_register.py
+++ b/quafu/circuits/classical_register.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 class ClassicalRegister:
     """
     Collection of cbit(s)
@@ -28,7 +29,7 @@ def __getitem__(self, item):
         if item < self.num:
             return self.pos_start + item
         else:
-            raise IndexError('Index out of range:', item)
+            raise IndexError("Index out of range:", item)
 
     def __iter__(self):
         self._i = 0
diff --git a/quafu/circuits/quantum_circuit.py b/quafu/circuits/quantum_circuit.py
index 7ec4f3b..253a1d8 100644
--- a/quafu/circuits/quantum_circuit.py
+++ b/quafu/circuits/quantum_circuit.py
@@ -16,26 +16,24 @@
 from typing import Any, List
 
 import numpy as np
-
 import quafu.elements.element_gates as qeg
+from quafu.elements import Measure, Reset
 from quafu.elements.classical_element import Cif
 from quafu.elements.instruction import Instruction
-from quafu.elements import Measure, Reset
 from quafu.elements.pulses import QuantumPulse
+
 from ..elements import (
     Barrier,
+    ControlledGate,
     Delay,
     MultiQubitGate,
     QuantumGate,
-    ControlledGate,
     SingleQubitGate,
     XYResonance,
 )
-from .quantum_register import QuantumRegister
-from .classical_register import ClassicalRegister
 from ..exceptions import CircuitError
-
-import copy
+from .classical_register import ClassicalRegister
+from .quantum_register import QuantumRegister
 
 
 class QuantumCircuit(object):
@@ -97,8 +95,9 @@ def measures(self, measures: dict):
     @property
     def gates(self):
         """Deprecated warning: due to historical reason, ``gates`` contains not only instances of
-                      QuantumGate, meanwhile not contains measurements. This attributes might be deprecated in
-                      the future. Better to use ``instructions`` which contains all the instructions."""
+        QuantumGate, while it does not contain measurements. This attribute might be deprecated in
+        the future. Better to use ``instructions``, which contains all the instructions.
+        """
         return self._gates
 
     @gates.setter
@@ -159,18 +158,18 @@ def layered_circuit(self) -> np.ndarray:
         used_qubits = []
         for gate in gatelist:
             if (
-                    isinstance(gate, SingleQubitGate)
-                    or isinstance(gate, Delay)
-                    or isinstance(gate, QuantumPulse)
+                isinstance(gate, SingleQubitGate)
+                or isinstance(gate, Delay)
+                or isinstance(gate, QuantumPulse)
             ):
                 gateQlist[gate.pos].append(gate)
                 if gate.pos not in used_qubits:
                     used_qubits.append(gate.pos)
 
             elif (
-                    isinstance(gate, Barrier)
-                    or isinstance(gate, MultiQubitGate)
-                    or isinstance(gate, XYResonance)
+                isinstance(gate, Barrier)
+                or isinstance(gate, MultiQubitGate)
+                or isinstance(gate, XYResonance)
             ):
                 pos1 = min(gate.pos)
                 pos2 = max(gate.pos)
@@ -195,17 +194,17 @@ def layered_circuit(self) -> np.ndarray:
         def get_used_qubits(instructions):
             used_q = []
             for ins in instructions:
-                if (isinstance(ins, Cif)):
+                if isinstance(ins, Cif):
                     used_q_h = get_used_qubits(ins.instructions)
                     for pos in used_q_h:
                         if pos not in used_q:
                             used_q.append(pos)
-                elif (isinstance(ins, Barrier)):
+                elif isinstance(ins, Barrier):
                     continue
-                elif (isinstance(ins.pos, int)):
+                elif isinstance(ins.pos, int):
                     if ins.pos not in used_q:
                         used_q.append(ins.pos)
-                elif (isinstance(ins.pos, list)):
+                elif isinstance(ins.pos, list):
                     for pos in ins.pos:
                         if pos not in used_q:
                             used_q.append(pos)
@@ -263,9 +262,9 @@ def draw_circuit(self, width: int = 4, return_str: bool = False):
             for i in range(num):
                 gate = layergates[i]
                 if (
-                        isinstance(gate, SingleQubitGate)
-                        or isinstance(gate, Delay)
-                        or (isinstance(gate, QuantumPulse))
+                    isinstance(gate, SingleQubitGate)
+                    or isinstance(gate, Delay)
+                    or (isinstance(gate, QuantumPulse))
                 ):
                     printlist[i * 2, l] = gate.symbol
                     maxlen = max(maxlen, len(gate.symbol) + width)
@@ -273,7 +272,7 @@ def draw_circuit(self, width: int = 4, return_str: bool = False):
                 elif isinstance(gate, MultiQubitGate) or isinstance(gate, XYResonance):
                     q1 = reduce_map[min(gate.pos)]
                     q2 = reduce_map[max(gate.pos)]
-                    printlist[2 * q1 + 1: 2 * q2, l] = "|"
+                    printlist[2 * q1 + 1 : 2 * q2, l] = "|"
                     printlist[q1 * 2, l] = "#"
                     printlist[q2 * 2, l] = "#"
                     if isinstance(gate, ControlledGate):  # Controlled-Multiqubit gate
@@ -309,7 +308,7 @@ def draw_circuit(self, width: int = 4, return_str: bool = False):
                     pos = [i for i in gate.pos if i in reduce_map.keys()]
                     q1 = reduce_map[min(pos)]
                     q2 = reduce_map[max(pos)]
-                    printlist[2 * q1: 2 * q2 + 1, l] = "||"
+                    printlist[2 * q1 : 2 * q2 + 1, l] = "||"
                     maxlen = max(maxlen, len("||"))
 
             printlist[-1, l] = maxlen
@@ -356,6 +355,7 @@ def from_openqasm(self, openqasm: str):
             openqasm: input openqasm str.
         """
         from quafu.qfasm.qfasm_convertor import qasm2_to_quafu_qc
+
         return qasm2_to_quafu_qc(self, openqasm)
 
     def to_openqasm(self) -> str:
@@ -381,9 +381,10 @@ def wrap_to_gate(self, name: str):
         """
         Wrap the circuit to a subclass of QuantumGate, create by metaclass.
         """
-        from quafu.elements.oracle import customize_gate
         from copy import deepcopy
 
+        from quafu.elements.oracle import customize_gate
+
         # TODO: check validity of instructions
         gate_structure = [deepcopy(ins) for ins in self.instructions]
         customized = customize_gate(name, gate_structure, self.num)
@@ -827,10 +828,10 @@ def delay(self, pos, duration, unit="ns") -> "QuantumCircuit":
     def reset(self, qlist: List[int] = None) -> "QuantumCircuit":
         """
         Add reset for qubits in qlist.
-     
+
         Args:
             qlist (list[int]): A list contain the qubit need add reset. When qlist contain at least two qubit, the barrier will be added from minimum qubit to maximum qubit. For example: barrier([0, 2]) create barrier for qubits 0, 1, 2. To create discrete barrier, using barrier([0]), barrier([2]).
-        
+
         Note: reset only support for simulator `qfvm_circ`.
         """
         if qlist is None:
@@ -862,7 +863,9 @@ def measure(self, pos: List[int] = None, cbits: List[int] = None) -> None:
             if not len(set(cbits)) == len(cbits):
                 raise ValueError("Classical bits not uniquely assigned.")
             if not len(cbits) == n_num:
-                raise ValueError("Number of measured bits should equal to the number of classical bits")
+                raise ValueError(
+                    "Number of measured bits should equal to the number of classical bits"
+                )
         else:
             cbits = list(range(e_num, e_num + n_num))
 
@@ -879,26 +882,26 @@ def measure(self, pos: List[int] = None, cbits: List[int] = None) -> None:
     @contextmanager
     def cif(self, cbits: List[int], condition: int):
         """
-        Create an `if` statement on this circuit. 
-        If cbits equals to condition, the subsequent operaterations will be performed. 
+        Create an `if` statement on this circuit.
+        If cbits equals condition, the subsequent operations will be performed.
         Use  the `measure` statement to explicitly assign value to the cbit before using it as `cbits` argument
 
         Args:
             cbits: List of cbit that are used for comparison.
-            condition(int): A condition to be evaluated with cbits that filled by `measure` operation. 
+            condition(int): A condition to be evaluated against the cbits that are filled by the `measure` operation.
+
 
-        
         For example::
             from quafu import QuantumCircuit
             qc = QuantumCircuit(2,2)
-            
+
             qc.h(0)
             qc.cx(0,1)
             qc.measure([0],[0])
             with qc.cif(cbits=[0], condition=1):
                 qc.x(2)
             qc.measure([2],[2])
-            
+
         Note: cif only support for simulator `qfvm_circ`.
         """
         # check cbits
@@ -917,10 +920,13 @@ def cif(self, cbits: List[int], condition: int):
 
         instructions = []
         for i in range(len(self.instructions) - 1, -1, -1):
-            if isinstance(self.instructions[i], Cif) and self.instructions[i].instructions is None:
+            if (
+                isinstance(self.instructions[i], Cif)
+                and self.instructions[i].instructions is None
+            ):
                 instructions.reverse()
                 self.instructions[i].set_ins(instructions)
-                self.instructions = self.instructions[0:i + 1]
+                self.instructions = self.instructions[0 : i + 1]
                 return
             else:
                 instructions.append(self.instructions[i])
diff --git a/quafu/circuits/quantum_register.py b/quafu/circuits/quantum_register.py
index 9f9f32c..37a5fb0 100644
--- a/quafu/circuits/quantum_register.py
+++ b/quafu/circuits/quantum_register.py
@@ -58,13 +58,15 @@ class QuantumRegister:
 
     def __init__(self, num: int = 0, name: str = None):
         self.name = name
-        self.qubits = OrderedDict({i: Qubit(logic_pos=i, reg_name=name) for i in range(num)})
+        self.qubits = OrderedDict(
+            {i: Qubit(logic_pos=i, reg_name=name) for i in range(num)}
+        )
 
     def __getitem__(self, item):
         if item < len(self.qubits):
             return self.qubits[item]
         else:
-            raise IndexError('Index out of range:', item)
+            raise IndexError("Index out of range:", item)
 
     def __iter__(self):
         self._i = 0
diff --git a/quafu/dagcircuits/__init__.py b/quafu/dagcircuits/__init__.py
index 8b13789..e69de29 100755
--- a/quafu/dagcircuits/__init__.py
+++ b/quafu/dagcircuits/__init__.py
@@ -1 +0,0 @@
-
diff --git a/quafu/dagcircuits/circuit_dag.py b/quafu/dagcircuits/circuit_dag.py
index 9be4d44..dbdfea7 100644
--- a/quafu/dagcircuits/circuit_dag.py
+++ b/quafu/dagcircuits/circuit_dag.py
@@ -2,17 +2,17 @@
 from typing import Any, List
 
 import networkx as nx
-
-from quafu import QuantumCircuit
-from quafu.dagcircuits.dag_circuit import (
+from quafu.dagcircuits.dag_circuit import (  # dag_circuit.py in the same folder as circuit_dag.py now
     DAGCircuit,
-)  # dag_circuit.py in the same folder as circuit_dag.py now
-from quafu.dagcircuits.instruction_node import (
+)
+from quafu.dagcircuits.instruction_node import (  # instruction_node.py in the same folder as circuit_dag.py now
     InstructionNode,
-)  # instruction_node.py in the same folder as circuit_dag.py now
-from quafu.elements.element_gates import *
+)
 from quafu.elements import Barrier, Delay, Measure, XYResonance
-from quafu.elements.pulses import GaussianPulse, RectPulse, FlattopPulse
+from quafu.elements.element_gates import *
+from quafu.elements.pulses import FlattopPulse, GaussianPulse, RectPulse
+
+from quafu import QuantumCircuit
 
 
 # transform a gate in quantumcircuit of quafu(not include measure_gate),
diff --git a/quafu/dagcircuits/dag_circuit.py b/quafu/dagcircuits/dag_circuit.py
index b3f9aec..5f66211 100644
--- a/quafu/dagcircuits/dag_circuit.py
+++ b/quafu/dagcircuits/dag_circuit.py
@@ -2,7 +2,6 @@
 
 import networkx as nx
 from networkx.classes.multidigraph import MultiDiGraph
-
 from quafu.dagcircuits.instruction_node import InstructionNode
 
 
diff --git a/quafu/dagcircuits/instruction_node.py b/quafu/dagcircuits/instruction_node.py
index a474d7f..24343c1 100644
--- a/quafu/dagcircuits/instruction_node.py
+++ b/quafu/dagcircuits/instruction_node.py
@@ -1,5 +1,5 @@
-from typing import Dict, Any, List, Union
 import dataclasses
+from typing import Any, Dict, List, Union
 
 
 @dataclasses.dataclass
diff --git a/quafu/elements/classical_element.py b/quafu/elements/classical_element.py
index 8e425c2..3e1172e 100644
--- a/quafu/elements/classical_element.py
+++ b/quafu/elements/classical_element.py
@@ -14,12 +14,13 @@
 
 # Classes of classical operation.
 
-from typing import List, Dict
+from typing import Dict, List
+
 from quafu.elements.instruction import Instruction
 
 
 class Cif(Instruction):
-    name = 'cif'
+    name = "cif"
     named_paras = None
 
     def __init__(self, cbits: List[int], condition: int, instructions=None):
@@ -31,7 +32,7 @@ def __init__(self, cbits: List[int], condition: int, instructions=None):
 
     @property
     def named_pos(self) -> Dict:
-        return {'cbits': self.cbits}
+        return {"cbits": self.cbits}
 
     def to_qasm(self):
         raise NotImplementedError
diff --git a/quafu/elements/instruction.py b/quafu/elements/instruction.py
index 55a78de..58ae2f3 100644
--- a/quafu/elements/instruction.py
+++ b/quafu/elements/instruction.py
@@ -13,10 +13,9 @@
 #  limitations under the License.
 
 from abc import ABC, abstractmethod
-from typing import Union, List, Dict
+from typing import Dict, List, Union
 
-
-__all__ = ['Instruction', 'Barrier', 'Measure', 'PosType', 'ParaType', 'Reset']
+__all__ = ["Instruction", "Barrier", "Measure", "PosType", "ParaType", "Reset"]
 
 PosType = Union[int, List[int]]
 ParaType = Union[float, int, List]
@@ -30,6 +29,7 @@ class Instruction(ABC):
         paras: Parameters of the instruction.
 
     """
+
     ins_classes = {}
 
     def __init__(self, pos: PosType, paras: ParaType = None, *args, **kwargs):
@@ -54,7 +54,7 @@ def named_paras(self) -> Dict:
     @abstractmethod
     def named_pos(self) -> Dict:
         """dict-mapping for positions"""
-        return {'pos': self.pos}
+        return {"pos": self.pos}
 
     @name.setter
     def name(self, _):
@@ -91,6 +91,7 @@ class Barrier(Instruction):
     """
     Barrier instruction.
     """
+
     name = "barrier"
 
     # def to_dag_node(self):
@@ -123,11 +124,13 @@ def __repr__(self):
         return f"{self.__class__.__name__}"
 
     def to_qasm(self):
-        return "barrier " + ",".join(["q[%d]" % p for p in range(min(self.pos), max(self.pos) + 1)])
+        return "barrier " + ",".join(
+            ["q[%d]" % p for p in range(min(self.pos), max(self.pos) + 1)]
+        )
 
 
 class Reset(Instruction):
-    name = 'reset'
+    name = "reset"
 
     def __init__(self, pos):
         super().__init__(pos)
@@ -161,6 +164,7 @@ class Measure(Instruction):
     """
     Measure instruction.
     """
+
     name = "measure"
 
     def __init__(self, bitmap: dict):
@@ -170,18 +174,20 @@ def __init__(self, bitmap: dict):
 
     @property
     def named_pos(self):
-        return {'pos': self.pos}  # TODO
+        return {"pos": self.pos}  # TODO
 
     @property
     def named_paras(self):
         return self.named_paras
 
     def to_qasm(self):
-        lines = ["measure q[%d] -> meas[%d];\n" % (q, c) for q, c in zip(self.qbits, self.cbits)]
-        qasm = ''.join(lines)
+        lines = [
+            "measure q[%d] -> meas[%d];\n" % (q, c)
+            for q, c in zip(self.qbits, self.cbits)
+        ]
+        qasm = "".join(lines)
         return qasm
 
 
 Instruction.register_ins(Barrier)
 Instruction.register_ins(Measure)
-
diff --git a/quafu/elements/matrices/__init__.py b/quafu/elements/matrices/__init__.py
index f69bd9f..90066fa 100644
--- a/quafu/elements/matrices/__init__.py
+++ b/quafu/elements/matrices/__init__.py
@@ -1,2 +1,2 @@
 from .mat_lib import *
-from .mat_utils import stack_matrices, is_zero, is_hermitian
+from .mat_utils import is_hermitian, is_zero, stack_matrices
diff --git a/quafu/elements/matrices/mat_utils.py b/quafu/elements/matrices/mat_utils.py
index 8d55ed2..cde8619 100644
--- a/quafu/elements/matrices/mat_utils.py
+++ b/quafu/elements/matrices/mat_utils.py
@@ -1,17 +1,16 @@
 import cmath
+from typing import List
 
 import numpy as np
 from numpy import ndarray
 
 from .mat_lib import IdMatrix
 
-from typing import List
-
 
 def reorder_matrix(matrix: np.ndarray, pos: List):
-    """Reorder the input sorted matrix to the pos order """
+    """Reorder the input sorted matrix to the pos order"""
     qnum = len(pos)
-    dim = 2 ** qnum
+    dim = 2**qnum
     inds = np.argsort(pos)
     inds = np.concatenate([inds, inds + qnum])
     tensorm = matrix.reshape([2] * 2 * qnum)
@@ -105,7 +104,7 @@ def is_kron_with_id2(matrix):
 
 #######################################################
 def get_global_phase(unitary):
-    """ Get the global phase of arbitrary unitary, and get the special unitary.
+    """Get the global phase of arbitrary unitary, and get the special unitary.
 
     Args:
         unitary (np.array): arbitrary unitary
@@ -120,7 +119,7 @@ def get_global_phase(unitary):
 
 
 def matrix_distance_squared(unitary1, unitary2):
-    """ Used to compare the distance of two matrices. The global phase is ignored.
+    """Used to compare the distance of two matrices. The global phase is ignored.
 
     Args:
         unitary1 (np.array): A unitary matrix in the form of a numpy ndarray.
@@ -130,4 +129,6 @@ def matrix_distance_squared(unitary1, unitary2):
         Float : A single value between 0 and 1 indicating how closely unitary1 and unitary2 match.
         A value close to 0 indicates that unitary1 and unitary2 are the same unitary.
     """
-    return np.abs(1 - np.abs(np.sum(np.multiply(unitary1, np.conj(unitary2)))) / unitary1.shape[0])
+    return np.abs(
+        1 - np.abs(np.sum(np.multiply(unitary1, np.conj(unitary2)))) / unitary1.shape[0]
+    )
diff --git a/quafu/elements/oracle.py b/quafu/elements/oracle.py
index 0fed127..794eb3d 100644
--- a/quafu/elements/oracle.py
+++ b/quafu/elements/oracle.py
@@ -12,10 +12,11 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 
+import copy
 from abc import ABCMeta
-from quafu.elements import QuantumGate, Instruction
 from typing import Dict, Iterable, List
-import copy
+
+from quafu.elements import Instruction, QuantumGate
 
 
 class OracleGateMeta(ABCMeta):
@@ -24,21 +25,24 @@ class OracleGateMeta(ABCMeta):
     """
 
     def __init__(cls, name, bases, attrs):
-        for attr_name in ['cls_name', 'gate_structure', 'qubit_num']:
-            assert attr_name in attrs, f"OracleGateMeta: {attr_name} not found in {attrs}."
+        for attr_name in ["cls_name", "gate_structure", "qubit_num"]:
+            assert (
+                attr_name in attrs
+            ), f"OracleGateMeta: {attr_name} not found in {attrs}."
 
         # TODO: check if instructions inside gate_structure are valid
 
         super().__init__(name, bases, attrs)
-        cls.name = attrs.__getitem__('cls_name')
-        cls.gate_structure = attrs.__getitem__('gate_structure')
-        cls.qubit_num = attrs.__getitem__('qubit_num')
+        cls.name = attrs.__getitem__("cls_name")
+        cls.gate_structure = attrs.__getitem__("gate_structure")
+        cls.qubit_num = attrs.__getitem__("qubit_num")
 
 
 class OracleGate(QuantumGate):  # TODO: Can it be related to OracleGateMeta explicitly?
     """
     OracleGate is a gate that can be customized by users.
     """
+
     name = None
     gate_structure = []
     qubit_num = 0
@@ -54,7 +58,9 @@ def __init__(self, pos: List, paras=None, label: str = None):
             label: label when draw or plot
         """
         if not self.qubit_num == len(pos):
-            raise ValueError(f"OracleGate: qubit number {self.qubit_num} does not match pos length {len(pos)}.")
+            raise ValueError(
+                f"OracleGate: qubit number {self.qubit_num} does not match pos length {len(pos)}."
+            )
         super().__init__(pos=pos, paras=paras)
 
         self.__instantiate_gates__()
@@ -68,7 +74,7 @@ def matrix(self):
 
     @property
     def named_pos(self) -> Dict:
-        return {'pos': self.pos}
+        return {"pos": self.pos}
 
     @property
     def named_paras(self) -> Dict:
@@ -97,14 +103,15 @@ def map_pos(pos):
             gate_ = copy.deepcopy(gate)
             for key, val in gate.named_pos.items():
                 setattr(gate_, key, map_pos(val))
-            setattr(gate_, 'pos', map_pos(gate.pos))
+            setattr(gate_, "pos", map_pos(gate.pos))
             self.insides.append(gate_)
 
 
-def customize_gate(cls_name: str,
-                   gate_structure: List[Instruction],
-                   qubit_num: int,
-                   ):
+def customize_gate(
+    cls_name: str,
+    gate_structure: List[Instruction],
+    qubit_num: int,
+):
     """
     Helper function to create customized gate class
 
@@ -122,10 +129,11 @@ def customize_gate(cls_name: str,
     if cls_name in QuantumGate.gate_classes:
         raise ValueError(f"Gate class {cls_name} already exists.")
 
-    attrs = {'cls_name': cls_name,
-             'gate_structure': gate_structure,
-             'qubit_num': qubit_num,
-             }
+    attrs = {
+        "cls_name": cls_name,
+        "gate_structure": gate_structure,
+        "qubit_num": qubit_num,
+    }
 
     customized_cls = OracleGateMeta(cls_name, (OracleGate,), attrs)
     assert issubclass(customized_cls, OracleGate)
diff --git a/quafu/elements/pulses.py b/quafu/elements/pulses.py
index 78699e0..a80355f 100644
--- a/quafu/elements/pulses.py
+++ b/quafu/elements/pulses.py
@@ -1,10 +1,9 @@
 from abc import ABC, abstractmethod
 from copy import deepcopy
-from typing import Union, Optional
+from typing import Optional, Union
 
 import matplotlib.pyplot as plt
 import numpy as np
-
 from quafu.elements.instruction import Instruction, PosType
 
 TimeType = Union[np.ndarray, float, int]
@@ -275,7 +274,8 @@ def __init__(self, qs: int, qe: int, duration: int, unit="ns"):
 
     def to_qasm(self):
         return "xy(%d%s) " % (self.duration, self.unit) + ",".join(
-            ["q[%d]" % p for p in range(min(self.pos), max(self.pos) + 1)])
+            ["q[%d]" % p for p in range(min(self.pos), max(self.pos) + 1)]
+        )
 
 
 QuantumPulse.register_pulse(RectPulse)
diff --git a/quafu/qfasm/qelib1.inc b/quafu/qfasm/qelib1.inc
index 8f510a5..24b9b2a 100644
--- a/quafu/qfasm/qelib1.inc
+++ b/quafu/qfasm/qelib1.inc
@@ -4,9 +4,9 @@
 // --- QE Hardware primitives ---
 
 // 3-parameter 2-pulse single qubit gate
-gate u3(theta,phi,lambda) q 
-{ 
-  U(theta,phi,lambda) q; 
+gate u3(theta,phi,lambda) q
+{
+  U(theta,phi,lambda) q;
 }
 // 2-parameter 1-pulse single qubit gate
 gate u2(phi,lambda) q { U(pi/2,phi,lambda) q; }
diff --git a/quafu/qfasm/qfasm_convertor.py b/quafu/qfasm/qfasm_convertor.py
index 6a2f301..950a65e 100644
--- a/quafu/qfasm/qfasm_convertor.py
+++ b/quafu/qfasm/qfasm_convertor.py
@@ -13,8 +13,10 @@
 # limitations under the License.
 
 from quafu.circuits.quantum_circuit import QuantumCircuit
+
 from .qfasm_parser import QfasmParser
 
+
 def qasm_to_quafu(openqasm: str):
     """
     Initialize pyquafu circuit from openqasm text, mainly by
@@ -46,7 +48,7 @@ def qasm2_to_quafu_qc(qc: QuantumCircuit, openqasm: str):
     qc.openqasm = openqasm
     qc.gates = newqc.gates
     qc.instructions = newqc.instructions
-    qc._measures = newqc._measures 
+    qc._measures = newqc._measures
     qc.qregs = newqc.qregs
     qc.cregs = newqc.cregs
     qc.executable_on_backend = newqc.executable_on_backend
diff --git a/quafu/qfasm/qfasm_lexer.py b/quafu/qfasm/qfasm_lexer.py
index 8d4ac48..c73cc6c 100644
--- a/quafu/qfasm/qfasm_lexer.py
+++ b/quafu/qfasm/qfasm_lexer.py
@@ -18,9 +18,11 @@
 # interaction with quantum hardware
 
 import os
+
 import ply.lex as lex
-from .qfasm_utils import Id
+
 from .exceptions import LexerError
+from .qfasm_utils import Id
 
 
 class QfasmLexer(object):
diff --git a/quafu/qfasm/qfasm_parser.py b/quafu/qfasm/qfasm_parser.py
index 6c6c00d..ab79378 100644
--- a/quafu/qfasm/qfasm_parser.py
+++ b/quafu/qfasm/qfasm_parser.py
@@ -12,21 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import copy
-import ply.yacc as yacc
 
-from .qfasm_utils import *
-from quafu.circuits.quantum_register import QuantumRegister
+import numpy as np
+import ply.yacc as yacc
 from quafu.circuits.classical_register import ClassicalRegister
+from quafu.circuits.quantum_register import QuantumRegister
+from quafu.elements import *
+from quafu.elements.classical_element import Cif
 from quafu.qfasm.exceptions import ParserError
 
-from .qfasm_lexer import QfasmLexer
-import numpy as np
 from quafu import QuantumCircuit
-from quafu.elements import *
-from quafu.elements.classical_element import Cif
 
+from .qfasm_lexer import QfasmLexer
+from .qfasm_utils import *
 
 unaryop = ["sin", "cos", "tan", "exp", "ln", "sqrt", "acos", "atan", "asin"]
 unarynp = {
@@ -140,7 +139,7 @@ def updateSymtab(self, symtabnode: SymtabNode):
                 self.circuit.cregs.append(ClassicalRegister(self.cnum, name="c"))
             else:
                 self.circuit.cregs[0] = ClassicalRegister(self.cnum, name="c")
-            
+
         if symtabnode.is_global:
             self.global_symtab[symtabnode.name] = symtabnode
         else:
@@ -205,9 +204,7 @@ def handle_gateins(self, gateins: GateInstruction):
                         qlist.append(symnode.start + i)
                     gate_list.append(Reset(qlist))
                 elif isinstance(qarg, IndexedId):
-                    gate_list.append(
-                        Reset([symnode.start + qarg.num])
-                    )
+                    gate_list.append(Reset([symnode.start + qarg.num]))
 
         elif gateins.name == "barrier":
             qlist = []
diff --git a/quafu/qfasm/qfasm_utils.py b/quafu/qfasm/qfasm_utils.py
index 997f4ad..b88e2b2 100644
--- a/quafu/qfasm/qfasm_utils.py
+++ b/quafu/qfasm/qfasm_utils.py
@@ -106,12 +106,7 @@ def fill_gate(self, qargs, instructions=None, cargs=None):
             self.cargs = cargs
 
 
-from quafu.elements import (
-    Measure,
-    Barrier,
-    Delay,
-    XYResonance,
-)
+from quafu.elements import Barrier, Delay, Measure, XYResonance
 
 gate_classes = {
     "x": qeg.XGate,
diff --git a/quafu/results/results.py b/quafu/results/results.py
index 097019d..d913d55 100644
--- a/quafu/results/results.py
+++ b/quafu/results/results.py
@@ -2,6 +2,7 @@
 from collections import OrderedDict
 
 import matplotlib.pyplot as plt
+
 from ..utils.basis import *
 
 
@@ -23,15 +24,22 @@ class ExecResult(Result):
     """
 
     def __init__(self, input_dict):
-        status_map = {0: "In Queue", 1: "Running", 2: "Completed", "Canceled": 3, 4: "Failed"}
-        self.taskid = input_dict['task_id']
-        self.taskname = input_dict['task_name']
+        status_map = {
+            0: "In Queue",
+            1: "Running",
+            2: "Completed",
+            "Canceled": 3,
+            4: "Failed",
+        }
+        self.taskid = input_dict["task_id"]
+        self.taskname = input_dict["task_name"]
         self.transpiled_openqasm = input_dict["openqasm"]
         from ..circuits.quantum_circuit import QuantumCircuit
+
         self.transpiled_circuit = QuantumCircuit(0)
         self.transpiled_circuit.from_openqasm(self.transpiled_openqasm)
         self.measure_base = []
-        
+
         self.measures = self.transpiled_circuit.measures
         self.task_status = status_map[input_dict["status"]]
         self.res = eval(input_dict["res"])
@@ -82,8 +90,12 @@ class SimuResult(Result):
         count_dict: The num of cbits measured. Only support for `qfvm_circuit`.
     """
 
-    def __init__(self, input, input_form, count_dict:dict=None):
-        self.num = int(np.log2(input.shape[0]))
+    def __init__(self, input, input_form, count_dict: dict = None):
+        if input_form != "count_dict":
+            self.num = int(np.log2(input.shape[0]))
+        else:
+            # input is num qubits
+            self.num = input
         if input_form == "density_matrix":
             self.rho = np.array(input)
             self.probabilities = np.diag(input)
@@ -91,13 +103,16 @@ def __init__(self, input, input_form, count_dict:dict=None):
             self.probabilities = input
         elif input_form == "state_vector":
             self.state_vector = input
+        elif input_form == "count_dict":
+            # do nothing, only count dict
+            pass
         # come form c++ simulator
         # TODO: add count for py_simu
         if count_dict is not None:
             self.count = {}
-            for key,value in count_dict.items():
+            for key, value in count_dict.items():
                 bitstr = bin(key)[2:].zfill(self.num)
-                self.count[bitstr] = value               
+                self.count[bitstr] = value
 
     def plot_probabilities(
         self, full: bool = False, reverse_basis: bool = False, sort: bool = None
diff --git a/quafu/simulators/default_simulator.py b/quafu/simulators/default_simulator.py
index 45e991d..4aafaaa 100644
--- a/quafu/simulators/default_simulator.py
+++ b/quafu/simulators/default_simulator.py
@@ -11,26 +11,25 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """default circuit simulator for state vector"""
 
+import copy
 from typing import Iterable, List, Union
+
+import numpy as np
 from quafu.circuits.quantum_circuit import QuantumCircuit
-from ..results.results import SimuResult
+from scipy.sparse import coo_matrix, eye, kron
+from sparse import COO, SparseArray
+
 from ..elements import (
     Barrier,
     Delay,
+    MultiQubitGate,
     QuantumGate,
     SingleQubitGate,
-    MultiQubitGate,
     XYResonance,
 )
-import numpy as np
-from functools import reduce
-from sparse import COO, SparseArray
-from scipy.sparse import kron, eye, coo_matrix
-
-import copy
+from ..results.results import SimuResult
 
 
 def global_op(gate: QuantumGate, global_qubits: List) -> coo_matrix:
diff --git a/quafu/simulators/simulator.py b/quafu/simulators/simulator.py
index 5dac435..609b282 100644
--- a/quafu/simulators/simulator.py
+++ b/quafu/simulators/simulator.py
@@ -11,15 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """simulator for quantum circuit and qasm"""
 
 from typing import Union
-from .default_simulator import py_simulate, ptrace, permutebits
-from quafu import QuantumCircuit
-from ..results.results import SimuResult
+
 import numpy as np
+
+from quafu import QuantumCircuit
+
 from ..exceptions import QuafuError
+from ..results.results import SimuResult
+from .default_simulator import permutebits, ptrace, py_simulate
 
 
 def simulate(
@@ -78,11 +80,13 @@ def simulate(
 
     count_dict = None
     from .qfvm import simulate_circuit
+
     # simulate
     if simulator == "qfvm_circ":
         if use_gpu:
             if qc.executable_on_backend == False:
                 raise QuafuError("classical operation only support for `qfvm_qasm`")
+
             if use_custatevec:
                 try:
                     from .qfvm import simulate_circuit_custate
@@ -97,18 +101,26 @@ def simulate(
                 psi = simulate_circuit_gpu(qc, psi)
         else:
             count_dict, psi = simulate_circuit(qc, psi, shots)
-            
+
+    elif simulator == "qfvm_clifford":
+        try:
+            from .qfvm import simulate_circuit_clifford
+        except ImportError:
+            raise QuafuError("you are not using the clifford version of pyquafu")
+
+        count_dict = simulate_circuit_clifford(qc, shots)
+
     elif simulator == "py_simu":
         if qc.executable_on_backend == False:
             raise QuafuError("classical operation only support for `qfvm_qasm`")
         psi = py_simulate(qc, psi)
-        
+
     elif simulator == "qfvm_qasm":
         psi = simulate_circuit(qc, psi, shots)
-        
+
     else:
         raise ValueError("invalid circuit")
-    
+
     if output == "density_matrix":
         if simulator in ["qfvm_circ", "qfvm_qasm"]:
             psi = permutebits(psi, range(num)[::-1])
@@ -126,6 +138,9 @@ def simulate(
     elif output == "state_vector":
         return SimuResult(psi, output, count_dict)
 
+    elif output == "count_dict":
+        return SimuResult(max(qc.used_qubits) + 1, output, count_dict)
+
     else:
         raise ValueError(
             "output should in be 'density_matrix', 'probabilities', or 'state_vector'"
diff --git a/quafu/simulators/torch.py b/quafu/simulators/torch.py
index 57c9f07..6be7c1e 100644
--- a/quafu/simulators/torch.py
+++ b/quafu/simulators/torch.py
@@ -11,5 +11,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Simulate the execution of a quantum circuit using pytorch"""
diff --git a/quafu/synthesis/evolution.py b/quafu/synthesis/evolution.py
index 46d40fe..76d07fc 100644
--- a/quafu/synthesis/evolution.py
+++ b/quafu/synthesis/evolution.py
@@ -11,10 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """Evolution that generate gate sequence based on operator"""
 
 from abc import ABC, abstractmethod
+
 import numpy as np
 import quafu.elements.element_gates as qeg
 
@@ -128,25 +128,25 @@ def diagonalizing_clifford(pauli: str):
 def cnot_chain(pauli: str):
     """CX chain.
 
-        For example, for the Pauli with the label 'XYZIX'.
+    For example, for the Pauli with the label 'XYZIX'.
 
-                           ┌───┐
-            q_0: ──────────┤ X ├
-                           └─┬─┘
-            q_1: ────────────┼──
-                      ┌───┐  │
-            q_2: ─────┤ X ├──■──
-                 ┌───┐└─┬─┘
-            q_3: ┤ X ├──■───────
-                 └─┬─┘
-            q_4: ──■────────────
+                       ┌───┐
+        q_0: ──────────┤ X ├
+                       └─┬─┘
+        q_1: ────────────┼──
+                  ┌───┐  │
+        q_2: ─────┤ X ├──■──
+             ┌───┐└─┬─┘
+        q_3: ┤ X ├──■───────
+             └─┬─┘
+        q_4: ──■────────────
 
-        Args:
-            pauli: The Pauli for which to construct the CX chain.
+    Args:
+        pauli: The Pauli for which to construct the CX chain.
 
-        Returns:
-            A gate list implementing the CX chain.
-        """
+    Returns:
+        A gate list implementing the CX chain.
+    """
 
     gates = []
     control, target = None, None
@@ -171,25 +171,25 @@ def cnot_chain(pauli: str):
 def cnot_fountain(pauli: str):
     """CX chain in the fountain shape.
 
-        For example, for the Pauli with the label 'XYZIX'.
+    For example, for the Pauli with the label 'XYZIX'.
 
-                 ┌───┐┌───┐┌───┐
-            q_0: ┤ X ├┤ X ├┤ X ├
-                 └─┬─┘└─┬─┘└─┬─┘
-            q_1: ──┼────┼────┼──
-                   │    │    │
-            q_2: ──■────┼────┼──
-                        │    │
-            q_3: ───────■────┼──
-                             │
-            q_4: ────────────■──
+             ┌───┐┌───┐┌───┐
+        q_0: ┤ X ├┤ X ├┤ X ├
+             └─┬─┘└─┬─┘└─┬─┘
+        q_1: ──┼────┼────┼──
+               │    │    │
+        q_2: ──■────┼────┼──
+                    │    │
+        q_3: ───────■────┼──
+                         │
+        q_4: ────────────■──
 
-        Args:
-            pauli: The Pauli for which to construct the CX chain.
+    Args:
+        pauli: The Pauli for which to construct the CX chain.
 
-        Returns:
-            A gate list implementing the CX chain.
-        """
+    Returns:
+        A gate list implementing the CX chain.
+    """
 
     gates = []
     control, target = None, None
diff --git a/quafu/tasks/task_database.py b/quafu/tasks/task_database.py
index ffdfa43..d9b9981 100644
--- a/quafu/tasks/task_database.py
+++ b/quafu/tasks/task_database.py
@@ -50,7 +50,7 @@ class QuafuTaskDatabase:
     This way ensures the database connection is closed and submission committed automatically.
     """
 
-    def __init__(self, db_dir='./'):
+    def __init__(self, db_dir="./"):
         self.database_name = "tasks.db"
         self.database_dir = Path(db_dir)
         self.conn = None
@@ -69,7 +69,8 @@ def __exit__(self, exc_type, exc_value, traceback):
 
     def _create_table(self):
         cursor = self.conn.cursor()
-        cursor.execute('''
+        cursor.execute(
+            """
             CREATE TABLE IF NOT EXISTS tasks (
                 task_id TEXT PRIMARY KEY,
                 group_name TEXT DEFAULT NULL,
@@ -79,26 +80,29 @@ def _create_table(self):
                 send_time TIMESTAMP,
                 finish_time TIMESTAMP DEFAULT NULL
             )
-        ''')
+        """
+        )
         cursor.close()
 
     # region data manipulation
-    def insert_task(self,
-                    task_id,
-                    status,
-                    send_time: str = None,
-                    priority=2,
-                    group_name=None,
-                    task_name=None,
-                    finish_time: str = None
-                    ):
+    def insert_task(
+        self,
+        task_id,
+        status,
+        send_time: str = None,
+        priority=2,
+        group_name=None,
+        task_name=None,
+        finish_time: str = None,
+    ):
         cursor = self.conn.cursor()
         cursor.execute(
             "INSERT INTO tasks "
             "(task_id, group_name, task_name, status, priority, send_time, finish_time) "
             "VALUES "
             "(?, ?, ?, ?, ?, ?, ?)",
-            (task_id, group_name, task_name, status, priority, send_time, finish_time))
+            (task_id, group_name, task_name, status, priority, send_time, finish_time),
+        )
         cursor.close()
 
     def delete_task(self, task_id):
@@ -111,6 +115,7 @@ def update_task_status(self, task_id, status):
         cursor = self.conn.cursor()
         cursor.execute("UPDATE tasks SET status=? WHERE task_id=?", (status, task_id))
         cursor.close()
+
     # endregion
 
     # region fetch tasks
@@ -150,7 +155,7 @@ def find_by_name(self, task_name):
         if task_name is None:
             cursor.execute("SELECT * FROM tasks WHERE task_name IS NULL")
         else:
-            cursor.execute("SELECT * FROM tasks WHERE task_name=?", (task_name, ))
+            cursor.execute("SELECT * FROM tasks WHERE task_name=?", (task_name,))
         tasks = cursor.fetchall()
         cursor.close()
         return tasks
@@ -160,8 +165,12 @@ def find_by_time(self, start_time, end_time):
         get tasks sent between start_time and end_time.
         """
         cursor = self.conn.cursor()
-        cursor.execute("SELECT * FROM tasks WHERE send_time BETWEEN ? AND ?", (start_time, end_time))
+        cursor.execute(
+            "SELECT * FROM tasks WHERE send_time BETWEEN ? AND ?",
+            (start_time, end_time),
+        )
         tasks = cursor.fetchall()
         cursor.close()
         return tasks
+
     # endregion
diff --git a/quafu/tasks/tasks.py b/quafu/tasks/tasks.py
index 39cd697..2f7ea9a 100644
--- a/quafu/tasks/tasks.py
+++ b/quafu/tasks/tasks.py
@@ -19,10 +19,10 @@
 
 import numpy as np
 import requests
-
 from quafu.circuits.quantum_circuit import QuantumCircuit
 from quafu.users.userapi import User
-from ..exceptions import CircuitError, ServerError, CompileError
+
+from ..exceptions import CircuitError, CompileError, ServerError
 from ..results.results import ExecResult, merge_measure
 from ..users.exceptions import UserError
 
@@ -158,9 +158,7 @@ def submit(
 
         return exec_res, measure_results
 
-    def run(self,
-            qc: QuantumCircuit,
-            measure_base: List = None) -> ExecResult:
+    def run(self, qc: QuantumCircuit, measure_base: List = None) -> ExecResult:
         """Single run for measurement task.
 
         Args:
@@ -183,12 +181,9 @@ def run(self,
 
         return res
 
-    def send(self,
-             qc: QuantumCircuit,
-             name: str = "",
-             group: str = "",
-             wait: bool = True
-             ) -> ExecResult:
+    def send(
+        self, qc: QuantumCircuit, name: str = "", group: str = "", wait: bool = True
+    ) -> ExecResult:
         """
         Run the circuit on experimental device.
 
@@ -245,12 +240,14 @@ def send(self,
         if not response.ok:
             logging.warning("Received a non-200 response from the server.\n")
         if response.status_code == 502:
-            logging.critical("Received a 502 Bad Gateway response. Please try again later.\n"
-                             "If there is persistent failure, please report it on our github page.")
-            raise UserError('502 Bad Gateway response')
+            logging.critical(
+                "Received a 502 Bad Gateway response. Please try again later.\n"
+                "If there is persistent failure, please report it on our github page."
+            )
+            raise UserError("502 Bad Gateway response")
         else:
             res_dict = response.json()  # type: dict
-            quafu_status = res_dict['status']
+            quafu_status = res_dict["status"]
             if quafu_status in [201, 205]:
                 raise UserError(res_dict["message"])
             elif quafu_status == 5001:
@@ -279,16 +276,18 @@ def retrieve(self, taskid: str) -> ExecResult:
         data = {"task_id": taskid}
         url = User.url + User.exec_recall_api
 
-        headers = {'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', 'api_token': self.user.api_token}
+        headers = {
+            "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
+            "api_token": self.user.api_token,
+        }
         response = requests.post(url, headers=headers, data=data)
 
         res_dict = response.json()
         return ExecResult(res_dict)
 
-    def retrieve_group(self,
-                       group: str,
-                       history: Dict = None,
-                       verbose: bool = True) -> List[ExecResult]:
+    def retrieve_group(
+        self, group: str, history: Dict = None, verbose: bool = True
+    ) -> List[ExecResult]:
         """
         Retrieve the results of submited task by group name.
 
diff --git a/quafu/users/userapi.py b/quafu/users/userapi.py
index 6596288..e3a7af4 100644
--- a/quafu/users/userapi.py
+++ b/quafu/users/userapi.py
@@ -17,8 +17,8 @@
 
 import requests
 
-from .exceptions import UserError, APITokenNotFound
 from ..utils.platform import get_homedir
+from .exceptions import APITokenNotFound, UserError
 
 
 class User(object):
@@ -29,7 +29,9 @@ class User(object):
     exec_async_api = "qbackend/scq_kit_asyc/"
     exec_recall_api = "qbackend/scq_task_recall/"
 
-    def __init__(self, api_token: Optional[str] = None, token_dir: Optional[str] = None):
+    def __init__(
+        self, api_token: Optional[str] = None, token_dir: Optional[str] = None
+    ):
         """
         Initialize user account and load backend information.
 
@@ -103,7 +105,7 @@ def _get_backends_info(self):
         url = self.url + self.backends_api
         response = requests.post(url=url, headers=headers)
         backends_info = response.json()
-        if backends_info['status'] == 201:
+        if backends_info["status"] == 201:
             raise UserError(backends_info["message"])
         else:
             return backends_info["data"]
diff --git a/quafu/utils/paulis.py b/quafu/utils/paulis.py
index f5bb599..627d523 100644
--- a/quafu/utils/paulis.py
+++ b/quafu/utils/paulis.py
@@ -1,5 +1,6 @@
-import numpy as np
 from functools import reduce
+
+import numpy as np
 import sparse
 
 si = sparse.COO(np.array([[1.0, 0.0], [0.0, 1.0]], dtype=complex))
diff --git a/quafu/utils/platform.py b/quafu/utils/platform.py
index 851c4f6..06bae00 100644
--- a/quafu/utils/platform.py
+++ b/quafu/utils/platform.py
@@ -1,5 +1,6 @@
 import os
 import sys
+
 from ..exceptions import QuafuError
 
 
diff --git a/quafu/visualisation/bloch_sphere.py b/quafu/visualisation/bloch_sphere.py
index c72676f..e19cc1a 100644
--- a/quafu/visualisation/bloch_sphere.py
+++ b/quafu/visualisation/bloch_sphere.py
@@ -28,7 +28,7 @@ def xyz_to_angles(xs, ys, zs):
 
 def hex_to_rgb(hex_color):
     """Transform a hex color code to RGB (normalized float)."""
-    hex_color = hex_color.lstrip('#')
+    hex_color = hex_color.lstrip("#")
     if len(hex_color) != 6:
         raise ValueError("Invalid hex color code")
 
@@ -52,7 +52,7 @@ def plot_bloch_vector(v_x, v_y, v_z, title=""):
         ax: matplotlib axes of the Bloch sphere plot.
     """
     fig = plt.figure()
-    ax = fig.add_subplot(111, projection='3d')
+    ax = fig.add_subplot(111, projection="3d")
 
     # surface of Bloch sphere
     theta = np.linspace(0, np.pi, 21)
@@ -60,8 +60,8 @@ def plot_bloch_vector(v_x, v_y, v_z, title=""):
     theta, phi = np.meshgrid(theta, phi)
     x, y, z = angles_to_xyz(theta, phi)
 
-    surf = ax.plot_surface(x, y, z, color='white', alpha=0.2)
-    edge_color = hex_to_rgb('#000000')  # #ff7f0e
+    surf = ax.plot_surface(x, y, z, color="white", alpha=0.2)
+    edge_color = hex_to_rgb("#000000")  # #ff7f0e
     edge_alpha = 0.05
     surf.set_edgecolor((edge_color[0], edge_color[1], edge_color[2], edge_alpha))
 
@@ -72,16 +72,23 @@ def plot_bloch_vector(v_x, v_y, v_z, title=""):
     ax.plot(0 * span, span, zs=0, zdir="y", label="Z", lw=1, color="black", alpha=0.5)
 
     # coordinate values
-    ax.text(1.4, 0, 0, 'x', color='black')
-    ax.text(0, 1.2, 0, 'y', color='black')
-    ax.text(0, 0, 1.2, 'z', color='black')
+    ax.text(1.4, 0, 0, "x", color="black")
+    ax.text(0, 1.2, 0, "y", color="black")
+    ax.text(0, 0, 1.2, "z", color="black")
 
     # Bloch vector
-    ax.quiver(0, 0, 0, v_x, v_y, v_z, color='r')
+    ax.quiver(0, 0, 0, v_x, v_y, v_z, color="r")
     v_theta, v_phi = xyz_to_angles(v_x, v_y, v_z)
 
     # coordinates value text
-    ax.text(0, 0, 1.6, 'Bloch vector: ($\\theta=${:.2f}, $\\varphi$={:.2f})'.format(v_theta, v_phi), fontsize=8, color='red')
+    ax.text(
+        0,
+        0,
+        1.6,
+        "Bloch vector: ($\\theta=${:.2f}, $\\varphi$={:.2f})".format(v_theta, v_phi),
+        fontsize=8,
+        color="red",
+    )
     # ax.text(0, 0, 1.6, 'Bloch vector: ({:.2f}, {:.2f}, {:.2f})'.format(v_x, v_y, v_z), fontsize=8, color='red')
 
     # Set the range of the axes
diff --git a/quafu/visualisation/circuitPlot.py b/quafu/visualisation/circuitPlot.py
index dcc5eac..296ec12 100644
--- a/quafu/visualisation/circuitPlot.py
+++ b/quafu/visualisation/circuitPlot.py
@@ -12,30 +12,34 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import Dict
+
 import matplotlib.patheffects as pe
 import matplotlib.pyplot as plt
 import numpy as np
-from matplotlib.collections import PolyCollection, PatchCollection, LineCollection
-from matplotlib.patches import Circle, Arc
+from matplotlib.collections import (
+    LineCollection,
+    PatchCollection,
+    PathCollection,
+    PolyCollection,
+)
+from matplotlib.patches import Arc, Circle
+from matplotlib.path import Path
 from matplotlib.text import Text
-from quafu.elements import Instruction, ControlledGate
-from typing import Dict
+from quafu.elements import ControlledGate, Instruction
 
-from matplotlib.path import Path
-from matplotlib.collections import PathCollection
 # this line for developers only
 # from quafu.circuits.quantum_circuit import QuantumCircuit
 
 line_args = {}
 box_args = {}
 
-DEEPCOLOR = '#0C161F'
-BLUE = '#1f77b4'
-ORANGE = '#ff7f0e'
-GREEN = '#2ca02c'
-GOLDEN = '#FFB240'
-GARNET = '#C0392B'
-
+DEEPCOLOR = "#0C161F"
+BLUE = "#1f77b4"
+ORANGE = "#ff7f0e"
+GREEN = "#2ca02c"
+GOLDEN = "#FFB240"
+GARNET = "#C0392B"
 """
 layers(zorder):
 
@@ -47,18 +51,35 @@
 5: labels
 """
 
-su2_gate_names = ['x', 'y', 'z', 'id', 'w',
-                  'h', 't', 'tdg', 's', 'sdg', 'sx', 'sy', 'sw', 'sxdg', 'sydg', 'swdg',
-                  'p',
-                  'rx', 'ry', 'rz',
-                  ]
-
-swap_gate_names = ['swap', 'iswap']
-r2_gate_names = ['rxx', 'ryy', 'rzz']
-c2_gate_names = ['cp', 'cs', 'ct', 'cx', 'cy', 'cz']
-c3_gate_names = ['fredkin', 'toffoli']
-mc_gate_names = ['mcx', 'mcy', 'mcz']
-operation_names = ['barrier', 'delay']
+su2_gate_names = [
+    "x",
+    "y",
+    "z",
+    "id",
+    "w",
+    "h",
+    "t",
+    "tdg",
+    "s",
+    "sdg",
+    "sx",
+    "sy",
+    "sw",
+    "sxdg",
+    "sydg",
+    "swdg",
+    "p",
+    "rx",
+    "ry",
+    "rz",
+]
+
+swap_gate_names = ["swap", "iswap"]
+r2_gate_names = ["rxx", "ryy", "rzz"]
+c2_gate_names = ["cp", "cs", "ct", "cx", "cy", "cz"]
+c3_gate_names = ["fredkin", "toffoli"]
+mc_gate_names = ["mcx", "mcy", "mcz"]
+operation_names = ["barrier", "delay"]
 
 
 class CircuitPlotManager:
@@ -68,9 +89,10 @@ class CircuitPlotManager:
 
     To be initialized when circuit.plot() is called.
     """
+
     # colors
-    _wire_color = '#FF0000'
-    _light_blue = '#3B82F6'
+    _wire_color = "#FF0000"
+    _light_blue = "#3B82F6"
     _ec = DEEPCOLOR
 
     _wire_lw = 1.5
@@ -79,7 +101,7 @@ class CircuitPlotManager:
     _a = 0.5  # box width and height, unit: ax
     _barrier_width = _a / 3  # barrier width
 
-    _stroke = pe.withStroke(linewidth=2, foreground='white')
+    _stroke = pe.withStroke(linewidth=2, foreground="white")
 
     def __init__(self, qc):
         """
@@ -127,21 +149,25 @@ def __init__(self, qc):
             self._proc_measure(self.depth - 1, q)
 
         # step2: initialize bit-label
-        self.q_label = {y: r'$|q_{%d}\rangle$' % i for i, y in self.used_qbit_y.items()}
-        self.c_label = {self.used_qbit_y[iq]: r'c_{%d}' % ic for iq, ic in qc.measures.items()}
+        self.q_label = {y: r"$|q_{%d}\rangle$" % i for i, y in self.used_qbit_y.items()}
+        self.c_label = {
+            self.used_qbit_y[iq]: r"c_{%d}" % ic for iq, ic in qc.measures.items()
+        }
 
         # step3: figure coordination
         self.xs = np.arange(-3 / 2, self.depth + 3 / 2)
         self.ys = np.arange(-2, self.used_qbit_num + 1 / 2)
 
-    def __call__(self,
-                 title=None,
-                 init_labels=None,
-                 end_labels=None,
-                 save_path: str = None,
-                 show: bool = False,
-                 *args,
-                 **kwargs):
+    def __call__(
+        self,
+        title=None,
+        init_labels=None,
+        end_labels=None,
+        save_path: str = None,
+        show: bool = False,
+        *args,
+        **kwargs,
+    ):
         """
         :param title
         :param init_labels: dict, {qbit: label}
@@ -158,22 +184,27 @@ def __call__(self,
         #     import random
         #     plt.gca().xkcd(randomness=random.randint(0, 1000))
         if title is not None:
-            title = Text((self.xs[0] + self.xs[-1]) / 2, -0.8,
-                         title,
-                         size=30,
-                         ha='center', va='baseline')
+            title = Text(
+                (self.xs[0] + self.xs[-1]) / 2,
+                -0.8,
+                title,
+                size=30,
+                ha="center",
+                va="baseline",
+            )
             self._text_list.append(title)
 
         # initialize a figure
         _size_x = self._a_inch * abs(self.xs[-1] - self.xs[0])
         _size_y = self._a_inch * abs(self.ys[-1] - self.ys[0])
         fig = plt.figure(figsize=(_size_x, _size_y))  # inch
-        ax = fig.add_axes([0, 0, 1, 1],
-                          aspect=1,
-                          xlim=[self.xs[0], self.xs[-1]],
-                          ylim=[self.ys[0], self.ys[-1]],
-                          )
-        ax.axis('off')
+        ax = fig.add_axes(
+            [0, 0, 1, 1],
+            aspect=1,
+            xlim=[self.xs[0], self.xs[-1]],
+            ylim=[self.ys[0], self.ys[-1]],
+        )
+        ax.axis("off")
         ax.invert_yaxis()
 
         self._circuit_wires()
@@ -182,26 +213,28 @@ def __call__(self,
         self._render_circuit()
 
         if save_path is not None:
-            plt.savefig(save_path, dpi=300, bbox_inches='tight')
+            plt.savefig(save_path, dpi=300, bbox_inches="tight")
         if show:
             plt.show()
 
     def _process_ins(self, ins: Instruction, append: bool = True):
         name = ins.name.lower()
-        assert name in Instruction.ins_classes, 'Name: %s not registered, if this should occur, please report a bug.' % name
+        assert name in Instruction.ins_classes, (
+            "Name: %s not registered, if this should occur, please report a bug." % name
+        )
 
         _which = slice(np.min(ins.pos), np.max(ins.pos) + 1)
         depth = np.max(self.dorders[_which])
         paras = ins.paras
 
-        if name == 'barrier':
+        if name == "barrier":
             self._proc_barrier(depth, ins.pos)
-        elif name == 'measure':
+        elif name == "measure":
             self._proc_measure(depth, ins.pos)
         elif name in su2_gate_names:
             self._proc_su2(name, depth, ins.pos, paras)
         elif name in swap_gate_names:
-            self._proc_swap(depth, ins.pos, name == 'iswap')
+            self._proc_swap(depth, ins.pos, name == "iswap")
         elif name in r2_gate_names:
             # TODO: combine into one box
             self._ctrl_wire_points.append([[depth, ins.pos[0]], [depth, ins.pos[1]]])
@@ -210,11 +243,13 @@ def _process_ins(self, ins: Instruction, append: bool = True):
             self._proc_su2(name[:-1], depth, max(ins.pos), paras)
         elif isinstance(ins, ControlledGate):
             self._proc_ctrl(depth, ins)
-        elif name == 'delay':
+        elif name == "delay":
             self._delay(depth, ins.pos, ins.duration, ins.unit)
         else:
-            raise NotImplementedError(f'Gate {name} is not supported yet.\n'
-                                      f'If this should occur, please report a bug.')
+            raise NotImplementedError(
+                f"Gate {name} is not supported yet.\n"
+                f"If this should occur, please report a bug."
+            )
         if append:
             self.dorders[_which] = depth + 1
 
@@ -233,137 +268,155 @@ def _circuit_wires(self):
             self._h_wire_points.append([[x0, y], [x1, y]])
 
     def _inits_label(self, labels: Dict[int, str] = None):
-        """ qubit-labeling """
+        """qubit-labeling"""
         if labels is None:
             labels = self.q_label
 
         for i, label in labels.items():
-            txt = Text(-2 / 3, i,
-                       label,
-                       size=18,
-                       color=DEEPCOLOR,
-                       ha='right',
-                       va='center',
-                       )
+            txt = Text(
+                -2 / 3,
+                i,
+                label,
+                size=18,
+                color=DEEPCOLOR,
+                ha="right",
+                va="center",
+            )
             self._text_list.append(txt)
 
     def _measured_label(self, labels: Dict[int, str] = None):
-        """ measured qubit-labeling """
+        """measured qubit-labeling"""
         if labels is None:
             labels = self.c_label
 
         for i, label in labels.items():
-            label = r'$%s$' % label
-            txt = Text(self.xs[-1] - 3 / 4, i,
-                       label,
-                       size=18,
-                       color=DEEPCOLOR,
-                       ha='left',
-                       va='center',
-                       )
+            label = r"$%s$" % label
+            txt = Text(
+                self.xs[-1] - 3 / 4,
+                i,
+                label,
+                size=18,
+                color=DEEPCOLOR,
+                ha="left",
+                va="center",
+            )
             self._text_list.append(txt)
 
     def _gate_bbox(self, x, y, fc: str):
-        """ Single qubit gate box """
+        """Single qubit gate box"""
         a = self._a
         from matplotlib.patches import FancyBboxPatch
-        bbox = FancyBboxPatch((-a / 2 + x, -a / 2 + y), a, a,  # this warning belongs to matplotlib
-                              boxstyle=f'round, pad={0.2 * a}',
-                              edgecolor=DEEPCOLOR,
-                              facecolor=fc,
-                              )
+
+        bbox = FancyBboxPatch(
+            (-a / 2 + x, -a / 2 + y),
+            a,
+            a,  # this warning belongs to matplotlib
+            boxstyle=f"round, pad={0.2 * a}",
+            edgecolor=DEEPCOLOR,
+            facecolor=fc,
+        )
         self._closed_patches.append(bbox)
 
     def _gate_label(self, x, y, s):
         if not s:
             return None
         _dy = 0.05
-        text = Text(x, y + _dy,
-                    s,
-                    size=24,
-                    color=DEEPCOLOR,
-                    ha='center',
-                    va='center',
-                    )
+        text = Text(
+            x,
+            y + _dy,
+            s,
+            size=24,
+            color=DEEPCOLOR,
+            ha="center",
+            va="center",
+        )
         text.set_path_effects([self._stroke])
         self._text_list.append(text)
 
     def _para_label(self, x, y, para_txt):
-        """ label parameters """
+        """label parameters"""
         if not para_txt:
             return None
         _dx = 0
-        text = Text(x + _dx, y + 0.8 * self._a,
-                    para_txt,
-                    size=12,
-                    color=DEEPCOLOR,
-                    ha='center',
-                    va='top',
-                    )
+        text = Text(
+            x + _dx,
+            y + 0.8 * self._a,
+            para_txt,
+            size=12,
+            color=DEEPCOLOR,
+            ha="center",
+            va="top",
+        )
         self._text_list.append(text)
 
     def _measure_label(self, x, y):
         from matplotlib.patches import FancyArrow
+
         a = self._a
         r = 1.1 * a
         d = 1.2 * a / 3.5
 
-        arrow = FancyArrow(x=x,
-                           y=y + d,
-                           dx=0.15,
-                           dy=-0.35,
-                           width=0.04,
-                           facecolor=DEEPCOLOR,
-                           head_width=0.07,
-                           head_length=0.15,
-                           edgecolor='white')
-        arc = Arc((x, y + d),
-                  width=r,
-                  height=r,
-                  lw=1,
-                  theta1=180,
-                  theta2=0,
-                  fill=False,
-                  zorder=4,
-                  color=DEEPCOLOR,
-                  capstyle='round',
-                  )
-        center_bkg = Circle((x, y + d),
-                            radius=0.035,
-                            color='white',
-                            )
-        center = Circle((x, y + d),
-                        radius=0.025,
-                        facecolor=DEEPCOLOR,
-                        )
+        arrow = FancyArrow(
+            x=x,
+            y=y + d,
+            dx=0.15,
+            dy=-0.35,
+            width=0.04,
+            facecolor=DEEPCOLOR,
+            head_width=0.07,
+            head_length=0.15,
+            edgecolor="white",
+        )
+        arc = Arc(
+            (x, y + d),
+            width=r,
+            height=r,
+            lw=1,
+            theta1=180,
+            theta2=0,
+            fill=False,
+            zorder=4,
+            color=DEEPCOLOR,
+            capstyle="round",
+        )
+        center_bkg = Circle(
+            (x, y + d),
+            radius=0.035,
+            color="white",
+        )
+        center = Circle(
+            (x, y + d),
+            radius=0.025,
+            facecolor=DEEPCOLOR,
+        )
         self._mea_arc_patches.append(arc)
         self._mea_point_patches += [center_bkg, arrow, center]
 
     #########################################################################
     # region # # # # processing-functions: decompose ins into graphical elements # # #
     def _proc_su2(self, id_name, depth, pos, paras):
-        if id_name in ['x', 'y', 'z', 'h', 'id', 's', 't', 'p', 'w']:
-            fc = '#EE7057'
+        if id_name in ["x", "y", "z", "h", "id", "s", "t", "p", "w"]:
+            fc = "#EE7057"
             label = id_name.capitalize()[0]
-        elif id_name in ['sw', 'swdg', 'sx', 'sxdg', 'sy', 'sydg']:
-            fc = '#EE7057'
-            if id_name[-2:] == 'dg':
-                label = r'$\sqrt{%s}^\dagger$' % id_name[1]
+        elif id_name in ["sw", "swdg", "sx", "sxdg", "sy", "sydg"]:
+            fc = "#EE7057"
+            if id_name[-2:] == "dg":
+                label = r"$\sqrt{%s}^\dagger$" % id_name[1]
             else:
-                label = r'$\sqrt{%s}$' % id_name[1]
-        elif id_name in ['sdg', 'tdg']:
-            fc = '#EE7057'
-            label = id_name[0] + r'$^\dagger$'
-        elif id_name in ['rx', 'ry', 'rz']:
-            fc = '#6366F1'
+                label = r"$\sqrt{%s}$" % id_name[1]
+        elif id_name in ["sdg", "tdg"]:
+            fc = "#EE7057"
+            label = id_name[0] + r"$^\dagger$"
+        elif id_name in ["rx", "ry", "rz"]:
+            fc = "#6366F1"
             label = id_name.upper()
         else:
-            fc = '#8C9197'
-            label = '?'
+            fc = "#8C9197"
+            label = "?"
 
-        if id_name in ['rx', 'ry', 'rz', 'p']:
+        if id_name in ["rx", "ry", "rz", "p"]:
             # too long to display: r'$\theta=$' + f'{paras:.3f}' (TODO)
-            para_txt = f'({paras:.3f})' if paras else None
+            para_txt = f"({paras:.3f})" if paras else None
         else:
             para_txt = None
 
@@ -376,7 +429,7 @@ def _proc_su2(self, id_name, depth, pos, paras):
     def _delay(self, depth, pos, paras, unit):
         fc = BLUE
 
-        para_txt = '%d%s' % (paras, unit)
+        para_txt = "%d%s" % (paras, unit)
 
         x = depth
         y = self.used_qbit_y[pos]
@@ -403,17 +456,20 @@ def _proc_ctrl(self, depth, ins: ControlledGate, ctrl_type: bool = True):
             tar_name = ins.targ_name.lower()[-1]
             pos = ins.targs if isinstance(ins.targs, int) else ins.targs[0]
             x = self.used_qbit_y[pos]
-            if tar_name == 'x':
+            if tar_name == "x":
                 self._not_points.append((depth, x))
             else:
                 self._proc_su2(tar_name, depth, pos, ins.paras)
-        elif name == 'cswap':
+        elif name == "cswap":
             self._swap_points += [[depth, self.used_qbit_y[p]] for p in ins.targs]
-        elif name == 'ccx':
+        elif name == "ccx":
             self._not_points.append((depth, self.used_qbit_y[ins.targs[0]]))
         else:
             from quafu.elements.element_gates import ControlledU
-            assert isinstance(ins, ControlledU), f'unknown gate: {name}, {ins.__class__.__name__}'
+
+            assert isinstance(
+                ins, ControlledU
+            ), f"unknown gate: {name}, {ins.__class__.__name__}"
             self._process_ins(ins, append=False)
 
     def _proc_swap(self, depth, pos, iswap: bool = False):
@@ -431,8 +487,8 @@ def _proc_barrier(self, depth, pos: list):
 
         for p in pos:
             y = self.used_qbit_y[p]
-            y0 = (y - 1 / 2)
-            y1 = (y + 1 / 2)
+            y0 = y - 1 / 2
+            y1 = y + 1 / 2
             nodes = [[x0, y0], [x0, y1], [x1, y1], [x1, y0], [x0, y0]]
             self._barrier_points.append(nodes)
 
@@ -448,6 +504,7 @@ def _proc_measure(self, depth, pos: int):
         # x0 = depth
         # x1 = self.depth - 1 / 2
         # self._h_wire_points.append([[x0, y], [x1, y]])
+
     # endregion
     #########################################################################
 
@@ -455,45 +512,49 @@ def _proc_measure(self, depth, pos: int):
     # # # # # # # # # # # # # # rendering functions # # # # # # # # # # # # #
     #########################################################################
     def _render_h_wires(self):
-        h_lines = LineCollection(self._h_wire_points,
-                                 zorder=0,
-                                 colors=self._wire_color,
-                                 alpha=0.8,
-                                 linewidths=2,
-                                 )
+        h_lines = LineCollection(
+            self._h_wire_points,
+            zorder=0,
+            colors=self._wire_color,
+            alpha=0.8,
+            linewidths=2,
+        )
         plt.gca().add_collection(h_lines)
 
     def _render_ctrl_wires(self):
-        v_lines = LineCollection(self._ctrl_wire_points,
-                                 zorder=0,
-                                 colors=self._light_blue,
-                                 alpha=0.8,
-                                 linewidths=4,
-                                 )
+        v_lines = LineCollection(
+            self._ctrl_wire_points,
+            zorder=0,
+            colors=self._light_blue,
+            alpha=0.8,
+            linewidths=4,
+        )
         plt.gca().add_collection(v_lines)
 
     def _render_closed_patch(self):
-        collection = PatchCollection(self._closed_patches,
-                                     match_original=True,
-                                     zorder=3,
-                                     ec=self._ec,
-                                     linewidths=0.5,
-                                     )
+        collection = PatchCollection(
+            self._closed_patches,
+            match_original=True,
+            zorder=3,
+            ec=self._ec,
+            linewidths=0.5,
+        )
         plt.gca().add_collection(collection)
 
     def _render_ctrl_nodes(self):
         circle_collection = []
         r = self._a / 4
         for x, y, ctrl in self._ctrl_points:
-            fc = '#3B82F6' if ctrl else 'white'
+            fc = "#3B82F6" if ctrl else "white"
             circle = Circle((x, y), radius=r, fc=fc)
             circle_collection.append(circle)
-        circles = PatchCollection(circle_collection,
-                                  match_original=True,
-                                  zorder=5,
-                                  ec=self._ec,
-                                  linewidths=2,
-                                  )
+        circles = PatchCollection(
+            circle_collection,
+            match_original=True,
+            zorder=5,
+            ec=self._ec,
+            linewidths=2,
+        )
         plt.gca().add_collection(circles)
 
     def _render_not_nodes(self):
@@ -504,16 +565,16 @@ def _render_not_nodes(self):
         for x, y in self._not_points:
             points.append([[x, y - rp], [x, y + rp]])
             points.append([[x - rp, y], [x + rp, y]])
-            circle = Circle((x, y), radius=r, lw=1,
-                            fc='#3B82F6')
+            circle = Circle((x, y), radius=r, lw=1, fc="#3B82F6")
             self._closed_patches.append(circle)
 
-        collection = LineCollection(points,
-                                    zorder=5,
-                                    colors='white',
-                                    linewidths=2,
-                                    capstyle='round',
-                                    )
+        collection = LineCollection(
+            points,
+            zorder=5,
+            colors="white",
+            linewidths=2,
+            capstyle="round",
+        )
         plt.gca().add_collection(collection)
 
     def _render_swap_nodes(self):
@@ -522,49 +583,50 @@ def _render_swap_nodes(self):
         for x, y in self._swap_points:
             points.append([[x - r, y - r], [x + r, y + r]])
             points.append([[x + r, y - r], [x - r, y + r]])
-        collection = LineCollection(points,
-                                    zorder=5,
-                                    colors='#3B82F6',
-                                    linewidths=4,
-                                    capstyle='round',
-                                    )
+        collection = LineCollection(
+            points,
+            zorder=5,
+            colors="#3B82F6",
+            linewidths=4,
+            capstyle="round",
+        )
         plt.gca().add_collection(collection)
 
         # iswap-cirlces
         i_circles = []
         for x, y in self._iswap_points:
-            circle = Circle((x, y), radius=2 ** (1 / 2) * r, lw=3,
-                            ec='#3B82F6', fill=False)
+            circle = Circle(
+                (x, y), radius=2 ** (1 / 2) * r, lw=3, ec="#3B82F6", fill=False
+            )
             i_circles.append(circle)
-        collection = PatchCollection(i_circles,
-                                     match_original=True,
-                                     zorder=5,
-                                     )
+        collection = PatchCollection(
+            i_circles,
+            match_original=True,
+            zorder=5,
+        )
         plt.gca().add_collection(collection)
 
     def _render_measure(self):
-        stroke = pe.withStroke(linewidth=4, foreground='white')
-        arcs = PatchCollection(self._mea_arc_patches,
-                               match_original=True,
-                               capstyle='round',
-                               zorder=4)
+        stroke = pe.withStroke(linewidth=4, foreground="white")
+        arcs = PatchCollection(
+            self._mea_arc_patches, match_original=True, capstyle="round", zorder=4
+        )
         arcs.set_path_effects([stroke])
 
         plt.gca().add_collection(arcs)
-        pointers = PatchCollection(self._mea_point_patches,  # note the order
-                                   match_original=True,
-                                   zorder=5,
-                                   facecolors=DEEPCOLOR,
-                                   linewidths=2,
-                                   )
+        pointers = PatchCollection(
+            self._mea_point_patches,  # note the order
+            match_original=True,
+            zorder=5,
+            facecolors=DEEPCOLOR,
+            linewidths=2,
+        )
         plt.gca().add_collection(pointers)
 
     def _render_barrier(self):
-        barrier = PolyCollection(self._barrier_points,
-                                 closed=True,
-                                 fc='lightgray',
-                                 hatch='///',
-                                 zorder=4)
+        barrier = PolyCollection(
+            self._barrier_points, closed=True, fc="lightgray", hatch="///", zorder=4
+        )
         plt.gca().add_collection(barrier)
 
     def _render_txt(self):
@@ -572,11 +634,13 @@ def _render_txt(self):
             plt.gca().add_artist(txt)
 
     def _render_white_path(self):
-        path_collection = PathCollection([Path(points) for points in self._white_path_points],
-                                         facecolor='none',
-                                         edgecolor='white',
-                                         zorder=4,
-                                         linewidth=2)
+        path_collection = PathCollection(
+            [Path(points) for points in self._white_path_points],
+            facecolor="none",
+            edgecolor="white",
+            zorder=4,
+            linewidth=2,
+        )
         plt.gca().add_collection(path_collection)
 
     def _render_circuit(self):
@@ -591,4 +655,3 @@ def _render_circuit(self):
         self._render_closed_patch()
         self._render_white_path()
         self._render_txt()
-
diff --git a/quafu/visualisation/draw_dag.py b/quafu/visualisation/draw_dag.py
index bf1e270..df58692 100644
--- a/quafu/visualisation/draw_dag.py
+++ b/quafu/visualisation/draw_dag.py
@@ -1,10 +1,10 @@
-import graphviz
+from typing import Any, Union
 
-from quafu import QuantumCircuit
+import graphviz
 from quafu.dagcircuits.circuit_dag import circuit_to_dag
 from quafu.dagcircuits.instruction_node import InstructionNode
 
-from typing import Union, Any
+from quafu import QuantumCircuit
 
 
 def _extract_node_info(node):
@@ -16,10 +16,12 @@ def _extract_node_info(node):
     return name, label
 
 
-def draw_dag(qc: Union[QuantumCircuit, None],
-             dag: Any = None,
-             output_format: str = 'pdf',
-             output_filename: str = 'DAG'):
+def draw_dag(
+    qc: Union[QuantumCircuit, None],
+    dag: Any = None,
+    output_format: str = "pdf",
+    output_filename: str = "DAG",
+):
     """
     TODO: complete docstring, test supports for notebook
 
@@ -48,7 +50,7 @@ def draw_dag(qc: Union[QuantumCircuit, None],
         node1, node2, link = edge
         name1, label1 = _extract_node_info(node1)
         name2, label2 = _extract_node_info(node2)
-        dot.edge(name1, name2, label=link['label'])
+        dot.edge(name1, name2, label=link["label"])
 
     dot.render(format=output_format, cleanup=True)
     return dot
diff --git a/requirements.txt b/requirements.txt
index ebb3c68..72c69e3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,12 +1,12 @@
+graphviz>=0.14.2
 ipython>=8.14.0
 matplotlib>=3.5.2
 networkx>=2.6.3
 numpy>=1.20.3
+ply~=3.11
+pybind11>=2.10.3
 requests>=2.26.0
+scikit-build>=0.16.1
 scipy>=1.8.1
 setuptools>=58.0.4
 sparse>=0.13.0
-scikit-build>=0.16.1
-pybind11>=2.10.3
-graphviz>=0.14.2
-ply~=3.11
diff --git a/setup.py b/setup.py
index c8f9830..1f8e357 100644
--- a/setup.py
+++ b/setup.py
@@ -10,9 +10,10 @@
     )
     raise
 
-from setuptools import find_packages
 from os import path
 
+from setuptools import find_packages
+
 here = path.abspath(path.dirname(__file__))
 with open(path.join(here, "README.md"), encoding="utf-8") as f:
     long_description = f.read()
@@ -43,12 +44,12 @@
     packages=find_packages(exclude=["test*"]),
     cmake_install_dir="quafu/simulators/",
     include_package_data=True,
-    package_data={"quafu":["qfasm/*.inc"]},
+    package_data={"quafu": ["qfasm/*.inc"]},
     long_description=long_description,
     long_description_content_type="text/markdown",
-    extras_require={"test": ["pytest"]}, 
+    extras_require={"test": ["pytest"]},
     python_requires=">=3.8",
     zip_safe=False,
     setup_cfg=True,
-    license="Apache-2.0 License"
+    license="Apache-2.0 License",
 )
diff --git a/src/qfvm/circuit.hpp b/src/qfvm/circuit.hpp
index cd08bea..f060f56 100644
--- a/src/qfvm/circuit.hpp
+++ b/src/qfvm/circuit.hpp
@@ -1,241 +1,234 @@
 #pragma once
 #include "operators.hpp"
 #include "qasm.hpp"
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
-#include <pybind11/stl.h>
-#include <pybind11/eigen.h>
 #include "util.h"
 #include <Eigen/Core>
 #include <algorithm>
 #include <iostream>
+#include <pybind11/eigen.h>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
 
 namespace py = pybind11;
 using namespace pybind11::literals;
 
+void check_operator(QuantumOperator& op) {
+  std::cout << "-------------" << std::endl;
 
-void check_operator(QuantumOperator &op){
-    std::cout << "-------------" << std::endl;
-
-    std::cout << "name: " << op.name() << std::endl;
-    std::cout << "pos: ";
-    Qfutil::printVector(op.positions());
+  std::cout << "name: " << op.name() << std::endl;
+  std::cout << "pos: ";
+  Qfutil::printVector(op.positions());
 
-    std::cout << "paras: ";
-    Qfutil::printVector(op.paras());
+  std::cout << "paras: ";
+  Qfutil::printVector(op.paras());
 
-    std::cout << "control number: ";
-    std::cout << op.control_num() << std::endl;
+  std::cout << "control number: ";
+  std::cout << op.control_num() << std::endl;
 
-    std::cout << "matrix: " << std::endl;
-    std::cout << op.mat() << std::endl;
+  std::cout << "matrix: " << std::endl;
+  std::cout << op.mat() << std::endl;
 
-    std::cout << "flatten matrix: " << std::endl;
-    auto mat = op.mat();
-    // Eigen::Map<Eigen::RowVectorXcd> v1(mat.data(), mat.size());
-    // std::cout << "v1: " << v1 << std::endl;
-    auto matv = mat.data();
-    for (auto i = 0;i < mat.size();i++){
-        std::cout << matv[i] << " ";
-    }
-    std::cout << std::endl;
-    std::cout << "-------------" << std::endl;
+  std::cout << "flatten matrix: " << std::endl;
+  auto mat = op.mat();
+  // Eigen::Map<Eigen::RowVectorXcd> v1(mat.data(), mat.size());
+  // std::cout << "v1: " << v1 << std::endl;
+  auto matv = mat.data();
+  for (auto i = 0; i < mat.size(); i++) {
+    std::cout << matv[i] << " ";
+  }
+  std::cout << std::endl;
+  std::cout << "-------------" << std::endl;
 }
 
-
-class Circuit{
-    private:
-        uint qubit_num_;  
-        vector<QuantumOperator> instructions_;
-        uint max_targe_num_;
-        uint cbit_num_;
-        // to sample count
-        vector<std::pair<uint,uint>> measure_vec_;
-        bool final_measure_ = true;
-
-    public:
-    Circuit();
-    explicit Circuit(uint qubit_num);
-    explicit Circuit(vector<QuantumOperator> &ops);
-    explicit Circuit(py::object const&pycircuit); 
-
-    void add_op(QuantumOperator &op);
-    void compress_instructions();
-    uint qubit_num() const { return qubit_num_; }
-    uint cbit_num() const { return cbit_num_; }
-    uint max_targe_num() const { return max_targe_num_; }
-    bool final_measure() const { return final_measure_; }
-    vector<QuantumOperator> gates();
-    vector<std::pair<uint,uint>> measure_vec() { return measure_vec_; }
-    vector<QuantumOperator> instructions() const { return instructions_; }
-    QuantumOperator from_pyops(py::object const &obj);
+class Circuit {
+private:
+  uint qubit_num_;
+  vector<QuantumOperator> instructions_;
+  uint max_targe_num_;
+  uint cbit_num_;
+  // to sample count
+  vector<std::pair<uint, uint>> measure_vec_;
+  bool final_measure_ = true;
+
+public:
+  Circuit();
+  explicit Circuit(uint qubit_num);
+  explicit Circuit(vector<QuantumOperator>& ops);
+  explicit Circuit(py::object const& pycircuit);
+
+  void add_op(QuantumOperator& op);
+  void compress_instructions();
+  uint qubit_num() const { return qubit_num_; }
+  uint cbit_num() const { return cbit_num_; }
+  uint max_targe_num() const { return max_targe_num_; }
+  bool final_measure() const { return final_measure_; }
+  vector<QuantumOperator> gates();
+  vector<std::pair<uint, uint>> measure_vec() { return measure_vec_; }
+  vector<QuantumOperator> instructions() const { return instructions_; }
+  QuantumOperator from_pyops(py::object const& obj);
 };
 
-void Circuit::add_op(QuantumOperator &op){
-    for (pos_t pos : op.positions()){
-        if (pos > qubit_num_) {
-            throw "invalid position on quantum registers";
-        }
-        else{
-            instructions_.push_back(op);
-        }
+void Circuit::add_op(QuantumOperator& op) {
+  for (pos_t pos : op.positions()) {
+    if (pos > qubit_num_) {
+      throw "invalid position on quantum registers";
+    } else {
+      instructions_.push_back(op);
     }
+  }
 }
 
- Circuit::Circuit(){};
- Circuit::Circuit(uint qubit_num)
- :
- qubit_num_(qubit_num){ }
-
- Circuit::Circuit(vector<QuantumOperator> &ops)
- :
- instructions_(ops),
- max_targe_num_(0){
-    qubit_num_ = 0;
-    for (auto op : ops){
-        for (pos_t pos : op.positions()){
-            if (op.targe_num() > max_targe_num_)
-                max_targe_num_ = op.targe_num();
-            if (pos+1 > qubit_num_){ qubit_num_ = pos+1; }
-        }
+Circuit::Circuit(){};
+Circuit::Circuit(uint qubit_num) : qubit_num_(qubit_num) {}
+
+Circuit::Circuit(vector<QuantumOperator>& ops)
+    : instructions_(ops), max_targe_num_(0) {
+  qubit_num_ = 0;
+  for (auto op : ops) {
+    for (pos_t pos : op.positions()) {
+      if (op.targe_num() > max_targe_num_)
+        max_targe_num_ = op.targe_num();
+      if (pos + 1 > qubit_num_) {
+        qubit_num_ = pos + 1;
+      }
     }
+  }
 }
 
-vector<QuantumOperator> Circuit::gates(){
-    // provide gates for gpu and custate
-    std::vector<std::string> classics = {"measure", "cif", "reset"};
-    vector<QuantumOperator> gates;
-    for(auto op : instructions_){
-        if(std::find(classics.begin(), classics.end(), op.name()) == classics.end()){
-            gates.push_back(op);
-        }
+vector<QuantumOperator> Circuit::gates() {
+  // provide gates for gpu and custate
+  std::vector<std::string> classics = {"measure", "cif", "reset"};
+  vector<QuantumOperator> gates;
+  for (auto op : instructions_) {
+    if (std::find(classics.begin(), classics.end(), op.name()) ==
+        classics.end()) {
+      gates.push_back(op);
     }
-    return gates;
+  }
+  return gates;
 }
 
 // Construct C++ operators from pygates
-QuantumOperator Circuit::from_pyops(py::object const &obj){
-    string name;
-    vector<pos_t> positions;
-    vector<pos_t> qbits;
-    vector<pos_t> cbits;
-    vector<double> paras;
-    uint control_num = 0;
-    RowMatrixXcd mat;
-    
-    name = obj.attr("name").attr("lower")().cast<string>();
-    if (!(name == "barrier" || name == "delay" || name == "id" || name == "measure" || name == "reset" || name == "cif"))
-    {
-        if (py::isinstance<py::list>(obj.attr("pos"))){
-            positions = obj.attr("pos").cast<vector<pos_t>>();
-        }
-        else if(py::isinstance<py::int_>(obj.attr("pos"))){
-            positions = vector<pos_t>{obj.attr("pos").cast<pos_t>()};
-        }
-
-        if (py::isinstance<py::list>(obj.attr("paras"))){
-            paras = obj.attr("paras").cast<vector<double>>();
-        }
-        else if(py::isinstance<py::float_>(obj.attr("paras")) || py::isinstance<py::int_>(obj.attr("paras"))){
-            paras = vector<double>{obj.attr("paras").cast<double>()};
-        }
+QuantumOperator Circuit::from_pyops(py::object const& obj) {
+  string name;
+  vector<pos_t> positions;
+  vector<pos_t> qbits;
+  vector<pos_t> cbits;
+  vector<double> paras;
+  uint control_num = 0;
+  RowMatrixXcd mat;
+
+  name = obj.attr("name").attr("lower")().cast<string>();
+  if (!(name == "barrier" || name == "delay" || name == "id" ||
+        name == "measure" || name == "reset" || name == "cif")) {
+    if (py::isinstance<py::list>(obj.attr("pos"))) {
+      positions = obj.attr("pos").cast<vector<pos_t>>();
+    } else if (py::isinstance<py::int_>(obj.attr("pos"))) {
+      positions = vector<pos_t>{obj.attr("pos").cast<pos_t>()};
+    }
 
-        if (py::hasattr(obj, "ctrls")){
-                control_num = py::len(obj.attr("ctrls"));
-        }
-        
-        //Reverse order for multi-target gate
-        if (py::hasattr(obj, "_targ_matrix")){
-                mat = obj.attr("get_targ_matrix")("reverse_order"_a=true).cast<RowMatrixXcd>();
-        }
-        else{ //Single target gate
-                mat = obj.attr("matrix").cast<RowMatrixXcd>();
-        }
-        return QuantumOperator(name, paras, positions, control_num, mat);
+    if (py::isinstance<py::list>(obj.attr("paras"))) {
+      paras = obj.attr("paras").cast<vector<double>>();
+    } else if (py::isinstance<py::float_>(obj.attr("paras")) ||
+               py::isinstance<py::int_>(obj.attr("paras"))) {
+      paras = vector<double>{obj.attr("paras").cast<double>()};
+    }
 
-    }else if(name == "measure"){
-        if (py::isinstance<py::list>(obj.attr("qbits"))){
-            qbits = obj.attr("qbits").cast<vector<pos_t>>();
-        }else if(py::isinstance<py::int_>(obj.attr("qbits"))){
-            qbits = vector<pos_t>{obj.attr("qbits").cast<pos_t>()};
-        }
+    if (py::hasattr(obj, "ctrls")) {
+      control_num = py::len(obj.attr("ctrls"));
+    }
 
-        if (py::isinstance<py::list>(obj.attr("cbits"))){
-            cbits = obj.attr("cbits").cast<vector<pos_t>>();
-        }else if(py::isinstance<py::int_>(obj.attr("cbits"))){
-            cbits = vector<pos_t>{obj.attr("cbits").cast<pos_t>()};
-        }
-        //record qbit-cbit measure map
-        for(uint i = 0; i < qbits.size(); i++){
-            measure_vec_.push_back(std::make_pair(qbits[i], cbits[i]));
-        }
-        return QuantumOperator(name, qbits, cbits);
+    // Reverse order for multi-target gate
+    if (py::hasattr(obj, "_targ_matrix")) {
+      mat = obj.attr("get_targ_matrix")("reverse_order"_a = true)
+                .cast<RowMatrixXcd>();
+    } else { // Single target gate
+      mat = obj.attr("matrix").cast<RowMatrixXcd>();
+    }
+    return QuantumOperator(name, paras, positions, control_num, mat);
 
-    }else if(name == "reset"){
-        if (py::isinstance<py::list>(obj.attr("pos"))){
-            positions = obj.attr("pos").cast<vector<pos_t>>();
-        }
-        else if(py::isinstance<py::int_>(obj.attr("pos"))){
-            positions = vector<pos_t>{obj.attr("pos").cast<pos_t>()};
-        }
-        return QuantumOperator(name, positions);
-
-    }else if(name == "cif"){
-        uint condition = 0;
-        vector<QuantumOperator> instructions;
-        if (py::isinstance<py::list>(obj.attr("cbits"))){
-            cbits = obj.attr("cbits").cast<vector<pos_t>>();
-        }else if(py::isinstance<py::int_>(obj.attr("cbits"))){
-            cbits = vector<pos_t>{obj.attr("cbits").cast<pos_t>()};
-        }
+  } else if (name == "measure") {
+    if (py::isinstance<py::list>(obj.attr("qbits"))) {
+      qbits = obj.attr("qbits").cast<vector<pos_t>>();
+    } else if (py::isinstance<py::int_>(obj.attr("qbits"))) {
+      qbits = vector<pos_t>{obj.attr("qbits").cast<pos_t>()};
+    }
 
-        if(py::isinstance<py::int_>(obj.attr("condition"))){
-            condition = obj.attr("condition").cast<pos_t>();
-        }
+    if (py::isinstance<py::list>(obj.attr("cbits"))) {
+      cbits = obj.attr("cbits").cast<vector<pos_t>>();
+    } else if (py::isinstance<py::int_>(obj.attr("cbits"))) {
+      cbits = vector<pos_t>{obj.attr("cbits").cast<pos_t>()};
+    }
+    // record qbit-cbit measure map
+    for (uint i = 0; i < qbits.size(); i++) {
+      measure_vec_.push_back(std::make_pair(qbits[i], cbits[i]));
+    }
+    return QuantumOperator(name, qbits, cbits);
 
-        // Recursively handdle instruction
-        if (py::isinstance<py::list>(obj.attr("instructions"))){
-            auto pyops = obj.attr("instructions");
-            for(auto pyop_h : pyops){
-                py::object pyop = py::reinterpret_borrow<py::object>(pyop_h);
-                QuantumOperator op = from_pyops(pyop);
-                if (op){
-                    if (op.targe_num() > max_targe_num_)
-                        max_targe_num_ = op.targe_num();
-                    instructions.push_back(std::move(op));
-                }        
-            }
-        }
-        return QuantumOperator(name, cbits, condition, instructions); 
+  } else if (name == "reset") {
+    if (py::isinstance<py::list>(obj.attr("pos"))) {
+      positions = obj.attr("pos").cast<vector<pos_t>>();
+    } else if (py::isinstance<py::int_>(obj.attr("pos"))) {
+      positions = vector<pos_t>{obj.attr("pos").cast<pos_t>()};
+    }
+    return QuantumOperator(name, positions);
+
+  } else if (name == "cif") {
+    uint condition = 0;
+    vector<QuantumOperator> instructions;
+    if (py::isinstance<py::list>(obj.attr("cbits"))) {
+      cbits = obj.attr("cbits").cast<vector<pos_t>>();
+    } else if (py::isinstance<py::int_>(obj.attr("cbits"))) {
+      cbits = vector<pos_t>{obj.attr("cbits").cast<pos_t>()};
+    }
 
-    }else{
-        return QuantumOperator();
+    if (py::isinstance<py::int_>(obj.attr("condition"))) {
+      condition = obj.attr("condition").cast<pos_t>();
     }
-   
-}
 
-Circuit::Circuit(py::object const&pycircuit)
-:
-max_targe_num_(0)
-{
-    // auto pygates = pycircuit.attr("gates");
-    auto pyops = pycircuit.attr("instructions");
-    auto used_qubits = pycircuit.attr("used_qubits").cast<vector<pos_t>>();
-    cbit_num_ =  pycircuit.attr("cbits_num").cast<uint>();
-    qubit_num_ = *std::max_element(used_qubits.begin(), used_qubits.end())+1;
-    // judge wheather op qubit after measure
-    bool measured = false;
-    for (auto pyop_h : pyops){
+    // Recursively handle nested instructions
+    if (py::isinstance<py::list>(obj.attr("instructions"))) {
+      auto pyops = obj.attr("instructions");
+      for (auto pyop_h : pyops) {
         py::object pyop = py::reinterpret_borrow<py::object>(pyop_h);
         QuantumOperator op = from_pyops(pyop);
-        if (op){
-            if (op.targe_num() > max_targe_num_)
-                max_targe_num_ = op.targe_num();
-            if(op.name() == "measure") {measured = true;}
-            else if(measured == true) {final_measure_ = false; } 
-            instructions_.push_back(std::move(op));
-        }        
+        if (op) {
+          if (op.targe_num() > max_targe_num_)
+            max_targe_num_ = op.targe_num();
+          instructions.push_back(std::move(op));
+        }
+      }
     }
-} 
+    return QuantumOperator(name, cbits, condition, instructions);
+
+  } else {
+    return QuantumOperator();
+  }
+}
+
+// Build the circuit from a Python object's instructions/used_qubits/cbits_num.
+Circuit::Circuit(py::object const& pycircuit) : max_targe_num_(0) {
+  auto pyops = pycircuit.attr("instructions");
+  auto used_qubits = pycircuit.attr("used_qubits").cast<vector<pos_t>>();
+  cbit_num_ = pycircuit.attr("cbits_num").cast<uint>();
+  // Highest used index + 1, or 0 when used_qubits is empty (never deref end()).
+  auto max_it = std::max_element(used_qubits.begin(), used_qubits.end());
+  qubit_num_ = (max_it == used_qubits.end()) ? 0 : *max_it + 1;
+  bool measured = false; // set once a "measure" op has been seen
+  for (auto pyop_h : pyops) {
+    py::object pyop = py::reinterpret_borrow<py::object>(pyop_h);
+    QuantumOperator op = from_pyops(pyop);
+    if (!op)
+      continue; // from_pyops returned the "empty" placeholder
+    if (op.targe_num() > max_targe_num_)
+      max_targe_num_ = op.targe_num();
+    if (op.name() == "measure")
+      measured = true;
+    else if (measured)
+      final_measure_ = false; // an operation follows a measurement
+    instructions_.push_back(std::move(op));
+  }
+}
 
-void Circuit::compress_instructions(){}
+void Circuit::compress_instructions() {} // currently a no-op: the body is empty
diff --git a/src/qfvm/operators.hpp b/src/qfvm/operators.hpp
index cb9fd66..ab1d11a 100644
--- a/src/qfvm/operators.hpp
+++ b/src/qfvm/operators.hpp
@@ -1,97 +1,89 @@
 #pragma once
 
-#include <iostream>
 #include "statevector.hpp"
+#include <iostream>
 
-class QuantumOperator{
-    protected:
-        string name_;
-        vector<pos_t> positions_;
-        vector<double> paras_;
-        uint control_num_;
-        uint targe_num_;  
-        bool diag_;
-        bool real_;
-        RowMatrixXcd mat_;
-        vector<pos_t> qbits_;
-        vector<pos_t> cbits_;
-        vector<QuantumOperator> instructions_;
-        uint condition_;
-    public:
-        //Constructor
-        QuantumOperator();
-        QuantumOperator(string name, vector<pos_t> const &qbits);
-        QuantumOperator(string name, vector<pos_t> const &qbits, vector<pos_t> const &cbits);
-        QuantumOperator(string name, vector<pos_t> const &cbits, const uint condition, vector<QuantumOperator> const &ins);
-        QuantumOperator(string name, vector<double> paras, vector<pos_t> const &control_qubits, vector<pos_t> const &targe_qubits, RowMatrixXcd const &mat, bool diag=false, bool real=false);
-        QuantumOperator(string name,vector<double> paras, vector<pos_t> const &positions, uint control_num, RowMatrixXcd const &mat, bool diag=false, bool real=false);
+class QuantumOperator { // one circuit instruction: gate, measure, reset or cif
+protected:
+  string name_;             // operation name; "empty" marks a placeholder
+  vector<pos_t> positions_; // gate qubits (controls first when built from lists)
+  vector<double> paras_;    // gate parameters
+  uint control_num_ = 0;    // control-qubit count; defaulted so no ctor leaves it unset
+  uint targe_num_ = 0;      // target-qubit count
+  bool diag_ = false;
+  bool real_ = false;
+  RowMatrixXcd mat_;        // gate matrix
+  vector<pos_t> qbits_;     // qubits of measure/reset operators
+  vector<pos_t> cbits_;     // classical bits of measure/cif operators
+  vector<QuantumOperator> instructions_; // nested operators of a cif
+  uint condition_ = 0;      // cif condition value
 
-        //data accessor
-        string name() const {return name_;}
-        vector<double> paras() const {return paras_;}
-        bool has_control() const{return control_num_ == 0 ? false : true;}
-        bool is_real() const{ return real_; }
-        bool is_diag() const{ return diag_; }
-        RowMatrixXcd mat() const { return mat_;}
-        uint control_num() const { return control_num_; } 
-        uint targe_num() const { return targe_num_; }
-        uint condition() const { return condition_; }
-        vector<pos_t> positions(){ return positions_; }
-        explicit operator bool() const {
-            return !(name_ == "empty");
-        }
-        vector<pos_t> qbits(){ return qbits_; }
-        vector<pos_t> cbits(){ return cbits_; }
-        vector<QuantumOperator> instructions(){ return instructions_; }
-        //Apply method
-        virtual void apply_to_state(StateVector<double> & state){ };
-};
+public:
+  // Constructors, in order: placeholder, reset, measure, cif, two gate forms
+  QuantumOperator();
+  QuantumOperator(string name, vector<pos_t> const& qbits);
+  QuantumOperator(string name, vector<pos_t> const& qbits,
+                  vector<pos_t> const& cbits);
+  QuantumOperator(string name, vector<pos_t> const& cbits, const uint condition,
+                  vector<QuantumOperator> const& ins);
+  QuantumOperator(string name, vector<double> paras,
+                  vector<pos_t> const& control_qubits,
+                  vector<pos_t> const& targe_qubits, RowMatrixXcd const& mat,
+                  bool diag = false, bool real = false);
+  QuantumOperator(string name, vector<double> paras,
+                  vector<pos_t> const& positions, uint control_num,
+                  RowMatrixXcd const& mat, bool diag = false,
+                  bool real = false);
 
+  // Read-only accessors; all return copies and are callable on const objects
+  string name() const { return name_; }
+  vector<double> paras() const { return paras_; }
+  bool has_control() const { return control_num_ == 0 ? false : true; }
+  bool is_real() const { return real_; }
+  bool is_diag() const { return diag_; }
+  RowMatrixXcd mat() const { return mat_; }
+  uint control_num() const { return control_num_; }
+  uint targe_num() const { return targe_num_; }
+  uint condition() const { return condition_; }
+  vector<pos_t> positions() const { return positions_; }
+  explicit operator bool() const { return !(name_ == "empty"); }
+  vector<pos_t> qbits() const { return qbits_; }
+  vector<pos_t> cbits() const { return cbits_; }
+  vector<QuantumOperator> instructions() const { return instructions_; }
+  // Hook for applying the operator to a state; the base version does nothing
+  virtual void apply_to_state(StateVector<double>& state){};
+};
 
-QuantumOperator::QuantumOperator() : name_("empty"){ };
+QuantumOperator::QuantumOperator() : name_("empty"){}; // placeholder: operator bool() is false for it
 
-QuantumOperator::QuantumOperator(string name, vector<pos_t> const &qbits)
-:
-name_(name),
-targe_num_(0),
-qbits_(qbits){}
+QuantumOperator::QuantumOperator(string name, vector<pos_t> const& qbits)
+    : name_(name), targe_num_(0), qbits_(qbits) {} // qubit-only form (from_pyops uses it for "reset")
 
-QuantumOperator::QuantumOperator(string name, vector<pos_t> const &qbits, vector<pos_t> const &cbits)
-:
-name_(name),
-targe_num_(0),
-qbits_(qbits),
-cbits_(cbits){}
+QuantumOperator::QuantumOperator(string name, vector<pos_t> const& qbits,
+                                 vector<pos_t> const& cbits)
+    : name_(name), targe_num_(0), qbits_(qbits), cbits_(cbits) {} // qubit/cbit form (from_pyops uses it for "measure")
 
-QuantumOperator::QuantumOperator(string name, vector<pos_t> const &cbits, const uint condition, vector<QuantumOperator> const &ins)
-:
-name_(name),
-targe_num_(0),
-cbits_(cbits),
-instructions_(ins),
-condition_(condition){}
+QuantumOperator::QuantumOperator(string name, vector<pos_t> const& cbits,
+                                 const uint condition,
+                                 vector<QuantumOperator> const& ins)
+    : name_(name), targe_num_(0), cbits_(cbits), instructions_(ins),
+      condition_(condition) {} // conditional form (from_pyops uses it for "cif"); ins is copied
 
-QuantumOperator::QuantumOperator(string name, vector<double> paras, vector<pos_t> const &positions, uint control_num, RowMatrixXcd const &mat, bool diag, bool real)
-:
-name_(name),
-paras_(paras),
-positions_(positions),
-control_num_(control_num),
-targe_num_(positions.size()-control_num),
-diag_(diag),
-real_(real),
-mat_(mat){ }
+QuantumOperator::QuantumOperator(string name, vector<double> paras,
+                                 vector<pos_t> const& positions,
+                                 uint control_num, RowMatrixXcd const& mat,
+                                 bool diag, bool real) // gate form; initializers below follow declaration order
+    : name_(name), positions_(positions), paras_(paras),
+      control_num_(control_num), targe_num_(positions.size() - control_num),
+      diag_(diag), real_(real), mat_(mat) {} // NOTE(review): targe_num_ wraps if control_num > positions.size()
 
-QuantumOperator::QuantumOperator(string name, vector<double> paras, vector<pos_t> const &control_qubits, vector<pos_t> const &targe_qubits, RowMatrixXcd const &mat, bool diag, bool real)
-:
-name_(name),
-paras_(paras),
-diag_(diag),
-real_(real),
-mat_(mat){
-    positions_ = control_qubits;
-    positions_.insert(positions_.end(), targe_qubits.begin(), targe_qubits.end());
-    control_num_ = control_qubits.size();
-    targe_num_ = targe_qubits.size();
+QuantumOperator::QuantumOperator(string name, vector<double> paras,
+                                 vector<pos_t> const& control_qubits,
+                                 vector<pos_t> const& targe_qubits,
+                                 RowMatrixXcd const& mat, bool diag, bool real)
+    : name_(name), positions_(control_qubits), paras_(paras),
+      control_num_(control_qubits.size()), targe_num_(targe_qubits.size()),
+      diag_(diag), real_(real), mat_(mat) {
+  // positions_ starts as the control qubits; the targets are appended after them.
+  positions_.insert(positions_.end(), targe_qubits.begin(), targe_qubits.end());
 }
-
diff --git a/src/qfvm/qasm.hpp b/src/qfvm/qasm.hpp
index e0a2285..4cfd83d 100644
--- a/src/qfvm/qasm.hpp
+++ b/src/qfvm/qasm.hpp
@@ -1,42 +1,74 @@
 #pragma once
 
-#include <unordered_map>
 #include "types.hpp"
 #include "util.h"
+#include <unordered_map>
 
-using Qfutil::split_string;
 using Qfutil::find_numbers;
-#define Pair(name) {#name, Opname::name}
+using Qfutil::split_string;
+#define Pair(name)                                                             \
+  {                                                                            \
+#name, Opname::name                                                        \
+  } // Pair(x) expands to the brace-initializer {"x", Opname::x}
 
-enum class Opname{
-    creg, x, y, z, h, s, sdg, t, tdg, p, rx, ry, rz, cnot, cx, cz, crx, cp, ccx, toffoli, swap, iswap, rxx, ryy, rzz, measure, reset, cif
+enum class Opname { // operation identifiers; OPMAP maps name strings onto these
+  creg, // first enumerator, i.e. value 0 (what a value-initialized Opname equals)
+  x,
+  y,
+  z,
+  h,
+  s,
+  sdg,
+  t,
+  tdg,
+  p,
+  rx,
+  ry,
+  rz,
+  cnot,
+  cx,
+  cz,
+  crx,
+  cp,
+  ccx,
+  toffoli,
+  swap,
+  iswap,
+  rxx,
+  ryy,
+  rzz,
+  measure, // the last three share their names with the non-gate branches of from_pyops
+  reset,
+  cif
 };
 
-std::unordered_map<string, Opname> OPMAP{Pair(creg), Pair(x), Pair(y), Pair(z), Pair(h), Pair(s), Pair(sdg), Pair(t),
-                            Pair(tdg), Pair(p), Pair(rx), Pair(ry), Pair(rz), Pair(cnot), Pair(cx), Pair(cz), 
-                            Pair(crx), Pair(cp), Pair(ccx), Pair(swap), Pair(iswap), Pair(rxx), Pair(ryy), 
-                            Pair(rzz), Pair(measure), Pair(reset), Pair(cif)};
+std::unordered_map<string, Opname> OPMAP{ // name -> Opname table, one entry per enumerator
+    Pair(creg),  Pair(x),       Pair(y),     Pair(z),     Pair(h),   Pair(s),
+    Pair(sdg),   Pair(t),       Pair(tdg),   Pair(p),     Pair(rx),  Pair(ry),
+    Pair(rz),    Pair(cnot),    Pair(cx),    Pair(cz),    Pair(crx), Pair(cp),
+    Pair(ccx),   Pair(toffoli), Pair(swap),  Pair(iswap), Pair(rxx), Pair(ryy),
+    Pair(rzz),   Pair(measure), Pair(reset), Pair(cif)};
 
-struct Operation{
-    string name;
-    vector<pos_t> positions;
-    vector<double> params; 
+struct Operation { // plain record: operation name, positions and parameters
+  string name;
+  vector<pos_t> positions;
+  vector<double> params;
 
-    void print_info(){
-        std::cout << "name " << name << std::endl;
-        std::cout << "positions: ";
-        for (auto pos : positions){
-            std::cout << pos << " ";
-        }
-        std::cout << std::endl;
+  void print_info() { // dumps the record to stdout (mixes std::cout and printf)
+    std::cout << "name " << name << std::endl;
+    std::cout << "positions: ";
+    for (auto pos : positions) {
+      std::cout << pos << " ";
+    }
+    std::cout << std::endl;
 
-        if (params.size() > 0){
-        printf("parameters: ");
-            for (auto para : params){
-                printf("%.6f ", para);
-            }
-        }
-        printf("\n");
-        printf("-----\n");
+    if (params.size() > 0) { // the parameter list is printed only when non-empty
+      printf("parameters: ");
+      for (auto para : params) {
+        printf("%.6f ", para);
+      }
     }
-} ;
+    printf("\n"); // emitted even when there are no parameters
+    printf("-----\n"); // separator after each record
+  }
+};
diff --git a/src/qfvm/qfvm.cpp b/src/qfvm/qfvm.cpp
index f318da6..913262d 100644
--- a/src/qfvm/qfvm.cpp
+++ b/src/qfvm/qfvm.cpp
@@ -1,8 +1,10 @@
-#include <pybind11/pybind11.h>
-#include <pybind11/numpy.h>
 #include "simulator.hpp"
 #include <iostream>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
 #include <random>
+#include <tuple>
+
 #ifdef _USE_GPU
 #include <cuda_simulator.cuh>
 #endif
@@ -11,154 +13,254 @@
 #include <custate_simu.cuh>
 #endif
 
+#ifdef USE_SIMD
+constexpr size_t _word_size = 256; // template argument of circuit_simulator<>; presumably a width in bits — TODO confirm
+#else
+constexpr size_t _word_size = 64; // value used when USE_SIMD is not defined
+#endif
+
 namespace py = pybind11;
 
 template <typename T>
-py::array_t<T> to_numpy(const std::tuple<T*, size_t> &src) {
-    auto src_ptr = std::get<0>(src);
-    auto src_size = std::get<1>(src);
-
-    auto capsule = py::capsule(src_ptr, [](void* p) {
-        delete [] reinterpret_cast<T*>(p);
-    });
-    return py::array_t<T>(
-        src_size,
-        src_ptr,
-        capsule
-    );
+py::array_t<T> to_numpy(const std::tuple<T*, size_t>& src) {
+  // Turn a (buffer, element count) pair into a numpy array; the capsule passed
+  // with it carries a destructor that releases the buffer with delete[].
+  T* buffer = std::get<0>(src);
+  const size_t length = std::get<1>(src);
+  py::capsule owner(buffer, [](void* p) { delete[] reinterpret_cast<T*>(p); });
+  return py::array_t<T>(length, buffer, owner);
 }
 
-std::pair<std::map<uint, uint>, py::array_t<complex<double>> > simulate_circuit(py::object const&pycircuit, py::array_t<complex<double>> &np_inputstate, const int &shots){
-    auto circuit = Circuit(pycircuit);
-    py::buffer_info buf = np_inputstate.request();
-    auto* data_ptr = reinterpret_cast<std::complex<double>*>(buf.ptr);
-    size_t data_size = buf.size;
-    // If measure all at the end, simulate once
-    uint actual_shots = shots;
-    if (circuit.final_measure()) actual_shots = 1;
-    StateVector<double> global_state;
-    vector<std::pair<uint,uint>>measures = circuit.measure_vec();
-    std::map<uint,bool>cbit_measured;
-    for(auto &pair: measures){
-        cbit_measured[pair.second] = true;        
+std::pair<std::map<uint, uint>, py::array_t<complex<double>>>
+simulate_circuit(py::object const& pycircuit,
+                 py::array_t<complex<double>>& np_inputstate,
+                 const int& shots) { // returns (outcome -> count, state array)
+  auto circuit = Circuit(pycircuit);
+  py::buffer_info buf = np_inputstate.request();
+  auto* data_ptr = reinterpret_cast<std::complex<double>*>(buf.ptr);
+  size_t data_size = buf.size; // 0 means no input state was supplied
+  // When circuit.final_measure() holds, simulate once and sample the shots later
+  uint actual_shots = shots;
+  if (circuit.final_measure())
+    actual_shots = 1;
+  StateVector<double> global_state; // receives the state of the last simulated run
+  vector<std::pair<uint, uint>> measures = circuit.measure_vec(); // (qubit, cbit) pairs
+  std::map<uint, bool> cbit_measured; // used as a set: cbits written by a measure
+  for (auto& pair : measures) {
+    cbit_measured[pair.second] = true;
+  }
+  // Outcome value -> number of shots that produced it
+  std::map<uint, uint> outcount;
+  for (uint i = 0; i < actual_shots; i++) {
+    StateVector<double> state;
+    if (data_size == 0) {
+      simulate(circuit, state);
+    } else {
+      // Build this shot's state from a private copy of the input amplitudes
+      vector<std::complex<double>> data_copy(data_ptr, data_ptr + data_size);
+      state =
+          std::move(StateVector<double>(data_copy.data(), data_copy.size()));
+      simulate(circuit, state);
+    }
+    if (!circuit.final_measure()) {
+      // Pack this shot's measured cbits into one integer, lowest cbit index first
+      vector<uint> tmpcreg = state.creg();
+      uint outcome = 0;
+      for (uint j = 0; j < tmpcreg.size(); j++) {
+        if (cbit_measured.find(j) == cbit_measured.end())
+          continue; // cbits never written by a measure are left out
+        outcome *= 2;
+        outcome += tmpcreg[j];
+      }
+      if (outcount.find(outcome) != outcount.end())
+        outcount[outcome]++;
+      else
+        outcount[outcome] = 1;
     }
-    // Store outcome's count
-    std::map<uint, uint> outcount;
-    for(uint i =0; i < actual_shots; i++){
-        StateVector<double> state;
-        if(data_size == 0){
-            simulate(circuit, state);
-        }else{
-            //deepcopy state
-            vector<std::complex<double>> data_copy(data_ptr, data_ptr + data_size);
-            state = std::move(StateVector<double>(data_copy.data(), data_copy.size()));
-            simulate(circuit, state);
-        }
-        if(!circuit.final_measure()){
-            // store reg
-            vector<uint> tmpcreg = state.creg();
-            uint outcome = 0;
-            for(uint j=0;j<tmpcreg.size();j++){
-                if(cbit_measured.find(j) == cbit_measured.end()) continue; 
-                outcome *= 2;
-                outcome += tmpcreg[j];
-            }
-            if(outcount.find(outcome) != outcount.end()) outcount[outcome]++;
-            else outcount[outcome] = 1;
-        }
-        if (circuit.final_measure() || i == actual_shots-1) global_state = std::move(state);
+    if (circuit.final_measure() || i == actual_shots - 1)
+      global_state = std::move(state); // keep the last run's state for the result
+  }
+  // final_measure with at least one measure op: draw `shots` samples from the final probabilities
+  if (circuit.final_measure() && !measures.empty()) {
+    vector<uint> tmpcount(global_state.size(), 0); // hits per sampled index
+    vector<double> probs = global_state.probabilities();
+    std::random_device rd;
+    std::mt19937 global_rng(rd()); // freshly seeded on every call
+    for (uint i = 0; i < shots; i++) {
+      uint outcome = std::discrete_distribution<uint>(probs.begin(),
+                                                      probs.end())(global_rng);
+      tmpcount[outcome]++;
     }
-    // sample outcome if final_measure is true
-    if(circuit.final_measure() && !measures.empty()){
-        vector<uint> tmpcount(global_state.size(), 0);
-        vector<double> probs = global_state.probabilities();
-        std::random_device rd;
-        std::mt19937 global_rng(rd());
-        for(uint i = 0; i < shots; i++){
-            uint outcome = std::discrete_distribution<uint>(probs.begin(), probs.end())(global_rng);
-            tmpcount[outcome]++;
-        }
-        // map to reg
-        for(uint i = 0; i < global_state.size(); i++){
-            if(tmpcount[i] == 0) continue;
-            vector<uint> tmpcreg(global_state.cbit_num(), 0); 
-            vector<uint> tmpout = int2vec(i, 2);
-            if(tmpout.size() < global_state.num())
-                tmpout.resize(global_state.num());
-            for(auto &pair: measures){
-                tmpcreg[pair.second] = tmpout[pair.first];
-            }
-            uint outcome = 0;
-            for(uint j=0;j<tmpcreg.size();j++){
-                if(cbit_measured.find(j) == cbit_measured.end()) continue; 
-                outcome *= 2;
-                outcome += tmpcreg[j];
-            }
-            if(outcount.find(outcome) != outcount.end()) outcount[outcome] += tmpcount[i];
-            else outcount[outcome] = tmpcount[i];
-        }
-    } 
-    // return 
-    if(data_size == 0) return std::make_pair(outcount, to_numpy(global_state.move_data_to_python()));
-    else return std::make_pair(outcount, np_inputstate);  
+    // Translate each sampled index into a classical-register outcome via `measures`
+    for (uint i = 0; i < global_state.size(); i++) {
+      if (tmpcount[i] == 0)
+        continue; // index never sampled
+      vector<uint> tmpcreg(global_state.cbit_num(), 0);
+      vector<uint> tmpout = int2vec(i, 2); // digits of i in base 2
+      if (tmpout.size() < global_state.num())
+        tmpout.resize(global_state.num()); // pad so every qubit index is addressable
+      for (auto& pair : measures) {
+        tmpcreg[pair.second] = tmpout[pair.first]; // cbit <- digit of its measured qubit
+      }
+      uint outcome = 0;
+      for (uint j = 0; j < tmpcreg.size(); j++) {
+        if (cbit_measured.find(j) == cbit_measured.end())
+          continue;
+        outcome *= 2;
+        outcome += tmpcreg[j];
+      }
+      if (outcount.find(outcome) != outcount.end())
+        outcount[outcome] += tmpcount[i];
+      else
+        outcount[outcome] = tmpcount[i];
+    }
+  }
+  // Return the counts plus either the exported state or the caller's own array
+  if (data_size == 0)
+    return std::make_pair(outcount,
+                          to_numpy(global_state.move_data_to_python()));
+  else
+    return std::make_pair(outcount, np_inputstate); // NOTE(review): input array, not global_state — confirm intended
 }
 
-#ifdef _USE_GPU
-py::object simulate_circuit_gpu(py::object const&pycircuit, py::array_t<complex<double>> &np_inputstate){
-    auto circuit = Circuit(pycircuit);
-    py::buffer_info buf = np_inputstate.request();
-    auto* data_ptr = reinterpret_cast<std::complex<double>*>(buf.ptr);
-    size_t data_size = buf.size;
+std::map<uint, uint> simulate_circuit_clifford(py::object const& pycircuit,
+                                               const int& shots) { // returns outcome -> count
 
+  auto circuit = Circuit(pycircuit);
 
-    if (data_size == 0){
-        StateVector<double> state;
-        simulate_gpu(circuit, state);
-        return to_numpy(state.move_data_to_python());
-    }
-    else{
-      StateVector<double> state(data_ptr, buf.size);
-      simulate_gpu(circuit, state);
-      state.move_data_to_python();
-      return np_inputstate;
-    }
-}
-#endif
+  // Every shot is simulated separately here (there is no simulate-once shortcut)
+  uint actual_shots = shots;
 
-#ifdef _USE_CUQUANTUM
-py::object simulate_circuit_custate(py::object const&pycircuit, py::array_t<complex<double>> &np_inputstate){
-    auto circuit = Circuit(pycircuit);
-    py::buffer_info buf = np_inputstate.request();
-    auto* data_ptr = reinterpret_cast<std::complex<double>*>(buf.ptr);
-    size_t data_size = buf.size;
+  // (qubit, cbit) pairs recorded for the circuit's measure operations
+  vector<std::pair<uint, uint>> measures = circuit.measure_vec();
+  std::map<uint, bool> cbit_measured; // filled below but not read again in this function
+  for (auto& pair : measures) {
+    cbit_measured[pair.second] = true;
+  }
+
+  // Outcome value -> number of shots that produced it
+  std::map<uint, uint> outcount;
+
+  circuit_simulator<_word_size> cs(circuit.qubit_num());
+
+  for (uint i = 0; i < actual_shots; i++) {
+
+    simulate(circuit, cs);
+    uint outcome = 0;
+
+    if (!circuit.final_measure()) {
+      // Records read below as (qubit, cbit, measure result)
+      auto measure_results = cs.current_measurement_record();
 
+      // Sort by cbit (tuple field 1) so bits are packed in the order other simulators use
+      std::sort(
+          measure_results.begin(), measure_results.end(),
+          [](auto& a, auto& b) { return std::get<1>(a) < std::get<1>(b); });
 
-    if (data_size == 0){
-        StateVector<double> state;
-        simulate_custate(circuit, state);
-        return to_numpy(state.move_data_to_python());
+      for (auto& measure_result : measure_results) {
+        outcome *= 2;
+        outcome += std::get<2>(measure_result); // append the measured bit
+      }
+
+    } else if (circuit.final_measure() && !measures.empty()) {
+      for (auto& measure : measures) {
+        cs.do_circuit_instruction(
+            {"measure", std::vector<size_t>{measure.first},
+             std::vector<double>{static_cast<double>(measure.second)}}); // cbit index travels as the parameter
+      }
+
+      // Records read below as (qubit, cbit, measure result)
+      auto measure_results = cs.current_measurement_record();
+
+      // Same cbit ordering as in the branch above
+      std::sort(
+          measure_results.begin(), measure_results.end(),
+          [](auto& a, auto& b) { return std::get<1>(a) < std::get<1>(b); });
+
+      for (auto& measure_result : measure_results) {
+        outcome *= 2;
+        outcome += std::get<2>(measure_result);
+      }
     }
-    else{
-      StateVector<double> state(data_ptr, buf.size);
-      simulate_custate(circuit, state);
-      state.move_data_to_python();
-      return np_inputstate;
+    // Nothing to count without measurements; this also skips the reset calls below
+    if (measures.empty()) {
+      continue;
     }
+
+    if (outcount.find(outcome) != outcount.end())
+      outcount[outcome]++;
+    else
+      outcount[outcome] = 1;
+    // Clear simulator state before the next shot
+    cs.reset_tableau();
+    cs.sim_record.clear();
+  }
+
+  return outcount;
+}
+
+#ifdef _USE_GPU
+// Runs the circuit through simulate_gpu and returns the state as a Python object.
+py::object simulate_circuit_gpu(py::object const& pycircuit,
+                                py::array_t<complex<double>>& np_inputstate) {
+  auto circuit = Circuit(pycircuit);
+  py::buffer_info input_info = np_inputstate.request();
+  if (input_info.size == 0) {
+    // No input amplitudes: simulate from a default-constructed state, export it.
+    StateVector<double> fresh_state;
+    simulate_gpu(circuit, fresh_state);
+    return to_numpy(fresh_state.move_data_to_python());
+  }
+  // Otherwise build the state from the caller's buffer and return that array.
+  auto* amplitudes = reinterpret_cast<std::complex<double>*>(input_info.ptr);
+  StateVector<double> wrapped_state(amplitudes, input_info.size);
+  simulate_gpu(circuit, wrapped_state);
+  wrapped_state.move_data_to_python(); // return value is discarded
+  return np_inputstate;
 }
 #endif
 
+#ifdef _USE_CUQUANTUM
+// Runs the circuit through simulate_custate; returns the state as a Python object.
+py::object
+simulate_circuit_custate(py::object const& pycircuit,
+                         py::array_t<complex<double>>& np_inputstate) {
+  auto circuit = Circuit(pycircuit);
+  py::buffer_info input_info = np_inputstate.request();
+  auto* amplitudes = reinterpret_cast<std::complex<double>*>(input_info.ptr);
 
+  if (input_info.size == 0) {
+    // No input amplitudes: simulate from a default-constructed state, export it.
+    StateVector<double> fresh_state;
+    simulate_custate(circuit, fresh_state);
+    return to_numpy(fresh_state.move_data_to_python());
+  }
+  StateVector<double> wrapped_state(amplitudes, input_info.size); // built from the caller's buffer
+  simulate_custate(circuit, wrapped_state);
+  wrapped_state.move_data_to_python(); // return value is discarded
+  return np_inputstate;
 }
 #endif
 
 PYBIND11_MODULE(qfvm, m) {
-    m.doc() = "Qfvm simulator";
-    m.def("simulate_circuit", &simulate_circuit, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t<complex<double>>(0), py::arg("shots"));
+  m.doc() = "Qfvm simulator"; // module docstring
+  m.def("simulate_circuit", &simulate_circuit, "Simulate with circuit",
+        py::arg("circuit"),
+        py::arg("inputstate") = py::array_t<complex<double>>(0),
+        py::arg("shots")); // "inputstate" defaults to an empty array; "shots" has no default
+  m.def("simulate_circuit_clifford", &simulate_circuit_clifford,
+        "Simulate with circuit using clifford", py::arg("circuit"),
+        py::arg("shots")); // takes no input-state argument
 
-    #ifdef _USE_GPU
-     m.def("simulate_circuit_gpu", &simulate_circuit_gpu, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t<complex<double>>(0));
-    #endif
+#ifdef _USE_GPU // exposed only when built with _USE_GPU
+  m.def("simulate_circuit_gpu", &simulate_circuit_gpu, "Simulate with circuit",
+        py::arg("circuit"),
+        py::arg("inputstate") = py::array_t<complex<double>>(0));
+#endif
 
-    #ifdef _USE_CUQUANTUM
-    m.def("simulate_circuit_custate", &simulate_circuit_custate, "Simulate with circuit", py::arg("circuit"), py::arg("inputstate")= py::array_t<complex<double>>(0));
-    #endif
+#ifdef _USE_CUQUANTUM // exposed only when built with _USE_CUQUANTUM
+  m.def("simulate_circuit_custate", &simulate_circuit_custate,
+        "Simulate with circuit", py::arg("circuit"),
+        py::arg("inputstate") = py::array_t<complex<double>>(0));
+#endif
 }
-
diff --git a/src/qfvm/simulator.hpp b/src/qfvm/simulator.hpp
index 1499922..9e88fb6 100644
--- a/src/qfvm/simulator.hpp
+++ b/src/qfvm/simulator.hpp
@@ -1,116 +1,162 @@
 #pragma once
 
-#include "statevector.hpp"
 #include "circuit.hpp"
+#include "clifford_simulator.h"
+#include "qasm.hpp"
+#include "statevector.hpp"
+#include "types.hpp"
+#include <cstddef>
+#include <vector>
 
-void apply_op(QuantumOperator &op, StateVector<data_t> &state){
-    bool matched = false; 
-    switch (OPMAP[op.name()]){
-            //Named gate
-        case Opname::x:
-            state.apply_x(op.positions()[0]);
-            break;
-        case Opname::y:
-            state.apply_y(op.positions()[0]);
-            break;
-        case Opname::z:
-            state.apply_z(op.positions()[0]);
-            break;
-        case Opname::h:
-            state.apply_h(op.positions()[0]);
-            break;
-        case Opname::s:
-            state.apply_s(op.positions()[0]);
-            break;
-        case Opname::sdg:
-            state.apply_sdag(op.positions()[0]);
-            break;
-        case Opname::t:
-            state.apply_t(op.positions()[0]);
-            break;
-        case Opname::tdg:
-            state.apply_tdag(op.positions()[0]);
-            break;
-        case Opname::p:
-            state.apply_p(op.positions()[0], op.paras()[0]);
-            break;
-        case Opname::rx:
-            state.apply_rx(op.positions()[0], op.paras()[0]);
-            break;
-        case Opname::ry:
-            state.apply_ry(op.positions()[0], op.paras()[0]);
-            break;
-        case Opname::rz:
-            state.apply_rz(op.positions()[0], op.paras()[0]);
-            break;
-        case Opname::cx:
-            state.apply_cnot(op.positions()[0], op.positions()[1]);
-            break;
-        case Opname::cnot:
-            state.apply_cnot(op.positions()[0], op.positions()[1]);
-            break;
-        case Opname::cp:
-            state.apply_cp(op.positions()[0], op.positions()[1], op.paras()[0]);
-            break;
-        case Opname::cz:
-            state.apply_cz(op.positions()[0], op.positions()[1]);
-            break;
-        case Opname::ccx:
-            state.apply_ccx(op.positions()[0], op.positions()[1],  op.positions()[2]);
-            break;
-        case Opname::toffoli:
-            state.apply_ccx(op.positions()[0], op.positions()[1],  op.positions()[2]);
-        case Opname::rzz:
-            state.apply_cnot(op.positions()[0], op.positions()[1]);
-            state.apply_rz(op.positions()[1], op.paras()[0]);
-            state.apply_cnot(op.positions()[0], op.positions()[1]);
-            break;
-        case Opname::measure:
-            state.apply_measure(op.qbits(), op.cbits());
-            break;
-        case Opname::reset:
-            state.apply_reset(op.qbits());
-            break;
-        case Opname::cif:
-            // check cbits and condition
-            matched = state.check_cif(op.cbits(), op.condition());
-            // apply op in instructions
-            if(matched){
-                for(auto op_h :op.instructions()){
-                    apply_op(op_h, state);
-                }
-            }
-            break;
-        //Other general gate
-        default:
-        {
-            if (op.targe_num() == 1){
-                auto mat_temp = op.mat();
-                complex<double> *mat = mat_temp.data();
-                if (op.control_num() == 0){
-                    state.apply_one_targe_gate_general<0>(op.positions(), mat);
-                }else if (op.control_num() == 1){
-                    state.apply_one_targe_gate_general<1>(op.positions(), mat);
-                }else{
-                    state.apply_one_targe_gate_general<2>(op.positions(), mat);
-                }
-            }else if(op.targe_num() > 1){
-                state.apply_multi_targe_gate_general(op.positions(), op.control_num(), op.mat());
-            }else{
-                throw "Invalid target number";
-            }
-        }
+// Applies one circuit instruction to `state`: named gates dispatch to the
+// matching StateVector method, anything else goes through the general matrix
+// path. Throws the C-string "Invalid target number" for a gate with no target.
+void apply_op(QuantumOperator& op, StateVector<data_t>& state) {
+  bool matched = false;
+  switch (OPMAP[op.name()]) {
+    // Named gates: each maps to a dedicated StateVector method
+  case Opname::x:
+    state.apply_x(op.positions()[0]);
+    break;
+  case Opname::y:
+    state.apply_y(op.positions()[0]);
+    break;
+  case Opname::z:
+    state.apply_z(op.positions()[0]);
+    break;
+  case Opname::h:
+    state.apply_h(op.positions()[0]);
+    break;
+  case Opname::s:
+    state.apply_s(op.positions()[0]);
+    break;
+  case Opname::sdg:
+    state.apply_sdag(op.positions()[0]);
+    break;
+  case Opname::t:
+    state.apply_t(op.positions()[0]);
+    break;
+  case Opname::tdg:
+    state.apply_tdag(op.positions()[0]);
+    break;
+  case Opname::p:
+    state.apply_p(op.positions()[0], op.paras()[0]);
+    break;
+  case Opname::rx:
+    state.apply_rx(op.positions()[0], op.paras()[0]);
+    break;
+  case Opname::ry:
+    state.apply_ry(op.positions()[0], op.paras()[0]);
+    break;
+  case Opname::rz:
+    state.apply_rz(op.positions()[0], op.paras()[0]);
+    break;
+  case Opname::cx:
+  case Opname::cnot: // alias of cx
+    state.apply_cnot(op.positions()[0], op.positions()[1]);
+    break;
+  case Opname::cp:
+    state.apply_cp(op.positions()[0], op.positions()[1], op.paras()[0]);
+    break;
+  case Opname::cz:
+    state.apply_cz(op.positions()[0], op.positions()[1]);
+    break;
+  case Opname::ccx:
+  case Opname::toffoli: // alias of ccx; must not fall through into rzz
+    state.apply_ccx(op.positions()[0], op.positions()[1], op.positions()[2]);
+    break;
+  case Opname::rzz: // decomposed as CNOT, RZ on positions()[1], CNOT
+    state.apply_cnot(op.positions()[0], op.positions()[1]);
+    state.apply_rz(op.positions()[1], op.paras()[0]);
+    state.apply_cnot(op.positions()[0], op.positions()[1]);
+    break;
+  case Opname::measure:
+    state.apply_measure(op.qbits(), op.cbits());
+    break;
+  case Opname::reset:
+    state.apply_reset(op.qbits());
+    break;
+  case Opname::cif:
+    // compare the classical register bits against the stored condition
+    matched = state.check_cif(op.cbits(), op.condition());
+    // on a match, apply every nested instruction in order
+    if (matched) {
+      for (auto op_h : op.instructions()) {
+        apply_op(op_h, state);
+      }
     }
+    break;
+  // Any other gate: apply its matrix (1 target: 0, 1 or 2+ controls)
+  default: {
+    if (op.targe_num() == 1) {
+      auto mat_temp = op.mat();
+      complex<double>* mat = mat_temp.data();
+      if (op.control_num() == 0) {
+        state.apply_one_targe_gate_general<0>(op.positions(), mat);
+      } else if (op.control_num() == 1) {
+        state.apply_one_targe_gate_general<1>(op.positions(), mat);
+      } else {
+        state.apply_one_targe_gate_general<2>(op.positions(), mat);
+      }
+    } else if (op.targe_num() > 1) {
+      state.apply_multi_targe_gate_general(op.positions(), op.control_num(),
+                                           op.mat());
+    } else {
+      throw "Invalid target number";
+    }
+  }
+  }
+}
+
+// Sets qubit/cbit counts on `state`, then applies the circuit in order.
+void simulate(Circuit const& circuit, StateVector<data_t>& state) {
+  state.set_num(circuit.qubit_num());
+  state.set_creg(circuit.cbit_num());
+  // Final measurements are skipped here; qfvm.cpp handles them.
+  const bool skip_measure = circuit.final_measure();
+  for (auto op : circuit.instructions()) {
+    if (!skip_measure || op.name() != "measure")
+      apply_op(op, state);
+  }
+}
+
+// Sends one "measure" per qubit; cbits[i] rides along as the parameter.
+template <size_t word_size>
+void apply_measure(circuit_simulator<word_size>& cs, const vector<pos_t>& qbits,
+                   const vector<pos_t>& cbits) {
+  for (size_t i = 0; i < qbits.size(); i++)
+    cs.do_circuit_instruction(
+        {"measure", std::vector<size_t>{qbits[i]},
+         std::vector<double>{static_cast<double>(cbits[i])}});
 }
 
-void simulate(Circuit const& circuit, StateVector<data_t> & state){
-    state.set_num(circuit.qubit_num());
-    state.set_creg(circuit.cbit_num());
-    // skip measure and handle it in qfvm.cpp 
-    bool skip_measure = circuit.final_measure();
-    for (auto op : circuit.instructions()){
-        if(skip_measure == true && op.name() == "measure") continue;
-        apply_op(op , state);
+// Clifford backend: forwards `op` to `cs` by name. TODO: support gate args.
+template <size_t word_size>
+void apply_op(QuantumOperator& op, circuit_simulator<word_size>& cs) {
+  switch (OPMAP[op.name()]) {
+  case Opname::measure:
+    apply_measure(cs, op.qbits(), op.cbits());
+    break;
+  case Opname::reset: // one "reset" instruction per listed qubit
+    for (auto qubit : op.qbits()) {
+      cs.do_circuit_instruction(
+          {"reset", std::vector<size_t>{static_cast<size_t>(qubit)}});
     }
+    break;
+  default: // every other gate: name and positions only, no parameters
+    auto qubits = op.positions();
+    cs.do_circuit_instruction(
+        {op.name(), std::vector<size_t>(qubits.begin(), qubits.end())});
+  }
 }
 
+// Replays `circuit` on the Clifford simulator `cs`, one instruction at a time.
+template <size_t word_size>
+void simulate(Circuit const& circuit, circuit_simulator<word_size>& cs) {
+  // Final measurements are skipped here; qfvm.cpp handles them.
+  const bool skip_measure = circuit.final_measure();
+  for (auto op : circuit.instructions()) {
+    if (!skip_measure || op.name() != "measure")
+      apply_op(op, cs);
+  }
+}
diff --git a/src/qfvm/statevector.hpp b/src/qfvm/statevector.hpp
index 99cf5f7..30c11fc 100644
--- a/src/qfvm/statevector.hpp
+++ b/src/qfvm/statevector.hpp
@@ -1,13 +1,13 @@
 #pragma once
 #include "types.hpp"
 #include "util.h"
-#include <stdlib.h>
-#include <iostream>
+#include <algorithm>
 #include <cmath>
-#include <omp.h>
 #include <functional>
-#include <algorithm>
+#include <iostream>
+#include <omp.h>
 #include <random>
+#include <stdlib.h>
 #ifdef USE_SIMD
 #ifdef _MSC_VER
 #include <intrin.h>
@@ -16,1029 +16,1004 @@
 #endif
 #endif
 
-template <class real_t = double>
-class StateVector{
-    private:
-        uint num_;
-         // classical bit
-        uint cbit_num_;  
-        vector<uint> creg_;
-        size_t size_;
-        std::unique_ptr<complex<real_t>[]> data_;
-        //random engine
-        std::mt19937_64 rng_;
-
-    public:
-        //construct function
-        StateVector();
-        explicit StateVector(uint num);
-        explicit StateVector(complex<real_t> *data, size_t data_size);
-        //move assign
-        // StateVector& operator=(StateVector&& other){
-        //     if(this != &other){
-        //         data_ = std::move(other.data_);
-        //         creg_ = std::move(other.creg_);
-        //         num_ = other.num_;
-        //         cbit_num_ = other.cbit_num_;
-        //         size_ = other.size_;
-
-        //     }
-        //     return *this;
-        // }
-
-        //Named gate function
-        void apply_x(pos_t pos);
-        void apply_y(pos_t pos);
-        void apply_z(pos_t pos);
-        void apply_h(pos_t pos);
-        void apply_s(pos_t pos);
-        void apply_sdag(pos_t pos);
-        void apply_t(pos_t pos);
-        void apply_tdag(pos_t pos);
-        void apply_p(pos_t pos, real_t phase);
-        void apply_rx(pos_t pos, real_t theta);
-        void apply_ry(pos_t pos, real_t theta);
-        void apply_rz(pos_t pos, real_t theta);
-        void apply_cnot(pos_t control, pos_t targe);
-        void apply_cz(pos_t control, pos_t targe);
-        void apply_cp(pos_t control, pos_t targe, real_t phase);
-        void apply_crx(pos_t control, pos_t targe,  real_t theta);
-        void apply_cry(pos_t control, pos_t targe,  real_t theta);
-        void apply_ccx(pos_t control1, pos_t control2, pos_t targe);
-        void apply_swap(pos_t q1, pos_t q2);
-
-        //General implementation
-        //One-target gate, ctrl_num equal 2 represent multi-controlled gate
-        template<int ctrl_num>
-        void apply_one_targe_gate_general(vector<pos_t> const& posv, complex<double> *mat);
-        template<int ctrl_num>
-        void apply_one_targe_gate_diag(vector<pos_t> const& posv, complex<double> *mat);
-        template<int ctrl_num>
-        void apply_one_targe_gate_real(vector<pos_t> const& posv, complex<double> *mat);
-        template<int ctrl_num>
-        void apply_one_targe_gate_x(vector<pos_t> const& posv);
-
-        //Multiple-target gate
-        void apply_multi_targe_gate_general(vector<pos_t> const& posv, uint control_num, RowMatrixXcd const&mat);
-
-        // Measure and Reset
-        std::pair<uint, double> sample_measure_probs(vector<pos_t> const& qbits);
-        vector<double> probabilities() const;
-        void apply_diagonal_matrix(vector<pos_t> const& qbits, vector<std::complex<double> > const& mdiag);
-        void update(vector<pos_t> const& qbits, const uint final_state, const uint meas_state, const double meas_prob);
-        void apply_measure(vector<pos_t> const& qbits,const vector<pos_t> &cbits);
-        void apply_reset(vector<pos_t> const& qbits);
-
-        // cif check
-        bool check_cif(const vector<pos_t> &cbits, const uint condition);
-
-        complex<real_t> operator[] (size_t j) const ;
-        void set_num(uint num);
-        void set_creg(uint num){
-            if(num > 0){
-                cbit_num_ = num;
-                creg_.resize(cbit_num_, 0);
-            }else{
-                throw std::logic_error("The number of cbit must be positive.");
-            }
-        }
+template <class real_t = double> class StateVector {
+private:
+  uint num_;
+  // classical bit
+  uint cbit_num_;
+  vector<uint> creg_;
+  size_t size_;
+  std::unique_ptr<complex<real_t>[]> data_;
+  // random engine
+  std::mt19937_64 rng_;
+
+public:
+  // Constructors
+  StateVector();
+  explicit StateVector(uint num);
+  explicit StateVector(complex<real_t>* data, size_t data_size);
+  // move assign
+  //  StateVector& operator=(StateVector&& other){
+  //      if(this != &other){
+  //          data_ = std::move(other.data_);
+  //          creg_ = std::move(other.creg_);
+  //          num_ = other.num_;
+  //          cbit_num_ = other.cbit_num_;
+  //          size_ = other.size_;
+
+  //     }
+  //     return *this;
+  // }
+
+  // Named gate function
+  void apply_x(pos_t pos);
+  void apply_y(pos_t pos);
+  void apply_z(pos_t pos);
+  void apply_h(pos_t pos);
+  void apply_s(pos_t pos);
+  void apply_sdag(pos_t pos);
+  void apply_t(pos_t pos);
+  void apply_tdag(pos_t pos);
+  void apply_p(pos_t pos, real_t phase);
+  void apply_rx(pos_t pos, real_t theta);
+  void apply_ry(pos_t pos, real_t theta);
+  void apply_rz(pos_t pos, real_t theta);
+  void apply_cnot(pos_t control, pos_t targe);
+  void apply_cz(pos_t control, pos_t targe);
+  void apply_cp(pos_t control, pos_t targe, real_t phase);
+  void apply_crx(pos_t control, pos_t targe, real_t theta);
+  void apply_cry(pos_t control, pos_t targe, real_t theta);
+  void apply_ccx(pos_t control1, pos_t control2, pos_t targe);
+  void apply_swap(pos_t q1, pos_t q2);
+
+  // General implementation
+  // One-target gates; ctrl_num == 2 stands for the multi-controlled case
+  template <int ctrl_num>
+  void apply_one_targe_gate_general(vector<pos_t> const& posv,
+                                    complex<double>* mat);
+  template <int ctrl_num>
+  void apply_one_targe_gate_diag(vector<pos_t> const& posv,
+                                 complex<double>* mat);
+  template <int ctrl_num>
+  void apply_one_targe_gate_real(vector<pos_t> const& posv,
+                                 complex<double>* mat);
+  template <int ctrl_num>
+  void apply_one_targe_gate_x(vector<pos_t> const& posv);
+
+  // Multiple-target gate
+  void apply_multi_targe_gate_general(vector<pos_t> const& posv,
+                                      uint control_num,
+                                      RowMatrixXcd const& mat);
+
+  // Measure and Reset
+  std::pair<uint, double> sample_measure_probs(vector<pos_t> const& qbits);
+  vector<double> probabilities() const;
+  void apply_diagonal_matrix(vector<pos_t> const& qbits,
+                             vector<std::complex<double>> const& mdiag);
+  void update(vector<pos_t> const& qbits, const uint final_state,
+              const uint meas_state, const double meas_prob);
+  void apply_measure(vector<pos_t> const& qbits, const vector<pos_t>& cbits);
+  void apply_reset(vector<pos_t> const& qbits);
+
+  // cif check
+  bool check_cif(const vector<pos_t>& cbits, const uint condition);
+
+  complex<real_t> operator[](size_t j) const;
+  void set_num(uint num);
+  void set_creg(uint num) {
+    if (num > 0) {
+      cbit_num_ = num;
+      creg_.resize(cbit_num_, 0);
+    } else {
+      throw std::logic_error("The number of cbit must be positive.");
+    }
+  }
 
-        vector<uint> creg(){
-            return creg_;
-        }
+  vector<uint> creg() { return creg_; }
 
-        void set_rng(){
-            std::random_device rd;
-            rng_.seed(rd());
-        }
+  void set_rng() {
+    std::random_device rd;
+    rng_.seed(rd());
+  }
 
-        void print_state();
-        std::tuple<std::complex<real_t>*, size_t> move_data_to_python() {
-            auto data_ptr = data_.release();
-            return std::make_tuple(std::move(data_ptr), size_);
-        }
-        
-        complex<real_t>* data(){ return data_.get(); }
-        size_t size(){ return size_; }
-        uint num(){ return num_; }
-        uint cbit_num(){ return cbit_num_; }
-};
+  void print_state();
+  std::tuple<std::complex<real_t>*, size_t> move_data_to_python() {
+    auto data_ptr = data_.release();
+    return std::make_tuple(std::move(data_ptr), size_);
+  }
 
+  complex<real_t>* data() { return data_.get(); }
+  size_t size() { return size_; }
+  uint num() { return num_; }
+  uint cbit_num() { return cbit_num_; }
+};
 
 //////// constructors ///////
 
 template <class real_t>
-StateVector<real_t>::StateVector(uint num)
-: num_(num),
-size_(1ULL<<num)
-{   data_ = std::make_unique<complex<real_t>[]>(size_);
-    data_[0] = complex<real_t>(1., 0);
+StateVector<real_t>::StateVector(uint num) : num_(num), size_(1ULL << num) {
+  data_ = std::make_unique<complex<real_t>[]>(size_);
+  data_[0] = complex<real_t>(1., 0);
 };
 
-template <class real_t>
-StateVector<real_t>::StateVector() : StateVector(0){ }
+template <class real_t> StateVector<real_t>::StateVector() : StateVector(0) {}
 
 template <class real_t>
-StateVector<real_t>::StateVector(complex<real_t> *data, size_t data_size)
-:
-data_(data),
-size_(data_size)
-{   
-    num_ = static_cast<int>(std::log2(size_));
+StateVector<real_t>::StateVector(complex<real_t>* data, size_t data_size)
+    : data_(data), size_(data_size) {
+  num_ = static_cast<int>(std::log2(size_));
 }
 
-
-
 //// useful functions /////
 template <class real_t>
-std::complex<real_t> StateVector<real_t>::operator[] (size_t j) const{
-    return data_[j];
+std::complex<real_t> StateVector<real_t>::operator[](size_t j) const {
+  return data_[j];
 }
 
-template <class real_t>
-void StateVector<real_t>::set_num(uint num){
-    if (num_ > 0) {
-        // Initialized from statevector,
-        // should not resize
-        return;
-    }
-    num_ = num;
+template <class real_t> void StateVector<real_t>::set_num(uint num) {
+  if (num_ > 0) {
+    // Initialized from statevector,
+    // should not resize
+    return;
+  }
+  num_ = num;
 
-    if (size_ != 1ULL << num) {
-        data_.reset();
-        size_ = 1ULL << num;
-        data_ = std::make_unique<complex<real_t>[]>(size_);
-        data_[0] = complex<real_t>(1, 0);
-    }
+  if (size_ != 1ULL << num) {
+    data_.reset();
+    size_ = 1ULL << num;
+    data_ = std::make_unique<complex<real_t>[]>(size_);
+    data_[0] = complex<real_t>(1, 0);
+  }
 }
 template <class real_t>
-bool StateVector<real_t>::check_cif(const vector<pos_t> &cbits, const uint condition){
-    uint out = 0;
-    for(uint i = 0; i < cbits.size(); i++){
-        out *= 2;
-        out += creg_[cbits[i]];
-    }
-    return out == condition; 
+bool StateVector<real_t>::check_cif(const vector<pos_t>& cbits,
+                                    const uint condition) {
+  uint out = 0;
+  for (uint i = 0; i < cbits.size(); i++) {
+    out *= 2;
+    out += creg_[cbits[i]];
+  }
+  return out == condition;
 }
 
-template <class real_t>
-void StateVector<real_t>::print_state(){
-    std::cout << "state_data: ";
-    for (auto i=0;i<size_;i++){
-        std::cout << data_[i] << " ";
-    }
-    std::cout << std::endl;
+// Prints every amplitude to stdout on one line, prefixed by "state_data: ".
+template <class real_t> void StateVector<real_t>::print_state() {
+  std::cout << "state_data: ";
+  for (size_t i = 0; i < size_; i++) // size_t index: size_ may exceed INT_MAX
+    std::cout << data_[i] << " ";
+  std::cout << std::endl;
+}
 
-
 ////// apply gate ////////
 
-template <class real_t>
-void StateVector<real_t>::apply_x(pos_t pos){
-    const size_t offset = 1<<pos;
-    const size_t rsize = size_>>1;
-     if (pos == 0){ //single step
+template <class real_t> void StateVector<real_t>::apply_x(pos_t pos) {
+  const size_t offset = 1ULL << pos; // 64-bit shift: int overflows at pos >= 31
+  const size_t rsize = size_ >> 1;
+  if (pos == 0) { // single step
 #ifdef USE_SIMD
 #pragma omp parallel for
-         for(omp_i j = 0;j < size_;j+=2){
-             double* ptr = (double*)(data_.get() + j);
-            __m256d data = _mm256_loadu_pd(ptr);
-            data = _mm256_permute4x64_pd(data, 78);
-            _mm256_storeu_pd(ptr, data);
-         }
+    for (omp_i j = 0; j < size_; j += 2) {
+      double* ptr = (double*)(data_.get() + j);
+      __m256d data = _mm256_loadu_pd(ptr);
+      data = _mm256_permute4x64_pd(data, 78);
+      _mm256_storeu_pd(ptr, data);
+    }
 #else
 #pragma omp parallel for
-            for(omp_i j = 0;j < size_;j+=2){
-                std::swap(data_[j], data_[j+1]);
-            }
+    for (omp_i j = 0; j < size_; j += 2) {
+      std::swap(data_[j], data_[j + 1]);
+    }
 #endif
-     }
-     else{
+  } else {
 #ifdef USE_SIMD
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-            double* ptr0 = (double*)(data_.get()+ i);
-            double* ptr1 = (double*)(data_.get() + i + offset);
-            __m256d data0 = _mm256_loadu_pd(ptr0);
-            __m256d data1 = _mm256_loadu_pd(ptr1);
-            _mm256_storeu_pd(ptr1, data0);
-            _mm256_storeu_pd(ptr0, data1);
-        }
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = (j & (offset - 1)) | (j >> pos << pos << 1);
+      double* ptr0 = (double*)(data_.get() + i);
+      double* ptr1 = (double*)(data_.get() + i + offset);
+      __m256d data0 = _mm256_loadu_pd(ptr0);
+      __m256d data1 = _mm256_loadu_pd(ptr1);
+      _mm256_storeu_pd(ptr1, data0);
+      _mm256_storeu_pd(ptr0, data1);
+    }
 #else
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-            size_t i1 = i+1;
-            std::swap(data_[i], data_[i+offset]);
-            std::swap(data_[i1], data_[i1+offset]);
-        }
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = (j & (offset - 1)) | (j >> pos << pos << 1);
+      size_t i1 = i + 1;
+      std::swap(data_[i], data_[i + offset]);
+      std::swap(data_[i1], data_[i1 + offset]);
+    }
 #endif
-     }
+  }
 }
 
-template <class real_t>
-void StateVector<real_t>::apply_y(pos_t pos){
-    const size_t offset = 1<<pos;
-    const size_t rsize = size_>>1;
-    const complex<real_t> im = imag_I;
-     if (pos == 0){ //single step
+template <class real_t> void StateVector<real_t>::apply_y(pos_t pos) {
+  const size_t offset = 1ULL << pos; // 64-bit shift: int overflows at pos >= 31
+  const size_t rsize = size_ >> 1;
+  const complex<real_t> im = imag_I;
+  if (pos == 0) { // single step
 #ifdef USE_SIMD
-        __m256d minus_half = _mm256_set_pd(1, -1, -1, 1);
+    __m256d minus_half = _mm256_set_pd(1, -1, -1, 1);
 #pragma omp parallel for
-         for(omp_i j = 0;j < size_;j+=2){
-             double* ptr = (double*)(data_.get() + j);
-            __m256d data = _mm256_loadu_pd(ptr);
-            data = _mm256_permute4x64_pd(data, 27);
-            data = _mm256_mul_pd(data, minus_half);
-            _mm256_storeu_pd(ptr, data);
-         }
+    for (omp_i j = 0; j < size_; j += 2) {
+      double* ptr = (double*)(data_.get() + j);
+      __m256d data = _mm256_loadu_pd(ptr);
+      data = _mm256_permute4x64_pd(data, 27);
+      data = _mm256_mul_pd(data, minus_half);
+      _mm256_storeu_pd(ptr, data);
+    }
 #else
 #pragma omp parallel for
-        for(omp_i j = 0;j < size_;j+=2){
-            complex<real_t> temp = data_[j];
-            data_[j] = -im*data_[j+1];
-            data_[j+1] = im*temp;
-        }
+    for (omp_i j = 0; j < size_; j += 2) {
+      complex<real_t> temp = data_[j];
+      data_[j] = -im * data_[j + 1];
+      data_[j + 1] = im * temp;
+    }
 #endif
-     }
-     else{
+  } else {
 #ifdef USE_SIMD
-        __m256d minus_even = _mm256_set_pd(1, -1, 1, -1);
-        __m256d minus_odd = _mm256_set_pd(-1, 1, -1, 1);
+    __m256d minus_even = _mm256_set_pd(1, -1, 1, -1);
+    __m256d minus_odd = _mm256_set_pd(-1, 1, -1, 1);
 
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-
-            double* ptr0 = (double*)(data_.get() + i);
-            double* ptr1 = (double*)(data_.get() + i + offset);
-            __m256d data0 = _mm256_loadu_pd(ptr0);
-            __m256d data1 = _mm256_loadu_pd(ptr1);
-            data0 = _mm256_permute_pd(data0, 5);
-            data1 = _mm256_permute_pd(data1, 5);
-            data0 = _mm256_mul_pd(data0, minus_even);
-            data1 = _mm256_mul_pd(data1, minus_odd);
-            _mm256_storeu_pd(ptr1, data0);
-            _mm256_storeu_pd(ptr0, data1);
-        }
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = (j & (offset - 1)) | (j >> pos << pos << 1);
+
+      double* ptr0 = (double*)(data_.get() + i);
+      double* ptr1 = (double*)(data_.get() + i + offset);
+      __m256d data0 = _mm256_loadu_pd(ptr0);
+      __m256d data1 = _mm256_loadu_pd(ptr1);
+      data0 = _mm256_permute_pd(data0, 5);
+      data1 = _mm256_permute_pd(data1, 5);
+      data0 = _mm256_mul_pd(data0, minus_even);
+      data1 = _mm256_mul_pd(data1, minus_odd);
+      _mm256_storeu_pd(ptr1, data0);
+      _mm256_storeu_pd(ptr0, data1);
+    }
 #else
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-            size_t i1 = i+1;
-            complex<real_t> temp = data_[i];
-            data_[i] = -im*data_[i+offset];
-            data_[i+offset] = im*temp;
-            complex<real_t> temp1 = data_[i1];
-            data_[i1] = -im*data_[i1+offset];
-            data_[i1+offset] = im*temp1;
-        }
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = (j & (offset - 1)) | (j >> pos << pos << 1);
+      size_t i1 = i + 1;
+      complex<real_t> temp = data_[i];
+      data_[i] = -im * data_[i + offset];
+      data_[i + offset] = im * temp;
+      complex<real_t> temp1 = data_[i1];
+      data_[i1] = -im * data_[i1 + offset];
+      data_[i1 + offset] = im * temp1;
+    }
 #endif
-     }
+  }
 }
 
-template <class real_t>
-void StateVector<real_t>::apply_z(pos_t pos){
-    const size_t offset = 1<<pos;
-    const size_t rsize = size_>>1;
-    if (pos == 0){ //single step
+template <class real_t> void StateVector<real_t>::apply_z(pos_t pos) {
+  const size_t offset = 1ULL << pos; // 64-bit shift: int overflows at pos >= 31
+  const size_t rsize = size_ >> 1;
+  if (pos == 0) { // single step
 #pragma omp parallel for
-        for(omp_i j = 1;j < size_;j+=2){
-            data_[j] *= -1;
-        }
-     }
-     else{
+    for (omp_i j = 1; j < size_; j += 2) {
+      data_[j] *= -1;
+    }
+  } else {
 #ifdef USE_SIMD
-        __m256d minus_one = _mm256_set_pd(-1, -1, -1, -1);
+    __m256d minus_one = _mm256_set_pd(-1, -1, -1, -1);
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-            double* ptr1 = (double*)(data_.get() + i + offset);
-            __m256d data1 = _mm256_loadu_pd(ptr1);
-            data1 = _mm256_mul_pd(data1, minus_one);
-            _mm256_storeu_pd(ptr1, data1);
-        }
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = (j & (offset - 1)) | (j >> pos << pos << 1);
+      double* ptr1 = (double*)(data_.get() + i + offset);
+      __m256d data1 = _mm256_loadu_pd(ptr1);
+      data1 = _mm256_mul_pd(data1, minus_one);
+      _mm256_storeu_pd(ptr1, data1);
+    }
 #else
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = (j&(offset-1)) | (j>>pos<<pos<<1);
-            data_[i+offset] *= -1;
-            data_[i+offset+1] *= -1;
-        }
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = (j & (offset - 1)) | (j >> pos << pos << 1);
+      data_[i + offset] *= -1;
+      data_[i + offset + 1] *= -1;
+    }
 #endif
-     }
+  }
 }
 
-template <class real_t>
-void StateVector<real_t>::apply_h(pos_t pos){
-    const double sqrt2inv = 1. / std::sqrt(2.);
-    complex<double> mat[4] = {sqrt2inv, sqrt2inv, sqrt2inv, -sqrt2inv};
-    apply_one_targe_gate_real<0>(vector<pos_t>{pos}, mat);
-
+template <class real_t> void StateVector<real_t>::apply_h(pos_t pos) {
+  const double sqrt2inv = 1. / std::sqrt(2.);
+  complex<double> mat[4] = {sqrt2inv, sqrt2inv, sqrt2inv, -sqrt2inv};
+  apply_one_targe_gate_real<0>(vector<pos_t>{pos}, mat);
 }
 
-template <class real_t>
-void StateVector<real_t>::apply_s(pos_t pos){
-    complex<double> mat[2] = {1., imag_I};
-    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
+template <class real_t> void StateVector<real_t>::apply_s(pos_t pos) {
+  complex<double> mat[2] = {1., imag_I};
+  apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
 }
 
-template <class real_t>
-void StateVector<real_t>::apply_sdag(pos_t pos){
-    complex<double> mat[2] = {1., -imag_I};
-    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
-
+template <class real_t> void StateVector<real_t>::apply_sdag(pos_t pos) {
+  complex<double> mat[2] = {1., -imag_I};
+  apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
 }
 
-template <class real_t>
-void StateVector<real_t>::apply_t(pos_t pos){
-    complex<double> p = imag_I*PI/4.;
-    complex<double> mat[2] = {1., std::exp(p)};
-   apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
-
+template <class real_t> void StateVector<real_t>::apply_t(pos_t pos) {
+  complex<double> p = imag_I * PI / 4.;
+  complex<double> mat[2] = {1., std::exp(p)};
+  apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
 }
 
-template <class real_t>
-void StateVector<real_t>::apply_tdag(pos_t pos){
-    complex<double> p = -imag_I*PI/4.;
-    complex<double> mat[2] = {1., std::exp(p)};
-    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
-
+template <class real_t> void StateVector<real_t>::apply_tdag(pos_t pos) {
+  complex<double> p = -imag_I * PI / 4.;
+  complex<double> mat[2] = {1., std::exp(p)};
+  apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_p(pos_t pos, real_t phase){
-    complex<double> p = imag_I*phase;
-    complex<double> mat[2] = {1., std::exp(p)};
-    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
+void StateVector<real_t>::apply_p(pos_t pos, real_t phase) {
+  complex<double> p = imag_I * phase;
+  complex<double> mat[2] = {1., std::exp(p)};
+  apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
 }
 
-
 template <class real_t>
-void StateVector<real_t>::apply_rx(pos_t pos, real_t theta){
-    complex<double> mat[4] = {std::cos(theta/2), -imag_I*std::sin(theta/2), -imag_I*std::sin(theta/2), std::cos(theta/2)};
-    apply_one_targe_gate_general<0>(vector<pos_t>{pos}, mat);
+void StateVector<real_t>::apply_rx(pos_t pos, real_t theta) {
+  complex<double> mat[4] = {std::cos(theta / 2), -imag_I * std::sin(theta / 2),
+                            -imag_I * std::sin(theta / 2), std::cos(theta / 2)};
+  apply_one_targe_gate_general<0>(vector<pos_t>{pos}, mat);
 }
 
-
 template <class real_t>
-void StateVector<real_t>::apply_ry(pos_t pos, real_t theta){
-    complex<double> mat[4] = {std::cos(theta/2), -std::sin(theta/2),std::sin(theta/2), std::cos(theta/2)};
-    apply_one_targe_gate_real<0>(vector<pos_t>{pos}, mat);
+void StateVector<real_t>::apply_ry(pos_t pos, real_t theta) {
+  complex<double> mat[4] = {std::cos(theta / 2), -std::sin(theta / 2),
+                            std::sin(theta / 2), std::cos(theta / 2)};
+  apply_one_targe_gate_real<0>(vector<pos_t>{pos}, mat);
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_rz(pos_t pos, real_t theta){
-    complex<double> z0 = -imag_I*theta/2.;
-    complex<double> z1 = imag_I*theta/2.;
-    complex<double> mat[2] = {std::exp(z0), std::exp(z1)};
-    apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
+void StateVector<real_t>::apply_rz(pos_t pos, real_t theta) {
+  complex<double> z0 = -imag_I * theta / 2.;
+  complex<double> z1 = imag_I * theta / 2.;
+  complex<double> mat[2] = {std::exp(z0), std::exp(z1)};
+  apply_one_targe_gate_diag<0>(vector<pos_t>{pos}, mat);
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_cnot(pos_t control, pos_t targe){
-    apply_one_targe_gate_x<1>(vector<pos_t>{control, targe});
+void StateVector<real_t>::apply_cnot(pos_t control, pos_t targe) {
+  apply_one_targe_gate_x<1>(vector<pos_t>{control, targe});
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_cz(pos_t control, pos_t targe){
-    complex<double> mat[2] = {1., -1.};
-    apply_one_targe_gate_diag<1>(vector<pos_t>{control, targe}, mat);
+void StateVector<real_t>::apply_cz(pos_t control, pos_t targe) {
+  complex<double> mat[2] = {1., -1.};
+  apply_one_targe_gate_diag<1>(vector<pos_t>{control, targe}, mat);
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_cp(pos_t control, pos_t targe, real_t phase){
-    complex<double> p = imag_I*phase;
-    complex<double> mat[2] = {1., std::exp(p)};
-    apply_one_targe_gate_diag<1>(vector<pos_t>{control, targe}, mat);
+void StateVector<real_t>::apply_cp(pos_t control, pos_t targe, real_t phase) {
+  complex<double> p = imag_I * phase;
+  complex<double> mat[2] = {1., std::exp(p)};
+  apply_one_targe_gate_diag<1>(vector<pos_t>{control, targe}, mat);
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_crx(pos_t control, pos_t targe,  real_t theta){
-    complex<double> mat[4] = {std::cos(theta/2), -imag_I*std::sin(theta/2), -imag_I*std::sin(theta/2), std::cos(theta/2)};
+void StateVector<real_t>::apply_crx(pos_t control, pos_t targe, real_t theta) {
+  complex<double> mat[4] = {std::cos(theta / 2), -imag_I * std::sin(theta / 2),
+                            -imag_I * std::sin(theta / 2), std::cos(theta / 2)};
 
-    apply_one_targe_gate_general<1>(vector<pos_t>{control, targe}, mat);
+  apply_one_targe_gate_general<1>(vector<pos_t>{control, targe}, mat);
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_cry(pos_t control, pos_t targe,  real_t theta){
-     complex<double> mat[4] = {std::cos(theta/2), -std::sin(theta/2),std::sin(theta/2), std::cos(theta/2)};
+void StateVector<real_t>::apply_cry(pos_t control, pos_t targe, real_t theta) {
+  complex<double> mat[4] = {std::cos(theta / 2), -std::sin(theta / 2),
+                            std::sin(theta / 2), std::cos(theta / 2)};
 
-    apply_one_targe_gate_real<1>(vector<pos_t>{control, targe}, mat);
+  apply_one_targe_gate_real<1>(vector<pos_t>{control, targe}, mat);
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_ccx(pos_t control1, pos_t control2, pos_t targe){
-    apply_one_targe_gate_x<2>(vector<pos_t>{control1, control2, targe});
+void StateVector<real_t>::apply_ccx(pos_t control1, pos_t control2,
+                                    pos_t targe) {
+  apply_one_targe_gate_x<2>(vector<pos_t>{control1, control2, targe});
 }
 
 /////// General implementation /////////
 
 template <class real_t>
 template <int ctrl_num>
-void StateVector<real_t>::apply_one_targe_gate_general(vector<pos_t> const& posv, complex<double> *mat)
-{
-    std::function<size_t(size_t)> getind_func_near;
-    std::function<size_t(size_t)> getind_func;
-    size_t rsize;
-    size_t offset;
-    size_t targe;
-    size_t control = 0;
-    size_t setbit;
-    size_t poffset;
-    bool has_control=false;
-    vector<pos_t> posv_sorted = posv;
-    if (ctrl_num == 0){
-        targe = posv[0];
-        offset = 1ll<<targe;
-        rsize = size_>>1;
-        getind_func_near = [&](size_t j)-> size_t {
-            return 2*j;
-        };
-
-        getind_func = [&](size_t j)-> size_t {
-            return (j&(offset-1)) | (j>>targe<<targe<<1);
-        };
+void StateVector<real_t>::apply_one_targe_gate_general(
+    vector<pos_t> const& posv, complex<double>* mat) {
+  std::function<size_t(size_t)> getind_func_near;
+  std::function<size_t(size_t)> getind_func;
+  size_t rsize;
+  size_t offset;
+  size_t targe;
+  size_t control = 0;
+  size_t setbit;
+  size_t poffset;
+  bool has_control = false;
+  vector<pos_t> posv_sorted = posv;
+  if (ctrl_num == 0) {
+    targe = posv[0];
+    offset = 1ll << targe;
+    rsize = size_ >> 1;
+    getind_func_near = [&](size_t j) -> size_t { return 2 * j; };
+
+    getind_func = [&](size_t j) -> size_t {
+      return (j & (offset - 1)) | (j >> targe << targe << 1);
+    };
 
+  } else if (ctrl_num == 1) {
+    has_control = true;
+    control = posv[0];
+    targe = posv[1];
+    offset = 1ll << targe;
+    setbit = 1ll << control;
+    if (control > targe) {
+      control--;
     }
-    else if(ctrl_num == 1){
-        has_control = true;
-        control = posv[0];
-        targe = posv[1];
-        offset = 1ll<<targe;
-        setbit = 1ll<<control;
-        if (control>targe) {
-            control--;
-        }
-        poffset=1ll<<control;
-        rsize = size_>>2;
-        getind_func = [&](size_t j) -> size_t {
-            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
-            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
-            return i;
-        };
+    poffset = 1ll << control;
+    rsize = size_ >> 2;
+    getind_func = [&](size_t j) -> size_t {
+      size_t i = (j >> control << (control + 1)) | (j & (poffset - 1));
+      i = (i >> targe << (targe + 1)) | (i & (offset - 1)) | setbit;
+      return i;
+    };
 
-        getind_func_near = getind_func;
+    getind_func_near = getind_func;
 
+  } else if (ctrl_num == 2) {
+    has_control = true;
+    control = *min_element(posv.begin(), posv.end() - 1);
+    targe = *(posv.end() - 1);
+    offset = 1ll << targe;
+    sort(posv_sorted.begin(), posv_sorted.end());
+    rsize = size_ >> posv.size();
+    getind_func = [&](size_t j) -> size_t {
+      size_t i = j;
+      for (size_t k = 0; k < posv.size(); k++) {
+        size_t _pos = posv_sorted[k];
+        i = (i & ((1ll << _pos) - 1)) | (i >> _pos << _pos << 1);
+      }
+      for (size_t k = 0; k < posv.size() - 1; k++) {
+        i |= 1ll << posv[k];
+      }
+      return i;
+    };
+    getind_func_near = getind_func;
+  }
 
+  const complex<real_t> mat00 = mat[0];
+  const complex<real_t> mat01 = mat[1];
+  const complex<real_t> mat10 = mat[2];
+  const complex<real_t> mat11 = mat[3];
+  if (targe == 0) {
+#pragma omp parallel for
+    for (omp_i j = 0; j < rsize; j++) {
+      size_t i = getind_func_near(j);
+      complex<real_t> temp = data_[i];
+      data_[i] = mat00 * data_[i] + mat01 * data_[i + 1];
+      data_[i + 1] = mat10 * temp + mat11 * data_[i + 1];
     }
-    else if(ctrl_num == 2){
-        has_control = true;
-        control = *min_element(posv.begin(), posv.end()-1);
-        targe = *(posv.end()-1);
-        offset = 1ll<<targe;
-        sort(posv_sorted.begin(), posv_sorted.end());
-        rsize = size_>>posv.size();
-        getind_func = [&](size_t j)-> size_t{
-            size_t i = j;
-            for (size_t k=0;k < posv.size();k++)
-            {
-                size_t _pos = posv_sorted[k];
-                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-            }
-            for (size_t k=0;k < posv.size()-1;k++){
-                i |= 1ll<<posv[k];
-            }
-            return i;
-        };
-        getind_func_near = getind_func;
+  } else if (has_control && control == 0) { // single step
+#pragma omp parallel for
+    for (omp_i j = 0; j < rsize; j++) {
+      size_t i = getind_func(j);
+      complex<real_t> temp = data_[i];
+      data_[i] = mat00 * data_[i] + mat01 * data_[i + offset];
+      data_[i + offset] = mat10 * temp + mat11 * data_[i + offset];
     }
 
-    const complex<real_t> mat00 = mat[0];
-    const complex<real_t> mat01 = mat[1];
-    const complex<real_t> mat10 = mat[2];
-    const complex<real_t> mat11 = mat[3];
-    if (targe == 0){
-#pragma omp parallel for
-            for(omp_i j = 0;j < rsize;j++){
-                size_t i = getind_func_near(j);
-                complex<real_t> temp = data_[i];
-                data_[i] = mat00*data_[i] + mat01*data_[i+1];
-                data_[i+1] = mat10*temp + mat11*data_[i+1];
-            }
-    }else if (has_control && control == 0){ //single step
-#pragma omp parallel for
-            for(omp_i j = 0;j < rsize;j++){
-                size_t i = getind_func(j);
-                complex<real_t> temp = data_[i];
-                data_[i] = mat00*data_[i] + mat01*data_[i+offset];
-                data_[i+offset] = mat10*temp + mat11*data_[i+offset];
-            }
-
-    }else{//unroll to 2
+  } else { // unroll to 2
 #ifdef USE_SIMD
-    __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(),  mat[0].real());
-    __m256d m_00im = _mm256_set_pd(mat[0].imag(),  -mat[0].imag(),  mat[0].imag(),  -mat[0].imag());
-    __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(),  mat[1].real(), mat[1].real());
-    __m256d m_01im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(),  mat[1].imag(), -mat[1].imag());
-
-    __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(), mat[2].real());
-    __m256d m_10im = _mm256_set_pd(mat[2].imag(),  -mat[2].imag(),mat[2].imag(), -mat[2].imag());
-    __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(), mat[3].real());
-    __m256d m_11im = _mm256_set_pd(mat[3].imag(), -mat[3].imag(), mat[3].imag(),  -mat[3].imag());
+    __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(), mat[0].real(),
+                                   mat[0].real());
+    __m256d m_00im = _mm256_set_pd(mat[0].imag(), -mat[0].imag(), mat[0].imag(),
+                                   -mat[0].imag());
+    __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(), mat[1].real(),
+                                   mat[1].real());
+    __m256d m_01im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(), mat[1].imag(),
+                                   -mat[1].imag());
+
+    __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(),
+                                   mat[2].real());
+    __m256d m_10im = _mm256_set_pd(mat[2].imag(), -mat[2].imag(), mat[2].imag(),
+                                   -mat[2].imag());
+    __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(),
+                                   mat[3].real());
+    __m256d m_11im = _mm256_set_pd(mat[3].imag(), -mat[3].imag(), mat[3].imag(),
+                                   -mat[3].imag());
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize; j+= 2){
-            size_t i = getind_func(j);
-
-            double* p0 = (double*)(data_.get()+i);
-            double* p1 = (double*)(data_.get()+i+offset);
-            //load data
-            __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0
-            __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1
-            __m256d data0_p = _mm256_permute_pd(data0, 5);
-            __m256d data1_p = _mm256_permute_pd(data1, 5);
-
-             //row0
-            __m256d temp00re = _mm256_mul_pd(m_00re, data0);
-            __m256d temp00im = _mm256_mul_pd(m_00im, data0_p);
-            __m256d temp00 = _mm256_add_pd(temp00re, temp00im);
-            __m256d temp01re = _mm256_mul_pd(m_01re, data1);
-            __m256d temp01im = _mm256_mul_pd(m_01im, data1_p);
-            __m256d temp01 = _mm256_add_pd(temp01re, temp01im);
-            __m256d temp0 = _mm256_add_pd(temp00, temp01);
-
-            //row1
-            __m256d temp10re = _mm256_mul_pd(m_10re, data0);
-            __m256d temp10im = _mm256_mul_pd(m_10im, data0_p);
-            __m256d temp10 = _mm256_add_pd(temp10re, temp10im);
-            __m256d temp11re = _mm256_mul_pd(m_11re, data1);
-            __m256d temp11im = _mm256_mul_pd(m_11im, data1_p);
-            __m256d temp11 = _mm256_add_pd(temp11re, temp11im);
-            __m256d temp1 = _mm256_add_pd(temp10, temp11);
-
-            _mm256_storeu_pd(p0, temp0);
-            _mm256_storeu_pd(p1, temp1);
-        }
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = getind_func(j);
+
+      double* p0 = (double*)(data_.get() + i);
+      double* p1 = (double*)(data_.get() + i + offset);
+      // load data
+      __m256d data0 = _mm256_loadu_pd(p0); // lre_0, lim_0, rre_0, rim_0
+      __m256d data1 = _mm256_loadu_pd(p1); // lre_1, lim_1, rre_1, rim_1
+      __m256d data0_p = _mm256_permute_pd(data0, 5);
+      __m256d data1_p = _mm256_permute_pd(data1, 5);
+
+      // row0
+      __m256d temp00re = _mm256_mul_pd(m_00re, data0);
+      __m256d temp00im = _mm256_mul_pd(m_00im, data0_p);
+      __m256d temp00 = _mm256_add_pd(temp00re, temp00im);
+      __m256d temp01re = _mm256_mul_pd(m_01re, data1);
+      __m256d temp01im = _mm256_mul_pd(m_01im, data1_p);
+      __m256d temp01 = _mm256_add_pd(temp01re, temp01im);
+      __m256d temp0 = _mm256_add_pd(temp00, temp01);
+
+      // row1
+      __m256d temp10re = _mm256_mul_pd(m_10re, data0);
+      __m256d temp10im = _mm256_mul_pd(m_10im, data0_p);
+      __m256d temp10 = _mm256_add_pd(temp10re, temp10im);
+      __m256d temp11re = _mm256_mul_pd(m_11re, data1);
+      __m256d temp11im = _mm256_mul_pd(m_11im, data1_p);
+      __m256d temp11 = _mm256_add_pd(temp11re, temp11im);
+      __m256d temp1 = _mm256_add_pd(temp10, temp11);
+
+      _mm256_storeu_pd(p0, temp0);
+      _mm256_storeu_pd(p1, temp1);
+    }
 #else
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = getind_func(j);
-            size_t i1 = i+1;
-            complex<real_t> temp = data_[i];
-            complex<real_t> temp1 = data_[i1];
-            data_[i] = mat00*data_[i] + mat01*data_[i+offset];
-            data_[i+offset] = mat10*temp + mat11*data_[i+offset];
-            data_[i1] = mat00*data_[i1] + mat01*data_[i1+offset];
-            data_[i1+offset] = mat10*temp1 + mat11*data_[i1+offset];
-        }
-#endif
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = getind_func(j);
+      size_t i1 = i + 1;
+      complex<real_t> temp = data_[i];
+      complex<real_t> temp1 = data_[i1];
+      data_[i] = mat00 * data_[i] + mat01 * data_[i + offset];
+      data_[i + offset] = mat10 * temp + mat11 * data_[i + offset];
+      data_[i1] = mat00 * data_[i1] + mat01 * data_[i1 + offset];
+      data_[i1 + offset] = mat10 * temp1 + mat11 * data_[i1 + offset];
     }
+#endif
+  }
 }
 
-
 template <class real_t>
 template <int ctrl_num>
-void StateVector<real_t>::apply_one_targe_gate_x(vector<pos_t> const& posv)
-{
-    std::function<size_t(size_t)> getind_func_near;
-    std::function<size_t(size_t)> getind_func;
-    size_t rsize;
-    size_t offset;
-    size_t targe;
-    size_t control;
-    size_t setbit;
-    size_t poffset;
-    vector<pos_t> posv_sorted = posv;
-    bool has_control=false;
-    if (ctrl_num == 0){
-        targe = posv[0];
-        offset = 1ll<<targe;
-        rsize = size_>>1;
-        getind_func_near = [&](size_t j)-> size_t {
-            return 2*j;
-        };
-
-        getind_func = [&](size_t j)-> size_t {
-            return (j&(offset-1)) | (j>>targe<<targe<<1);
-        };
+void StateVector<real_t>::apply_one_targe_gate_x(vector<pos_t> const& posv) {
+  std::function<size_t(size_t)> getind_func_near;
+  std::function<size_t(size_t)> getind_func;
+  size_t rsize;
+  size_t offset;
+  size_t targe;
+  size_t control;
+  size_t setbit;
+  size_t poffset;
+  vector<pos_t> posv_sorted = posv;
+  bool has_control = false;
+  if (ctrl_num == 0) {
+    targe = posv[0];
+    offset = 1ll << targe;
+    rsize = size_ >> 1;
+    getind_func_near = [&](size_t j) -> size_t { return 2 * j; };
+
+    getind_func = [&](size_t j) -> size_t {
+      return (j & (offset - 1)) | (j >> targe << targe << 1);
+    };
 
+  } else if (ctrl_num == 1) {
+    has_control = true;
+    control = posv[0];
+    targe = posv[1];
+    offset = 1ll << targe;
+    setbit = 1ll << control;
+    if (control > targe) {
+      control--;
     }
-    else if(ctrl_num == 1){
-        has_control = true;
-        control = posv[0];
-        targe = posv[1];
-        offset = 1ll<<targe;
-        setbit = 1ll<<control;
-        if (control>targe) {
-            control--;
-        }
-        poffset=1ll<<control;
-        rsize = size_>>2;
-        getind_func = [&](size_t j) -> size_t {
-            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
-            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
-            return i;
-        };
-        getind_func_near = getind_func;
-    }
-    else if(ctrl_num == 2){
-        has_control = true;
-        control = *min_element(posv.begin(), posv.end()-1);
-        targe = *(posv.end()-1);
-        offset = 1ll<<targe;
-        sort(posv_sorted.begin(), posv_sorted.end());
-        rsize = size_>>posv.size();
-
-        getind_func = [&](size_t j) -> size_t{
-            size_t i = j;
-            for (size_t k=0;k < posv.size();k++){
-                size_t _pos = posv_sorted[k];
-                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-            }
-            for (size_t k=0;k < posv.size()-1;k++){
-                i |= 1ll<<posv[k];
-            }
-            return i;
-        };
-        getind_func_near = getind_func;
-    }
+    poffset = 1ll << control;
+    rsize = size_ >> 2;
+    getind_func = [&](size_t j) -> size_t {
+      size_t i = (j >> control << (control + 1)) | (j & (poffset - 1));
+      i = (i >> targe << (targe + 1)) | (i & (offset - 1)) | setbit;
+      return i;
+    };
+    getind_func_near = getind_func;
+  } else if (ctrl_num == 2) {
+    has_control = true;
+    control = *min_element(posv.begin(), posv.end() - 1);
+    targe = *(posv.end() - 1);
+    offset = 1ll << targe;
+    sort(posv_sorted.begin(), posv_sorted.end());
+    rsize = size_ >> posv.size();
+
+    getind_func = [&](size_t j) -> size_t {
+      size_t i = j;
+      for (size_t k = 0; k < posv.size(); k++) {
+        size_t _pos = posv_sorted[k];
+        i = (i & ((1ll << _pos) - 1)) | (i >> _pos << _pos << 1);
+      }
+      for (size_t k = 0; k < posv.size() - 1; k++) {
+        i |= 1ll << posv[k];
+      }
+      return i;
+    };
+    getind_func_near = getind_func;
+  }
 
-    if (targe == 0){
+  if (targe == 0) {
 #ifdef USE_SIMD
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j++){
-            size_t i = getind_func_near(j);
-            double* ptr = (double*)(data_.get() + i);
-            __m256d data = _mm256_loadu_pd(ptr);
-            data = _mm256_permute4x64_pd(data, 78);
-            _mm256_storeu_pd(ptr, data);
-        }
+    for (omp_i j = 0; j < rsize; j++) {
+      size_t i = getind_func_near(j);
+      double* ptr = (double*)(data_.get() + i);
+      __m256d data = _mm256_loadu_pd(ptr);
+      data = _mm256_permute4x64_pd(data, 78);
+      _mm256_storeu_pd(ptr, data);
+    }
 #else
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j++){
-            size_t i = getind_func(j);
-            std::swap(data_[i], data_[i+1]);
-        }
+    for (omp_i j = 0; j < rsize; j++) {
+      size_t i = getind_func(j);
+      std::swap(data_[i], data_[i + 1]);
+    }
 #endif
-    }else if (has_control && control == 0){ //single step
+  } else if (has_control && control == 0) { // single step
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j++){
-            size_t i = getind_func(j);
-            std::swap(data_[i], data_[i+offset]);
-        }
+    for (omp_i j = 0; j < rsize; j++) {
+      size_t i = getind_func(j);
+      std::swap(data_[i], data_[i + offset]);
+    }
 
-    }else{//unroll to 2
+  } else { // unroll to 2
 #ifdef USE_SIMD
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize; j+= 2){
-            size_t i = getind_func(j);
-            double* ptr0 = (double*)(data_.get() + i);
-            double* ptr1 = (double*)(data_.get() + i + offset);
-            __m256d data0 = _mm256_loadu_pd(ptr0);
-            __m256d data1 = _mm256_loadu_pd(ptr1);
-            _mm256_storeu_pd(ptr1, data0);
-            _mm256_storeu_pd(ptr0, data1);
-        }
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = getind_func(j);
+      double* ptr0 = (double*)(data_.get() + i);
+      double* ptr1 = (double*)(data_.get() + i + offset);
+      __m256d data0 = _mm256_loadu_pd(ptr0);
+      __m256d data1 = _mm256_loadu_pd(ptr1);
+      _mm256_storeu_pd(ptr1, data0);
+      _mm256_storeu_pd(ptr0, data1);
+    }
 #else
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = getind_func(j);
-            size_t i1 = i+1;
-            std::swap(data_[i], data_[i+offset]);
-            std::swap(data_[i1], data_[i1+offset]);
-        }
-#endif
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = getind_func(j);
+      size_t i1 = i + 1;
+      std::swap(data_[i], data_[i + offset]);
+      std::swap(data_[i1], data_[i1 + offset]);
     }
+#endif
+  }
 }
 
 template <class real_t>
 template <int ctrl_num>
-void StateVector<real_t>::apply_one_targe_gate_real(vector<pos_t> const& posv, complex<double> *mat)
-{
-    std::function<size_t(size_t)> getind_func_near;
-    std::function<size_t(size_t)> getind_func;
-    size_t rsize;
-    size_t offset;
-    size_t targe;
-    size_t control = 0;
-    size_t setbit;
-    size_t poffset;
-    bool has_control=false;
-    vector<pos_t> posv_sorted = posv;
-    if (ctrl_num == 0){
-        targe = posv[0];
-        offset = 1ll<<targe;
-        rsize = size_>>1;
-        getind_func_near = [&](size_t j)-> size_t {
-            return 2*j;
-        };
-
-        getind_func = [&](size_t j)-> size_t {
-            return (j&(offset-1)) | (j>>targe<<targe<<1);
-        };
+void StateVector<real_t>::apply_one_targe_gate_real(vector<pos_t> const& posv,
+                                                    complex<double>* mat) {
+  std::function<size_t(size_t)> getind_func_near;
+  std::function<size_t(size_t)> getind_func;
+  size_t rsize;
+  size_t offset;
+  size_t targe;
+  size_t control = 0;
+  size_t setbit;
+  size_t poffset;
+  bool has_control = false;
+  vector<pos_t> posv_sorted = posv;
+  if (ctrl_num == 0) {
+    targe = posv[0];
+    offset = 1ll << targe;
+    rsize = size_ >> 1;
+    getind_func_near = [&](size_t j) -> size_t { return 2 * j; };
+
+    getind_func = [&](size_t j) -> size_t {
+      return (j & (offset - 1)) | (j >> targe << targe << 1);
+    };
 
+  } else if (ctrl_num == 1) {
+    has_control = true;
+    control = posv[0];
+    targe = posv[1];
+    offset = 1ll << targe;
+    setbit = 1ll << control;
+    if (control > targe) {
+      control--;
     }
-    else if(ctrl_num == 1){
-        has_control = true;
-        control = posv[0];
-        targe = posv[1];
-        offset = 1ll<<targe;
-        setbit = 1ll<<control;
-        if (control>targe) {
-            control--;
-        }
-        poffset=1ll<<control;
-        rsize = size_>>2;
-        getind_func = [&](size_t j) -> size_t {
-            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
-            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
-            return i;
-        };
+    poffset = 1ll << control;
+    rsize = size_ >> 2;
+    getind_func = [&](size_t j) -> size_t {
+      size_t i = (j >> control << (control + 1)) | (j & (poffset - 1));
+      i = (i >> targe << (targe + 1)) | (i & (offset - 1)) | setbit;
+      return i;
+    };
 
-        getind_func_near = getind_func;
-    }
-    else if(ctrl_num == 2){
-        has_control = true;
-        control = *min_element(posv.begin(), posv.end()-1);
-        targe = *(posv.end()-1);
-        offset = 1ll<<targe;
-        sort(posv_sorted.begin(), posv_sorted.end());
-        rsize = size_>>posv.size();
-        getind_func = [&](size_t j)-> size_t{
-            size_t i = j;
-            for (size_t k=0;k < posv.size();k++){
-                size_t _pos = posv_sorted[k];
-                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-            }
-            for (size_t k=0;k < posv.size()-1;k++){
-                i |= 1ll<<posv[k];
-            }
-            return i;
-        };
-        getind_func_near = getind_func;
-    }
+    getind_func_near = getind_func;
+  } else if (ctrl_num == 2) {
+    has_control = true;
+    control = *min_element(posv.begin(), posv.end() - 1);
+    targe = *(posv.end() - 1);
+    offset = 1ll << targe;
+    sort(posv_sorted.begin(), posv_sorted.end());
+    rsize = size_ >> posv.size();
+    getind_func = [&](size_t j) -> size_t {
+      size_t i = j;
+      for (size_t k = 0; k < posv.size(); k++) {
+        size_t _pos = posv_sorted[k];
+        i = (i & ((1ll << _pos) - 1)) | (i >> _pos << _pos << 1);
+      }
+      for (size_t k = 0; k < posv.size() - 1; k++) {
+        i |= 1ll << posv[k];
+      }
+      return i;
+    };
+    getind_func_near = getind_func;
+  }
 
-    const double mat00 = mat[0].real();
-    const double mat01 = mat[1].real();
-    const double mat10 = mat[2].real();
-    const double mat11 = mat[3].real();
-    if (targe == 0){
+  const double mat00 = mat[0].real();
+  const double mat01 = mat[1].real();
+  const double mat10 = mat[2].real();
+  const double mat11 = mat[3].real();
+  if (targe == 0) {
 #pragma omp parallel for
-            for(omp_i j = 0;j < rsize;j++){
-                size_t i = getind_func_near(j);
-                complex<real_t> temp = data_[i];
-                data_[i] = mat00*data_[i] + mat01*data_[i+1];
-                data_[i+1] = mat10*temp + mat11*data_[i+1];
-            }
+    for (omp_i j = 0; j < rsize; j++) {
+      size_t i = getind_func_near(j);
+      complex<real_t> temp = data_[i];
+      data_[i] = mat00 * data_[i] + mat01 * data_[i + 1];
+      data_[i + 1] = mat10 * temp + mat11 * data_[i + 1];
+    }
 
-    }else if (has_control && control == 0){ //single step
+  } else if (has_control && control == 0) { // single step
 
 #pragma omp parallel for
-            for(omp_i j = 0;j < rsize;j++){
-                size_t i = getind_func(j);
-                complex<real_t> temp = data_[i];
-                data_[i] = mat00*data_[i] + mat01*data_[i+offset];
-                data_[i+offset] = mat10*temp + mat11*data_[i+offset];
-            }
-    }else{//unroll to 2
+    for (omp_i j = 0; j < rsize; j++) {
+      size_t i = getind_func(j);
+      complex<real_t> temp = data_[i];
+      data_[i] = mat00 * data_[i] + mat01 * data_[i + offset];
+      data_[i + offset] = mat10 * temp + mat11 * data_[i + offset];
+    }
+  } else { // unroll to 2
 #ifdef USE_SIMD
-    __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(),  mat[0].real());
-    __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(),  mat[1].real(), mat[1].real());
-    __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(), mat[2].real());
-    __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(), mat[3].real());
+    __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(), mat[0].real(),
+                                   mat[0].real());
+    __m256d m_01re = _mm256_set_pd(mat[1].real(), mat[1].real(), mat[1].real(),
+                                   mat[1].real());
+    __m256d m_10re = _mm256_set_pd(mat[2].real(), mat[2].real(), mat[2].real(),
+                                   mat[2].real());
+    __m256d m_11re = _mm256_set_pd(mat[3].real(), mat[3].real(), mat[3].real(),
+                                   mat[3].real());
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize; j+= 2){
-            size_t i = getind_func(j);
-
-            double* p0 = (double*)(data_.get()+i);
-            double* p1 = (double*)(data_.get()+i+offset);
-             //load data
-            __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0
-            __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1
-            __m256d data0_p = _mm256_permute_pd(data0, 5);
-            __m256d data1_p = _mm256_permute_pd(data1, 5);
-
-                //row0
-            __m256d temp00re = _mm256_mul_pd(m_00re, data0);
-            __m256d temp01re = _mm256_mul_pd(m_01re, data1);
-            __m256d temp0 = _mm256_add_pd(temp00re, temp01re);
-
-            //row1
-            __m256d temp10re = _mm256_mul_pd(m_10re, data0);
-            __m256d temp11re = _mm256_mul_pd(m_11re, data1);
-            __m256d temp1 = _mm256_add_pd(temp10re, temp11re);
-
-            _mm256_storeu_pd(p0, temp0);
-            _mm256_storeu_pd(p1, temp1);
-        }
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = getind_func(j);
+
+      double* p0 = (double*)(data_.get() + i);
+      double* p1 = (double*)(data_.get() + i + offset);
+      // load data
+      __m256d data0 = _mm256_loadu_pd(p0); // lre_0, lim_0, rre_0, rim_0
+      __m256d data1 = _mm256_loadu_pd(p1); // lre_1, lim_1, rre_1, rim_1
+      __m256d data0_p = _mm256_permute_pd(data0, 5);
+      __m256d data1_p = _mm256_permute_pd(data1, 5);
+
+      // row0
+      __m256d temp00re = _mm256_mul_pd(m_00re, data0);
+      __m256d temp01re = _mm256_mul_pd(m_01re, data1);
+      __m256d temp0 = _mm256_add_pd(temp00re, temp01re);
+
+      // row1
+      __m256d temp10re = _mm256_mul_pd(m_10re, data0);
+      __m256d temp11re = _mm256_mul_pd(m_11re, data1);
+      __m256d temp1 = _mm256_add_pd(temp10re, temp11re);
+
+      _mm256_storeu_pd(p0, temp0);
+      _mm256_storeu_pd(p1, temp1);
+    }
 #else
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = getind_func(j);
-            size_t i1 = i+1;
-            complex<real_t> temp = data_[i];
-            complex<real_t> temp1 = data_[i1];
-            data_[i] = mat00*data_[i] + mat01*data_[i+offset];
-            data_[i+offset] = mat10*temp + mat11*data_[i+offset];
-            data_[i1] = mat00*data_[i1] + mat01*data_[i1+offset];
-            data_[i1+offset] = mat10*temp1 + mat11*data_[i1+offset];
-        }
-#endif
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = getind_func(j);
+      size_t i1 = i + 1;
+      complex<real_t> temp = data_[i];
+      complex<real_t> temp1 = data_[i1];
+      data_[i] = mat00 * data_[i] + mat01 * data_[i + offset];
+      data_[i + offset] = mat10 * temp + mat11 * data_[i + offset];
+      data_[i1] = mat00 * data_[i1] + mat01 * data_[i1 + offset];
+      data_[i1 + offset] = mat10 * temp1 + mat11 * data_[i1 + offset];
     }
+#endif
+  }
 }
 
-
 template <class real_t>
 template <int ctrl_num>
-void StateVector<real_t>::apply_one_targe_gate_diag(vector<pos_t> const& posv, complex<double> *mat)
-{
-    std::function<size_t(size_t)> getind_func_near;
-    std::function<size_t(size_t)> getind_func;
-    size_t rsize;
-    size_t offset;
-    size_t targe;
-    size_t control = 0;
-    size_t setbit;
-    size_t poffset;
-    bool has_control=false;
-    vector<pos_t> posv_sorted = posv;
-    if (ctrl_num == 0){
-        targe = posv[0];
-        offset = 1ll<<targe;
-        rsize = size_>>1;
-        getind_func_near = [&](size_t j)-> size_t {
-            return 2*j;
-        };
+void StateVector<real_t>::apply_one_targe_gate_diag(vector<pos_t> const& posv,
+                                                    complex<double>* mat) {
+  // Scales amplitudes in place: data_[i] *= mat[0], data_[i + offset] *= mat[1]
+  // (offset = 1 << target); with controls, only indices with those bits set.
+  std::function<size_t(size_t)> getind_func_near;
+  std::function<size_t(size_t)> getind_func;
+  size_t rsize;
+  size_t offset;
+  size_t targe;
+  size_t control = 0;
+  size_t setbit;
+  size_t poffset;
+  bool has_control = false;
+  vector<pos_t> posv_sorted = posv;
+  if (ctrl_num == 0) {
+    targe = posv[0];
+    offset = 1ll << targe;
+    rsize = size_ >> 1;
+    getind_func_near = [&](size_t j) -> size_t { return 2 * j; };
+    getind_func = [&](size_t j) -> size_t {
+      return (j & (offset - 1)) | (j >> targe << targe << 1);
+    };
 
-        getind_func = [&](size_t j)-> size_t {
-            return (j&(offset-1)) | (j>>targe<<targe<<1);
-        };
+  } else if (ctrl_num == 1) {
 
+    has_control = true;
+    control = posv[0];
+    targe = posv[1];
+    offset = 1ll << targe;
+    setbit = 1ll << control;
+    if (control > targe) {
+      control--;
     }
-    else if(ctrl_num == 1){
-
-        has_control = true;
-        control = posv[0];
-        targe = posv[1];
-        offset = 1ll<<targe;
-        setbit = 1ll<<control;
-        if (control>targe) {
-            control--;
-        }
-        poffset=1ll<<control;
-        rsize = size_>>2;
-        getind_func = [&](size_t j) -> size_t {
-            size_t i = (j>>control<<(control+1))|(j&(poffset-1));
-            i = (i>>targe<<(targe+1))|(i&(offset-1))|setbit;
-            return i;
-        };
+    poffset = 1ll << control;
+    rsize = size_ >> 2;
+    getind_func = [&](size_t j) -> size_t {
+      size_t i = (j >> control << (control + 1)) | (j & (poffset - 1));
+      i = (i >> targe << (targe + 1)) | (i & (offset - 1)) | setbit;
+      return i;
+    };
 
-        getind_func_near = getind_func;
+    getind_func_near = getind_func;
 
-    }
-    else if(ctrl_num == 2){
-        has_control = true;
-        control = *min_element(posv.begin(), posv.end()-1);
-        targe = *(posv.end()-1);
-        offset = 1ll<<targe;
-        sort(posv_sorted.begin(), posv_sorted.end());
-        rsize = size_>>posv.size();
-        getind_func = [&](size_t j)-> size_t
-        {
-            size_t i = j;
-            for (size_t k=0;k < posv.size();k++){
-                size_t _pos = posv_sorted[k];
-                i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-            }
-            for (size_t k=0;k < posv.size()-1;k++){
-                i |= 1ll<<posv[k];
-            }
-            return i;
-        };
-        getind_func_near = getind_func;
-    }
+  } else if (ctrl_num == 2) {
+    has_control = true;
+    control = *min_element(posv.begin(), posv.end() - 1);
+    targe = *(posv.end() - 1);
+    offset = 1ll << targe;
+    sort(posv_sorted.begin(), posv_sorted.end());
+    rsize = size_ >> posv.size();
+    getind_func = [&](size_t j) -> size_t {
+      size_t i = j;
+      for (size_t k = 0; k < posv.size(); k++) {
+        size_t _pos = posv_sorted[k];
+        i = (i & ((1ll << _pos) - 1)) | (i >> _pos << _pos << 1);
+      }
+      for (size_t k = 0; k < posv.size() - 1; k++) {
+        i |= 1ll << posv[k];
+      }
+      return i;
+    };
+    getind_func_near = getind_func;
+  }
 
-    if (targe == 0){
+  if (targe == 0) {
 #pragma omp parallel for
-            for(omp_i j = 0;j < rsize;j++){
-                size_t i = getind_func_near(j);
-                data_[i] *= mat[0];
-                data_[i+1] *= mat[1];
-            }
+    for (omp_i j = 0; j < rsize; j++) {
+      size_t i = getind_func_near(j);
+      data_[i] *= mat[0];
+      data_[i + 1] *= mat[1];
+    }
 
-    }else if (has_control && control == 0){ //single step
+  } else if (has_control && control == 0) { // single step
 
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j++){
-            size_t i = getind_func(j);
-            complex<real_t> temp = data_[i];
-            data_[i] *= mat[0];
-            data_[i+offset] *= mat[1];
-        }
+    for (omp_i j = 0; j < rsize; j++) {
+      size_t i = getind_func(j);
+      data_[i] *= mat[0];
+      data_[i + offset] *= mat[1];
+    }
 
-    }else{//unroll to 2
+  } else { // unroll to 2
 #ifdef USE_SIMD
-     __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(),mat[0].real(),  mat[0].real());
-    __m256d m_00im = _mm256_set_pd(mat[0].imag(),  -mat[0].imag(),  mat[0].imag(),  -mat[0].imag());
-    __m256d m_11re = _mm256_set_pd(mat[1].real(), mat[1].real(),  mat[1].real(), mat[1].real());
-    __m256d m_11im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(),  mat[1].imag(), -mat[1].imag());
+    __m256d m_00re = _mm256_set_pd(mat[0].real(), mat[0].real(), mat[0].real(),
+                                   mat[0].real());
+    __m256d m_00im = _mm256_set_pd(mat[0].imag(), -mat[0].imag(), mat[0].imag(),
+                                   -mat[0].imag());
+    __m256d m_11re = _mm256_set_pd(mat[1].real(), mat[1].real(), mat[1].real(),
+                                   mat[1].real());
+    __m256d m_11im = _mm256_set_pd(mat[1].imag(), -mat[1].imag(), mat[1].imag(),
+                                   -mat[1].imag());
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize; j+= 2){
-            size_t i = getind_func(j);
-
-            double* p0 = (double*)(data_.get()+i);
-            double* p1 = (double*)(data_.get()+i+offset);
-
-            //load data
-            __m256d data0 = _mm256_loadu_pd(p0); //lre_0, lim_0, rre_0, rim_0
-            __m256d data1 = _mm256_loadu_pd(p1); //lre_1, lim_1, rre_1, rim_1
-            __m256d data0_p = _mm256_permute_pd(data0, 5);
-            __m256d data1_p = _mm256_permute_pd(data1, 5);
-
-             //row0
-            __m256d temp00re = _mm256_mul_pd(m_00re, data0);
-            __m256d temp00im = _mm256_mul_pd(m_00im, data0_p);
-            __m256d temp00 = _mm256_add_pd(temp00re, temp00im);
-
-            //row1
-            __m256d temp11re = _mm256_mul_pd(m_11re, data1);
-            __m256d temp11im = _mm256_mul_pd(m_11im, data1_p);
-            __m256d temp11 = _mm256_add_pd(temp11re, temp11im);
-
-            _mm256_storeu_pd(p0, temp00);
-            _mm256_storeu_pd(p1, temp11);
-        }
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = getind_func(j);
+
+      double* p0 = (double*)(data_.get() + i);
+      double* p1 = (double*)(data_.get() + i + offset);
+
+      // load data
+      __m256d data0 = _mm256_loadu_pd(p0); // lre_0, lim_0, rre_0, rim_0
+      __m256d data1 = _mm256_loadu_pd(p1); // lre_1, lim_1, rre_1, rim_1
+      __m256d data0_p = _mm256_permute_pd(data0, 5);
+      __m256d data1_p = _mm256_permute_pd(data1, 5);
+
+      // row0
+      __m256d temp00re = _mm256_mul_pd(m_00re, data0);
+      __m256d temp00im = _mm256_mul_pd(m_00im, data0_p);
+      __m256d temp00 = _mm256_add_pd(temp00re, temp00im);
+
+      // row1
+      __m256d temp11re = _mm256_mul_pd(m_11re, data1);
+      __m256d temp11im = _mm256_mul_pd(m_11im, data1_p);
+      __m256d temp11 = _mm256_add_pd(temp11re, temp11im);
+
+      _mm256_storeu_pd(p0, temp00);
+      _mm256_storeu_pd(p1, temp11);
+    }
 #else
 #pragma omp parallel for
-        for(omp_i j = 0;j < rsize;j += 2){
-            size_t i = getind_func(j);
-            size_t i1 = i+1;
-            data_[i] *= mat[0];
-            data_[i+offset] *= mat[1];
-            data_[i1] *= mat[0];
-            data_[i1+offset] *= mat[1];
-        }
-#endif
+    for (omp_i j = 0; j < rsize; j += 2) {
+      size_t i = getind_func(j);
+      size_t i1 = i + 1;
+      data_[i] *= mat[0];
+      data_[i + offset] *= mat[1];
+      data_[i1] *= mat[0];
+      data_[i1 + offset] *= mat[1];
     }
+#endif
+  }
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_multi_targe_gate_general(vector<pos_t> const& posv, uint control_num, RowMatrixXcd const& mat)
-{
-    auto posv_sorted = posv;
-    auto targs = vector<pos_t>(posv.begin()+control_num, posv.end());
-    sort(posv_sorted.begin(), posv_sorted.end());
-    size_t rsize = size_ >> posv.size();
-    uint targe_num = targs.size();
-    size_t matsize= 1<< targe_num;
-    std::vector<uint> targ_mask(matsize);
-    //create target mask
-    for (size_t m = 0; m < matsize;m++){
-        for (size_t j = 0; j < targe_num; j++){
-            if ((m>>j)&1){
-                auto mask_pos = targs[j];
-                targ_mask[m] |= 1ll<<mask_pos;
-            }
-        }
+void StateVector<real_t>::apply_multi_targe_gate_general(
+    vector<pos_t> const& posv, uint control_num, RowMatrixXcd const& mat) {
+  // posv = [controls..., targets...]; mat acts on the targets, controls = 1.
+  auto posv_sorted = posv;
+  auto targs = vector<pos_t>(posv.begin() + control_num, posv.end());
+  sort(posv_sorted.begin(), posv_sorted.end());
+  size_t rsize = size_ >> posv.size();
+  uint targe_num = targs.size();
+  size_t matsize = size_t(1) << targe_num;
+  std::vector<size_t> targ_mask(matsize); // size_t: bit positions >= 32 fit
+  // create target mask
+  for (size_t m = 0; m < matsize; m++) {
+    for (size_t j = 0; j < targe_num; j++) {
+      if ((m >> j) & 1) {
+        auto mask_pos = targs[j];
+        targ_mask[m] |= size_t(1) << mask_pos;
+      }
     }
+  }
 
-    //apply matrix
-//TODO: Disalbe Parallel when matsize is very large
+  // apply matrix
+// TODO: Disable Parallel when matsize is very large
 #pragma omp parallel for
-    for (omp_i j = 0;j < rsize;j++){
-        size_t i = j;
-        // Insert zeros
-        for(size_t k=0;k < posv.size();k++){
-            size_t _pos = posv_sorted[k];
-            i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-        }
-        // Set control
-        for (size_t k=0; k < control_num;k++){
-            i |= 1ll<<posv[k];
-        }
+  for (omp_i j = 0; j < rsize; j++) {
+    size_t i = j;
+    // Insert zeros
+    for (size_t k = 0; k < posv.size(); k++) {
+      size_t _pos = posv_sorted[k];
+      i = (i & ((1ll << _pos) - 1)) | (i >> _pos << _pos << 1);
+    }
+    // Set control
+    for (size_t k = 0; k < control_num; k++) {
+      i |= 1ll << posv[k];
+    }
 
-        //load block vector
-        Eigen::VectorXcd vec_block(matsize);
-        for (size_t m = 0; m < matsize;m++){
-            vec_block(m) = data_[i | targ_mask[m]];
-            auto ele = vec_block(m);
-        }
+    // load block vector
+    Eigen::VectorXcd vec_block(matsize);
+    for (size_t m = 0; m < matsize; m++) {
+      vec_block(m) = data_[i | targ_mask[m]];
+    }
 
-        //Eigen matrix multiply
-        vec_block = mat * vec_block;
+    // Eigen matrix multiply
+    vec_block = mat * vec_block;
 
-        //write back
-        for (size_t m = 0; m < matsize;m++){
-            data_[i | targ_mask[m]] = vec_block(m);
-        }
+    // write back
+    for (size_t m = 0; m < matsize; m++) {
+      data_[i | targ_mask[m]] = vec_block(m);
     }
+  }
 }
 
-
 uint index0(vector<pos_t> const& qubits_sorted, const uint k) {
-    uint lowbits, retval = k;
-    for (size_t j = 0; j < qubits_sorted.size(); j++) {
-        lowbits = retval & ((1LL << qubits_sorted[j]) -1);
-        retval >>= qubits_sorted[j];
-        retval <<= qubits_sorted[j] + 1;
-        retval |= lowbits;
-    }
-    return retval;
+  uint lowbits, retval = k; // result: k with a 0 bit inserted per listed qubit
+  for (size_t j = 0; j < qubits_sorted.size(); j++) {
+    lowbits = retval & ((1LL << qubits_sorted[j]) - 1); // bits below the qubit
+    retval >>= qubits_sorted[j];
+    retval <<= qubits_sorted[j] + 1; // reopen with a 0 at the qubit position
+    retval |= lowbits;
+  }
+  return retval;
 }
 
 using indexes_t = vector<uint>;
-inline indexes_t indexes(vector<pos_t> const& qbits, vector<pos_t> const& qubits_sorted, const uint k) {
+inline indexes_t indexes(vector<pos_t> const& qbits,
+                         vector<pos_t> const& qubits_sorted, const uint k) {
   const auto N = qubits_sorted.size();
   indexes_t ret(1LL << N, 0);
   // Get index0
@@ -1054,302 +1029,312 @@ inline indexes_t indexes(vector<pos_t> const& qbits, vector<pos_t> const& qubits
 
 template <class real_t>
 vector<double> StateVector<real_t>::probabilities() const {
-    const int len = 1LL << num_;
-    vector<double> probs(len, 0.);
+  const size_t len = size_t(1) << num_; // int cannot hold 1 << 31 and above
+  vector<double> probs(len, 0.);        // probs[j] = |data_[j]|^2
 #pragma omp parallel for
-    for (int j = 0; j < len; j++) {
-        probs[j] = std::real(data_[j] * std::conj(data_[j]));
-    }
-    return probs;
+  for (omp_i j = 0; j < len; j++) { // omp_i: same index type as other loops
+    probs[j] = std::real(data_[j] * std::conj(data_[j]));
+  }
+  return probs;
 }
 
-vector<std::complex<double>> convert(const vector<std::complex<double>> &v){
-    vector<std::complex<double>> ret(v.size(), 0.);
-    for (size_t i = 0; i< v.size(); ++i)
-        ret[i] = v[i];
-    return ret;
+vector<std::complex<double>> convert(const vector<std::complex<double>>& v) {
+  vector<std::complex<double>> ret(v.size(), 0.); // same length as v
+  for (size_t i = 0; i < v.size(); ++i)
+    ret[i] = v[i]; // element-wise copy; input and output types are identical
+  return ret;
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_diagonal_matrix(vector<pos_t> const&qbits, vector<std::complex<double> > const&diag){
-    // just one qubit
-    if(qbits.size() == 1){
-        const uint qubit = qbits[0];
-        vector<pos_t> qbit0{qubit};
-        if (diag[0] == 1.0) { // [[1, 0], [0, z]] matrix
-            if (diag[1] == 1.0)
-                return; // Identity
-            if (diag[1] == std::complex<double>(0., -1.)) { // [[1, 0], [0, -i]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    const auto k = inds[1];
-                    double cache = data_[k].imag();
-                    data_[k].imag(data_[k].real() * -1.);
-                    data_[k].real(cache);
-                };
-#pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }
-                return;
-            }
-            if (diag[1] == std::complex<double>(0., 1.)) {
-            // [[1, 0], [0, i]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    const auto k = inds[1];
-                    double cache = data_[k].imag();
-                    data_[k].imag(data_[k].real());
-                    data_[k].real(cache * -1.);
-                };
-#pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }
-                return;
-            }
-            if (diag[0] == 0.0) {
-            // [[1, 0], [0, 0]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    data_[inds[1]] = 0.0;
-                };
-#pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }
-                return;
-            }
-            // general [[1, 0], [0, z]]
-            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
-                const auto k = inds[1];
-                data_[k] *= _mat[1];
-            };
+void StateVector<real_t>::apply_diagonal_matrix(
+    vector<pos_t> const& qbits, vector<std::complex<double>> const& diag) {
+  // Applies the diagonal matrix diag (acting on qubits qbits) to data_ in place.
+  if (qbits.size() == 1) { // just one qubit
+    const uint qubit = qbits[0];
+    vector<pos_t> qbit0{qubit};
+    if (diag[0] == 1.0) { // [[1, 0], [0, z]] matrix
+      if (diag[1] == 1.0)
+        return;                                       // Identity
+      if (diag[1] == std::complex<double>(0., -1.)) { // [[1, 0], [0, -i]]
+        auto func = [&](const indexes_t& inds) -> void {
+          const auto k = inds[1];
+          double cache = data_[k].imag();
+          data_[k].imag(data_[k].real() * -1.);
+          data_[k].real(cache);
+        };
 #pragma omp parallel for
-            for (int k = 0; k < (size_ >> 1); k+=1){
-                const auto inds = indexes(qbit0, qbit0, k);
-                func(inds, convert(diag));
-            }
-            return;
-        } else if (diag[1] == 1.0) {
-            // [[z, 0], [0, 1]] matrix
-            if (diag[0] == std::complex<double>(0., -1.)) {
-                // [[-i, 0], [0, 1]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    const auto k = inds[1];
-                    double cache = data_[k].imag();
-                    data_[k].imag(data_[k].real() * -1.);
-                    data_[k].real(cache);
-                };
+        for (int k = 0; k < (size_ >> 1); k += 1) {
+          const auto inds = indexes(qbit0, qbit0, k);
+          func(inds);
+        }
+        return;
+      }
+      if (diag[1] == std::complex<double>(0., 1.)) {
+        // [[1, 0], [0, i]]
+        auto func = [&](const indexes_t& inds) -> void {
+          const auto k = inds[1];
+          double cache = data_[k].imag();
+          data_[k].imag(data_[k].real());
+          data_[k].real(cache * -1.);
+        };
 #pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }
-                return;
-            }
-            if (diag[0] == std::complex<double>(0., 1.)) {
-                // [[i, 0], [0, 1]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    const auto k = inds[1];
-                    double cache = data_[k].imag();
-                    data_[k].imag(data_[k].real());
-                    data_[k].real(cache * -1.);
-                };
+        for (int k = 0; k < (size_ >> 1); k += 1) {
+          const auto inds = indexes(qbit0, qbit0, k);
+          func(inds);
+        }
+        return;
+      }
+      if (diag[1] == 0.0) { // diag[0] is 1 here, so the zero test is on diag[1]
+        // [[1, 0], [0, 0]]
+        auto func = [&](const indexes_t& inds) -> void {
+          data_[inds[1]] = 0.0;
+        };
 #pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }
-                return;
-            }
-            if (diag[0] == 0.0) {
-                // [[0, 0], [0, 1]]
-                auto func = [&](const indexes_t &inds) -> void {
-                    data_[inds[0]] = 0.0;
-                };
+        for (int k = 0; k < (size_ >> 1); k += 1) {
+          const auto inds = indexes(qbit0, qbit0, k);
+          func(inds);
+        }
+        return;
+      }
+      // general [[1, 0], [0, z]]
+      auto func = [&](const indexes_t& inds,
+                      const vector<std::complex<double>>& _mat) -> void {
+        const auto k = inds[1];
+        data_[k] *= _mat[1];
+      };
 #pragma omp parallel for
-                for (int k = 0; k < (size_ >> 1); k+=1){
-                    const auto inds = indexes(qbit0, qbit0, k);
-                    func(inds);
-                }                
-                return;
-            }
-            // general [[z, 0], [0, 1]]
-            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
-                const auto k = inds[0];
-                data_[k] *= _mat[0];
-            };
+      for (int k = 0; k < (size_ >> 1); k += 1) {
+        const auto inds = indexes(qbit0, qbit0, k);
+        func(inds, diag); // pass diag itself; no per-iteration copy
+      }
+      return;
+    } else if (diag[1] == 1.0) {
+      // [[z, 0], [0, 1]] matrix
+      if (diag[0] == std::complex<double>(0., -1.)) {
+        // [[-i, 0], [0, 1]]
+        auto func = [&](const indexes_t& inds) -> void {
+          const auto k = inds[0]; // the -i factor sits on diag[0]
+          double cache = data_[k].imag();
+          data_[k].imag(data_[k].real() * -1.);
+          data_[k].real(cache);
+        };
 #pragma omp parallel for
-            for (int k = 0; k < (size_ >> 1); k+=1){
-                const auto inds = indexes(qbit0, qbit0, k);
-                func(inds, convert(diag));
-            }   
-            return;
-        } else {
-            // Lambda function for diagonal matrix multiplication
-            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
-                const auto k0 = inds[0];
-                const auto k1 = inds[1];
-                data_[k0] *= _mat[0];
-                data_[k1] *= _mat[1];
-            };
+        for (int k = 0; k < (size_ >> 1); k += 1) {
+          const auto inds = indexes(qbit0, qbit0, k);
+          func(inds);
+        }
+        return;
+      }
+      if (diag[0] == std::complex<double>(0., 1.)) {
+        // [[i, 0], [0, 1]]
+        auto func = [&](const indexes_t& inds) -> void {
+          const auto k = inds[0]; // the i factor sits on diag[0]
+          double cache = data_[k].imag();
+          data_[k].imag(data_[k].real());
+          data_[k].real(cache * -1.);
+        };
 #pragma omp parallel for
-            for (int k = 0; k < (size_ >> 1); k+=1){
-                const auto inds = indexes(qbit0, qbit0, k);
-                func(inds, convert(diag));
-            }  
+        for (int k = 0; k < (size_ >> 1); k += 1) {
+          const auto inds = indexes(qbit0, qbit0, k);
+          func(inds);
         }
         return;
-    }
-    const uint N = qbits.size();
-    auto func = [&](const indexes_t &inds, const vector<std::complex<double>> &_diag) -> void { 
-        for(int i=0; i<2; ++i){
-            const int k = inds[i];
-            int iv = 0;
-            for(int j=0;j<N;j++){
-                if((k & (1ULL << qbits[j])) !=0 )
-                    iv += (1ULL << j);
-            }
-            if(_diag[iv] != (double)1.0)
-                data_[k] *= _diag[iv];
+      }
+      if (diag[0] == 0.0) {
+        // [[0, 0], [0, 1]]
+        auto func = [&](const indexes_t& inds) -> void {
+          data_[inds[0]] = 0.0;
+        };
+#pragma omp parallel for
+        for (int k = 0; k < (size_ >> 1); k += 1) {
+          const auto inds = indexes(qbit0, qbit0, k);
+          func(inds);
         }
-    };
-    // apply func
-    vector<pos_t> qbit0{qbits[0]};
+        return;
+      }
+      // general [[z, 0], [0, 1]]
+      auto func = [&](const indexes_t& inds,
+                      const vector<std::complex<double>>& _mat) -> void {
+        const auto k = inds[0];
+        data_[k] *= _mat[0];
+      };
 #pragma omp parallel for
-    for (int k = 0; k < (size_ >> 1); k+=1){
+      for (int k = 0; k < (size_ >> 1); k += 1) {
         const auto inds = indexes(qbit0, qbit0, k);
-        func(inds, convert(diag));
+        func(inds, diag); // pass diag itself; no per-iteration copy
+      }
+      return;
+    } else {
+      // Lambda function for diagonal matrix multiplication
+      auto func = [&](const indexes_t& inds,
+                      const vector<std::complex<double>>& _mat) -> void {
+        const auto k0 = inds[0];
+        const auto k1 = inds[1];
+        data_[k0] *= _mat[0];
+        data_[k1] *= _mat[1];
+      };
+#pragma omp parallel for
+      for (int k = 0; k < (size_ >> 1); k += 1) {
+        const auto inds = indexes(qbit0, qbit0, k);
+        func(inds, diag); // pass diag itself; no per-iteration copy
+      }
+    }
+    return;
+  }
+  const uint N = qbits.size();
+  auto func = [&](const indexes_t& inds,
+                  const vector<std::complex<double>>& _diag) -> void {
+    for (int i = 0; i < 2; ++i) {
+      const int k = inds[i];
+      int iv = 0; // row of _diag: bit j of iv is bit qbits[j] of k
+      for (int j = 0; j < N; j++) {
+        if ((k & (1ULL << qbits[j])) != 0)
+          iv += (1ULL << j);
+      }
+      if (_diag[iv] != (double)1.0)
+        data_[k] *= _diag[iv];
     }
+  };
+  // apply func
+  vector<pos_t> qbit0{qbits[0]};
+#pragma omp parallel for
+  for (int k = 0; k < (size_ >> 1); k += 1) {
+    const auto inds = indexes(qbit0, qbit0, k);
+    func(inds, diag); // pass diag itself; no per-iteration copy
+  }
 }
 
 template <class real_t>
-void StateVector<real_t>::update(vector<pos_t> const& qbits, const uint final_state, const uint meas_state, const double meas_prob){
-    const uint dim = 1ULL << qbits.size();
-    vector<std::complex<double> >  matdiag(dim, 0.);
-    matdiag[meas_state] = 1./ std::sqrt(meas_prob);
-    apply_diagonal_matrix(qbits, matdiag);
-    
-    //TODO: Add reset
-    // for reset
-     if(final_state != meas_state){
-        if(qbits.size() == 1){
-            // apply a x gate
-            apply_x(qbits[0]);
-        }else{
-            // Diagonal matrix for projecting and renormalizing to measurement outcome
-            vector<std::complex<double> > perm(dim*dim, 0.);
-            perm[final_state * dim + meas_state] = 1.;
-            perm[meas_state * dim + final_state] = 1.;
-            for (uint j = 0; j < dim; j++){
-                if(j != final_state && j != meas_state) 
-                    perm[j * dim + j] = 1.;
-            }
-            // apply permutation to swap state
-            const uint N = qbits.size();
-            const uint DIM = 1ULL << N;
-            auto func = [&](const indexes_t &inds, const vector<std::complex<double> > &_mat) -> void {
-                // std::array<std::complex<double>, 1ULL << N > cache;
-                vector<std::complex<double> > cache(1ULL << N, 0.);
-                for(uint i = 0; i< DIM; i++){
-                    const auto ii = inds[i];
-                    cache[i] = data_[ii];
-                    data_[ii] = 0.;
-                }
-                for(uint i=0; i< DIM; i++)
-                    for(uint j=0; j<DIM; j++)
-                        data_[inds[i]] += _mat[i + DIM * j] * cache[j];
-            };
-            vector<pos_t> qs(qbits.begin(), qbits.end());
-            vector<pos_t> qs_sorted(qs.begin(), qs.end());
-            std::sort(qs_sorted.begin(), qs_sorted.end());
-            uint END = size_ >> qs.size();
-#pragma omp parallel for
-            for (int k = 0; k < END; k+=1){
-                const auto inds = indexes(qs, qs_sorted, k);
-                func(inds, convert(perm));
-            }  
+void StateVector<real_t>::update(vector<pos_t> const& qbits,
+                                 const uint final_state, const uint meas_state,
+                                 const double meas_prob) {
+  // Collapse: zero every outcome of qbits except meas_state and rescale the
+  // survivor by 1/sqrt(meas_prob); then, if requested, move it to final_state.
+  const uint dim = 1ULL << qbits.size();
+  vector<std::complex<double>> matdiag(dim, 0.);
+  matdiag[meas_state] = 1. / std::sqrt(meas_prob);
+  apply_diagonal_matrix(qbits, matdiag);
+
+  // Reset path: only taken when the caller asks for a different final state.
+  if (final_state != meas_state) {
+    if (qbits.size() == 1) {
+      // apply a x gate
+      apply_x(qbits[0]);
+    } else {
+      // Permutation matrix that swaps basis states meas_state and final_state
+      vector<std::complex<double>> perm(dim * dim, 0.);
+      perm[final_state * dim + meas_state] = 1.;
+      perm[meas_state * dim + final_state] = 1.;
+      for (uint j = 0; j < dim; j++) {
+        if (j != final_state && j != meas_state)
+          perm[j * dim + j] = 1.;
+      }
+      // apply permutation to swap state
+      const uint N = qbits.size();
+      const uint DIM = 1ULL << N;
+      auto func = [&](const indexes_t& inds,
+                      const vector<std::complex<double>>& _mat) -> void {
+        vector<std::complex<double>> cache(1ULL << N, 0.);
+        for (uint i = 0; i < DIM; i++) {
+          const auto ii = inds[i];
+          cache[i] = data_[ii];
+          data_[ii] = 0.;
         }
+        for (uint i = 0; i < DIM; i++)
+          for (uint j = 0; j < DIM; j++)
+            data_[inds[i]] += _mat[i + DIM * j] * cache[j];
+      };
+      vector<pos_t> qs(qbits.begin(), qbits.end());
+      vector<pos_t> qs_sorted(qs.begin(), qs.end());
+      std::sort(qs_sorted.begin(), qs_sorted.end());
+      uint END = size_ >> qs.size();
+#pragma omp parallel for
+      for (int k = 0; k < END; k += 1) {
+        const auto inds = indexes(qs, qs_sorted, k);
+        func(inds, perm); // pass perm itself; no dim*dim copy per iteration
+      }
     }
+  }
 }
 
-template <typename T>
-void printVector(const std::vector<T>& vec) {
-    for (const T& element : vec) {
-        std::cout << element << " ";
-    }
-    std::cout << std::endl;
+template <typename T> void printVector(const std::vector<T>& vec) {
+  for (const T& element : vec) {
+    std::cout << element << " "; // each element is followed by one space
+  }
+  std::cout << std::endl; // end the line and flush
 }
 
 template <class real_t>
-std::pair<uint, double> StateVector<real_t>::sample_measure_probs(vector<pos_t> const& qbits){
-    // 1. caculate actual measurement outcome
-    const int64_t N = qbits.size();
-    const int64_t DIM = 1LL << N;
-    const int64_t END = 1LL << (num_ - N);
-    vector<double> probs(DIM, 0.);
-    vector<uint> qubits_sorted(qbits.begin(), qbits.end());
-    
-    std::sort(qubits_sorted.begin(), qubits_sorted.end());
-    if ((num_ == N) && ( qubits_sorted == qbits )){
-        probs = probabilities();
-    }else{
-        vector<double> probs_private(DIM, 0.);
+std::pair<uint, double>
+StateVector<real_t>::sample_measure_probs(vector<pos_t> const& qbits) {
+  // 1. Accumulate the probability of each of the 2^N outcomes on qbits.
+  const int64_t N = qbits.size();
+  const int64_t DIM = 1LL << N;
+  const int64_t END = 1LL << (num_ - N);
+  vector<double> probs(DIM, 0.);
+  vector<uint> qubits_sorted(qbits.begin(), qbits.end());
+
+  std::sort(qubits_sorted.begin(), qubits_sorted.end());
+  if ((num_ == N) && (qubits_sorted == qbits)) {
+    probs = probabilities();
+  } else {
+    vector<double> probs_private(DIM, 0.);
 #pragma omp parallel for
-        for (int64_t k = 0; k < END; k++){
-            auto idx = indexes(qbits, qubits_sorted, k);
-            // std::cout<<"indexes"<<k<<": ";
-            // printVector(idx);
-            for(int64_t m = 0; m < DIM; ++m){
-                double local_prob = std::real(data_[idx[m]] * std::conj(data_[idx[m]]));
-                #pragma omp critical
-                probs_private[m] += local_prob;
-            }
-        }
-        // std::cout<<"probs_private:";
-        // printVector(probs_private);
-        #pragma omp critical
-        for(int64_t m = 0; m < DIM; ++m){
-            probs[m] += probs_private[m];
-        }
+    for (int64_t k = 0; k < END; k++) {
+      auto idx = indexes(qbits, qubits_sorted, k);
+      // presumably idx[m] is the amplitude index with outcome m on qbits
+      // (indexes() is defined elsewhere; confirm the ordering there)
+      for (int64_t m = 0; m < DIM; ++m) {
+        double local_prob = std::real(data_[idx[m]] * std::conj(data_[idx[m]]));
+#pragma omp critical
+        probs_private[m] += local_prob;
+      }
+    }
+// NOTE(review): probs_private is shared across threads despite its name;
+// every add above is serialized by the critical section.
+#pragma omp critical
+    for (int64_t m = 0; m < DIM; ++m) {
+      probs[m] += probs_private[m];
     }
-    set_rng();
-    // std::cout<<"probs:";
-    // printVector(probs);
-    uint outcome = std::discrete_distribution<uint>(probs.begin(), probs.end())(rng_);
-    return std::make_pair(outcome, probs[outcome]);
+  }
+  set_rng();
+  // 2. Draw an outcome with weights probs[m] using rng_ and return it
+  //    together with its probability.
+  uint outcome =
+      std::discrete_distribution<uint>(probs.begin(), probs.end())(rng_);
+  return std::make_pair(outcome, probs[outcome]);
 }
 
 // change to bit endian
-vector<uint> int2vec(uint n, uint base){
-    vector<uint> ret;
-    while(n >= base){
-        ret.push_back(n%base);
-        n /= base;
-    }
-    ret.push_back(n);
-    return ret;
+vector<uint> int2vec(uint n, uint base) {
+  vector<uint> ret; // digits of n in `base`, least-significant first
+  while (n >= base) { // NOTE(review): assumes base >= 2
+    ret.push_back(n % base);
+    n /= base;
+  }
+  ret.push_back(n); // last (most-significant) digit; also covers n == 0
+  return ret;
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_measure(vector<pos_t> const& qbits, vector<pos_t> const& cbits){
-    // 1. caculate actual measurement outcome
-    const auto meas = sample_measure_probs(qbits);
-    //2. update statevector
-    update(qbits, meas.first, meas.first, meas.second);
-    //3. store measure
-    vector<uint> outcome = int2vec(meas.first, 2);
-    if(outcome.size() < qbits.size()){
-        outcome.resize(qbits.size());
-    }
-    for(uint j=0; j < outcome.size(); j++){
-        creg_[cbits[j]] = outcome[j];
-    }
+void StateVector<real_t>::apply_measure(vector<pos_t> const& qbits,
+                                        vector<pos_t> const& cbits) {
+  // 1. sample the measurement outcome and its probability
+  const auto meas = sample_measure_probs(qbits);
+  // 2. update statevector (final state == measured state, so no reset)
+  update(qbits, meas.first, meas.first, meas.second);
+  // 3. store the outcome bits, least-significant first, into creg_
+  vector<uint> outcome = int2vec(meas.first, 2);
+  if (outcome.size() < qbits.size()) {
+    outcome.resize(qbits.size()); // pad the missing high bits with 0
+  }
+  for (uint j = 0; j < outcome.size(); j++) {
+    creg_[cbits[j]] = outcome[j]; // bit j goes to classical bit cbits[j]
+  }
 }
 
 template <class real_t>
-void StateVector<real_t>::apply_reset(vector<pos_t> const& qbits){
-    const auto meas = sample_measure_probs(qbits);
-    update(qbits, 0, meas.first, meas.second);
-}
\ No newline at end of file
+void StateVector<real_t>::apply_reset(vector<pos_t> const& qbits) {
+  const auto meas = sample_measure_probs(qbits); // (outcome, probability)
+  update(qbits, 0, meas.first, meas.second); // final_state 0, measured outcome
+}
diff --git a/src/qfvm/types.hpp b/src/qfvm/types.hpp
index 5ece0f7..8b20904 100644
--- a/src/qfvm/types.hpp
+++ b/src/qfvm/types.hpp
@@ -1,11 +1,11 @@
 #pragma once
 
+#include <Eigen/Core>
 #include <complex>
-#include <vector>
-#include <string>
 #include <iostream>
+#include <string>
 #include <tuple>
-#include <Eigen/Core>
+#include <vector>
 
 #ifdef _MSC_VER
 using omp_i = signed long long;
@@ -20,14 +20,14 @@ using omp_i = size_t;
 #endif
 #endif
 
-
 typedef unsigned int uint;
 using pos_t = uint;
 using data_t = double;
 using std::complex;
-using std::vector;
 using std::string;
-using RowMatrixXcd = Eigen::Matrix<complex<double>, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+using std::vector;
+using RowMatrixXcd = Eigen::Matrix<complex<double>, Eigen::Dynamic,
+                                   Eigen::Dynamic, Eigen::RowMajor>;
 
 const complex<double> imag_I = complex<double>(0, 1.);
 const double PI = 3.14159265358979323846;
diff --git a/src/qfvm/util.h b/src/qfvm/util.h
index 950b48e..d4962bd 100644
--- a/src/qfvm/util.h
+++ b/src/qfvm/util.h
@@ -1,99 +1,99 @@
 #pragma once
 
+#include <bitset>
 #include <iostream>
-#include <vector>
-#include <time.h>
 #include <regex>
+#include <time.h>
 #include <type_traits>
-#include <bitset>
+#include <vector>
 
-namespace Qfutil{
+namespace Qfutil {
 
-std::vector<int> randomArr(size_t length, size_t max){
-    srand((unsigned)time(NULL));
-    std::vector<int> arr(length);
-    for (size_t i=0;i < arr.size();i++){
-        arr[i] = rand()%max;
-    }
-    return arr;
+std::vector<int> randomArr(size_t length, size_t max) {
+  srand((unsigned)time(NULL));
+  std::vector<int> arr(length);
+  for (size_t i = 0; i < arr.size(); i++) {
+    arr[i] = rand() % max;
+  }
+  return arr;
 }
 
-int randomint(int min, int max){
-    srand((unsigned)time(NULL));
-    return (rand()%(max - min + 1)) + min;
+int randomint(int min, int max) {
+  srand((unsigned)time(NULL));
+  return (rand() % (max - min + 1)) + min;
 }
 
-static uint32_t randomize(uint32_t i){
-    i = (i^61)^(i>>16);
-    i *=9;
-    i ^= i<<4;
-    i *= 0x27d4eb2d;
-    i ^= i >> 15;
-    return i;
+static uint32_t randomize(uint32_t i) {
+  i = (i ^ 61) ^ (i >> 16);
+  i *= 9;
+  i ^= i << 4;
+  i *= 0x27d4eb2d;
+  i ^= i >> 15;
+  return i;
 }
 
-template<class T>
-void printVector(std::vector<T> const &arr){
-    for (auto i : arr){
-        std::cout << i << " ";
-    }
-    std::cout << std::endl;
+template <class T> void printVector(std::vector<T> const& arr) {
+  for (auto i : arr) {
+    std::cout << i << " ";
+  }
+  std::cout << std::endl;
 }
 
-
 std::vector<std::string> split_string(const std::string& str, char delim) {
-    std::vector<std::string> elems;
-    auto lastPos = str.find_first_not_of(delim, 0);
-    auto pos = str.find_first_of(delim, lastPos);
-    while (pos != std::string::npos || lastPos != std::string::npos) {
-        elems.push_back(str.substr(lastPos, pos - lastPos));
-        lastPos = str.find_first_not_of(delim, pos);
-        pos = str.find_first_of(delim, lastPos);
-    }
-    return elems;
+  std::vector<std::string> elems;
+  auto lastPos = str.find_first_not_of(delim, 0);
+  auto pos = str.find_first_of(delim, lastPos);
+  while (pos != std::string::npos || lastPos != std::string::npos) {
+    elems.push_back(str.substr(lastPos, pos - lastPos));
+    lastPos = str.find_first_not_of(delim, pos);
+    pos = str.find_first_of(delim, lastPos);
+  }
+  return elems;
 }
 
-std::vector<std::string> split_string(const std::string& str, char delim, uint num){
-    auto end = str.length();
-    std::vector<std::string> elems;
-    auto lastPos = str.find_first_not_of(delim, 0);
-    auto pos = str.find_first_of(delim, lastPos);
-    while ((pos != std::string::npos || lastPos != std::string::npos) && elems.size() < num) {
-        elems.push_back(str.substr(lastPos, pos - lastPos));
-        lastPos = str.find_first_not_of(delim, pos);
-        pos = str.find_first_of(delim, lastPos);
-    }
+std::vector<std::string> split_string(const std::string& str, char delim,
+                                      uint num) {
+  auto end = str.length();
+  std::vector<std::string> elems;
+  auto lastPos = str.find_first_not_of(delim, 0);
+  auto pos = str.find_first_of(delim, lastPos);
+  while ((pos != std::string::npos || lastPos != std::string::npos) &&
+         elems.size() < num) {
+    elems.push_back(str.substr(lastPos, pos - lastPos));
+    lastPos = str.find_first_not_of(delim, pos);
+    pos = str.find_first_of(delim, lastPos);
+  }
 
-    if ((pos != std::string::npos || lastPos != std::string::npos)){
-        elems.push_back(str.substr(lastPos, end));
-    }
-    return elems;
+  if ((pos != std::string::npos || lastPos != std::string::npos)) {
+    elems.push_back(str.substr(lastPos, end));
+  }
+  return elems;
 }
 
-template<class real_t>
-std::vector<real_t> find_numbers(const std::string &str){
-    std::smatch matchs;
-    std::vector<real_t> res;
-    std::regex pattern;
-    if (std::is_unsigned<real_t>::value){
-        pattern = std::regex("\\d+");
-    }else if(std::is_floating_point<real_t>::value){
-        pattern = std::regex("-?(([1-9]\\d*\\.\\d*)|(0\\.\\d*[1-9]\\d*))|\\d+");
-    }
+template <class real_t>
+std::vector<real_t> find_numbers(const std::string& str) {
+  std::smatch matchs;
+  std::vector<real_t> res;
+  std::regex pattern;
+  if (std::is_unsigned<real_t>::value) {
+    pattern = std::regex("\\d+");
+  } else if (std::is_floating_point<real_t>::value) {
+    pattern = std::regex("-?(([1-9]\\d*\\.\\d*)|(0\\.\\d*[1-9]\\d*))|\\d+");
+  }
 
-    auto begin = std::sregex_iterator(str.begin(), str.end(), pattern);
-    const std::sregex_iterator end;
+  auto begin = std::sregex_iterator(str.begin(), str.end(), pattern);
+  const std::sregex_iterator end;
 
-    for (std::sregex_iterator i = begin; i != end; ++i) {
-        std::string match_str = i->str();
-         if (std::is_unsigned<real_t>::value){
-            res.push_back(std::stoi(match_str));
-        }else if (std::is_floating_point<real_t>::value){
-            res.push_back(std::stod(match_str));
-        }
+  for (std::sregex_iterator i = begin; i != end; ++i) {
+    std::string match_str = i->str();
+    if (std::is_unsigned<real_t>::value) {
+      res.push_back(std::stoi(match_str));
+    } else if (std::is_floating_point<real_t>::value) {
+      res.push_back(std::stod(match_str));
     }
+  }
 
-    return res;
+  return res;
 }
 
-}//namespace Qfutil
\ No newline at end of file
+} // namespace Qfutil
diff --git a/src/qfvm_clifford/bit.h b/src/qfvm_clifford/bit.h
new file mode 100644
index 0000000..1be3467
--- /dev/null
+++ b/src/qfvm_clifford/bit.h
@@ -0,0 +1,55 @@
+#ifndef BIT_H_
+#define BIT_H_
+
+#include <cstddef>
+#include <cstdint>
+
+// bit in byte
+struct bit {
+  uint8_t* byte;
+  uint8_t byte_index;
+
+  bit(void* ptr, size_t offset)
+      : byte(((uint8_t*)ptr + (offset / 8))), byte_index(offset & 7) {}
+
+  // copy assignment for bit in byte
+  inline bit& operator=(bool value) {
+    // make bit be 0
+    *byte &= ~((uint8_t)1 << byte_index);
+    // assignment
+    *byte |= uint8_t(value) << byte_index;
+    return *this;
+  }
+
+  inline bit& operator=(const bit& other) {
+    *this = bool(other);
+    return *this;
+  }
+
+  // bit operator
+  inline bit& operator^=(bool value) {
+    *byte ^= uint8_t(value) << byte_index;
+    return *this;
+  }
+
+  inline bit& operator&=(bool value) {
+    *byte &= (uint8_t(value) << byte_index) | ~(uint8_t(1) << byte_index);
+    return *this;
+  }
+
+  inline bit& operator|=(bool value) {
+    *byte |= uint8_t(value) << byte_index;
+    return *this;
+  }
+
+  // conversion operator
+  inline operator bool() const { return (*byte >> byte_index) & 1; }
+
+  void swap(bit other) {
+    bool b = bool(other);
+    other = bool(*this);
+    *this = b;
+  }
+};
+
+#endif
diff --git a/src/qfvm_clifford/bit_word.h b/src/qfvm_clifford/bit_word.h
new file mode 100644
index 0000000..e832964
--- /dev/null
+++ b/src/qfvm_clifford/bit_word.h
@@ -0,0 +1,512 @@
+#ifndef TABLEAU_WORD_H_
+#define TABLEAU_WORD_H_
+
+#include "utils.h"
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+
+#ifdef USE_SIMD
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#endif
+
+#include <iostream>
+#include <sstream>
+
+// A bit_word is a bag of bits, which can be operated by individual CPU
+// instructions.
+// This template is necessary due to the varying interfaces between native types
+// such as uint64_t and intrinsics like __m256i across different architectures
+// and operating systems. In certain contexts, operators can be used on __m256i
+// values (e.g. a ^= b), while in others this is not possible. The bitword
+// template implementations establish a standard set of methods that are
+// essential for Clifford's operation, allowing the same code to be compiled
+// using either 256-bit or 64-bit registers depending on what is appropriate.
+template <size_t word_size> struct bit_word;
+
+/* =================================================== */
+/* ================ bit_word operation =============== */
+
+template <size_t word_size>
+inline bool operator==(const bit_word<word_size>& left,
+                       const bit_word<word_size>& right) {
+  return left.to_u64_array() == right.to_u64_array();
+}
+
+template <size_t word_size>
+inline bool operator!=(const bit_word<word_size>& left,
+                       const bit_word<word_size>& right) {
+  return !(left == right);
+}
+
+template <size_t word_size>
+inline bool operator<(const bit_word<word_size>& left,
+                      const bit_word<word_size>& right) {
+  auto array1 = left.to_u64_array();
+  auto array2 = right.to_u64_array();
+
+  for (size_t i = 0; i < array1.size(); ++i) {
+    if (array1[i] != array2[i]) {
+      return array1[i] < array2[i];
+    }
+  }
+  return false;
+}
+
+template <size_t word_size>
+inline bool operator==(const bit_word<word_size>& left, int right) {
+  return left == bit_word<word_size>(right);
+}
+
+template <size_t word_size>
+inline bool operator!=(const bit_word<word_size>& left, int right) {
+  return !(left == right); // "left != right" would call this overload again
+}
+
+template <size_t word_size>
+inline bool operator==(const bit_word<word_size>& left, uint64_t right) {
+  return left == bit_word<word_size>(right);
+}
+
+template <size_t word_size>
+inline bool operator!=(const bit_word<word_size>& left, uint64_t right) {
+  return !(left == right); // "left != right" would call this overload again
+}
+
+template <size_t word_size>
+inline bool operator==(const bit_word<word_size>& left, int64_t right) {
+  return left == bit_word<word_size>(right);
+}
+
+template <size_t word_size>
+inline bool operator!=(const bit_word<word_size>& left, int64_t right) {
+  return !(left == right); // "left != right" would call this overload again
+}
+
+// output 1 for bit 1, . for bit 0
+template <size_t word_size>
+std::ostream& operator<<(std::ostream& os, const bit_word<word_size>& word) {
+  os << "bit_word<" << word_size << ">{";
+  auto array1 = word.to_u64_array();
+  for (size_t i = 0; i < array1.size(); ++i) {
+    for (size_t j = 0; j < 64; ++j) {
+      if ((i | j) && (j & 7) == 0) {
+        os << ' ';
+      }
+      // ".1" is a char array
+      os << ".1"[(array1[i] >> j) & 1];
+    }
+  }
+  os << "}";
+  return os;
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator<<(const bit_word<word_size>& word,
+                                      int offset) {
+  return word.shift(offset);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator>>(const bit_word<word_size>& word,
+                                      int offset) {
+  return word.shift(-offset);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator<<=(bit_word<word_size>& word, int offset) {
+  return word = word.shift(offset);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator>>=(bit_word<word_size>& word, int offset) {
+  return word = word.shift(-offset);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator<<(const bit_word<word_size>& word,
+                                      uint64_t offset) {
+  return word.shift((int)offset);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator>>(const bit_word<word_size>& word,
+                                      uint64_t offset) {
+  return word.shift(-(int)offset);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator<<=(bit_word<word_size>& word,
+                                       uint64_t offset) {
+  return word = word.shift((int)offset);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator>>=(bit_word<word_size>& word,
+                                       uint64_t offset) {
+  return word = word.shift(-(int)offset);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator&(const bit_word<word_size>& word,
+                                     uint64_t mask) {
+  return word & bit_word<word_size>(mask);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator|(const bit_word<word_size>& word,
+                                     uint64_t mask) {
+  return word | bit_word<word_size>(mask);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator^(const bit_word<word_size>& word,
+                                     uint64_t mask) {
+  return word ^ bit_word<word_size>(mask);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator&(const bit_word<word_size>& word,
+                                     int64_t mask) {
+  return word & bit_word<word_size>(mask);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator|(const bit_word<word_size>& word,
+                                     int64_t mask) {
+  return word | bit_word<word_size>(mask);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator^(const bit_word<word_size>& word,
+                                     int64_t mask) {
+  return word ^ bit_word<word_size>(mask);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator&(const bit_word<word_size>& word,
+                                     int mask) {
+  return word & bit_word<word_size>(mask);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator|(const bit_word<word_size>& word,
+                                     int mask) {
+  return word | bit_word<word_size>(mask);
+}
+
+template <size_t word_size>
+inline bit_word<word_size> operator^(const bit_word<word_size>& word,
+                                     int mask) {
+  return word ^ bit_word<word_size>(mask);
+}
+
+/* =================================================== */
+/* ================ 64 bit version =================== */
+template <> struct bit_word<64> {
+  constexpr static size_t WORD_SIZE = 64;
+  constexpr static size_t BIT_POW = 6;
+
+  union {
+    uint8_t u8[8];
+    uint64_t u64[1];
+  };
+
+  inline constexpr bit_word<64>() : u64{} {}
+  inline constexpr bit_word<64>(uint64_t v) : u64{v} {}
+  inline constexpr bit_word<64>(int64_t v) : u64{(uint64_t)v} {}
+  inline constexpr bit_word<64>(int v) : u64{(uint64_t)v} {}
+
+  inline operator bool() const { return bool(u64[0]); }
+  inline operator int64_t() const { return int64_t(u64[0]); }
+  inline operator int() const { return int64_t(*this); }
+  inline operator uint64_t() const { return u64[0]; }
+
+  inline bit_word<64>& operator^=(const bit_word<64>& other) {
+    u64[0] ^= other.u64[0];
+    return *this;
+  }
+
+  inline bit_word<64>& operator&=(const bit_word<64>& other) {
+    u64[0] &= other.u64[0];
+    return *this;
+  }
+
+  inline bit_word<64>& operator|=(const bit_word<64>& other) {
+    u64[0] |= other.u64[0];
+    return *this;
+  }
+
+  inline bit_word<64> operator^(const bit_word<64>& other) const {
+    return bit_word<64>(u64[0] ^ other.u64[0]);
+  }
+
+  inline bit_word<64> operator&(const bit_word<64>& other) const {
+    return bit_word<64>(u64[0] & other.u64[0]);
+  }
+
+  inline bit_word<64> operator|(const bit_word<64>& other) const {
+    return bit_word<64>(u64[0] | other.u64[0]);
+  }
+
+  inline bit_word<64> andnot(const bit_word<64>& other) const {
+    return bit_word<64>(~u64[0] & other.u64[0]);
+  }
+
+  // convert bit word to string
+  operator std::string() const {
+    std::stringstream ss;
+    ss << *this;
+    return ss.str();
+  }
+
+  std::array<uint64_t, 1> to_u64_array() const {
+    return std::array<uint64_t, 1>{u64[0]};
+  }
+
+  inline bit_word<64> shift(int offset) const {
+    auto array1 = u64[0];
+    if (64 <= offset || -64 >= offset) {
+      return 0;
+    } else if (offset > 0) {
+      return bit_word<64>(array1 << offset);
+    } else {
+      return bit_word<64>(array1 >> -offset);
+    }
+  }
+
+  inline uint16_t count() const { return count_uint64_bits(u64[0]); }
+
+  static void* aligned_malloc(size_t bits) { return malloc(bits); }
+
+  static void aligned_free(void* ptr) { free(ptr); }
+
+  template <uint64_t mask, uint64_t shift>
+  static void inplace_transpose_64_step(uint64_t* data, size_t stride) {
+    for (size_t k = 0; k < 64; k++) {
+      if (k & shift)
+        continue;
+      uint64_t& x = data[stride * k];
+      uint64_t& y = data[stride * (k + shift)];
+      uint64_t a = x & mask;
+      uint64_t b = x & ~mask;
+      uint64_t c = y & mask;
+      uint64_t d = y & ~mask;
+      x = a | (c << shift);
+      y = (b >> shift) | d;
+    }
+  }
+
+  static void inplace_transpose_square(bit_word<64>* block_start,
+                                       size_t stride) {
+    inplace_transpose_64_step<0x5555555555555555ull, 1>((uint64_t*)block_start,
+                                                        stride);
+    inplace_transpose_64_step<0x3333333333333333ull, 2>((uint64_t*)block_start,
+                                                        stride);
+    inplace_transpose_64_step<0x0F0F0F0F0F0F0F0Full, 4>((uint64_t*)block_start,
+                                                        stride);
+    inplace_transpose_64_step<0x00FF00FF00FF00FFull, 8>((uint64_t*)block_start,
+                                                        stride);
+    inplace_transpose_64_step<0x0000FFFF0000FFFFull, 16>((uint64_t*)block_start,
+                                                         stride);
+    inplace_transpose_64_step<0x00000000FFFFFFFFull, 32>((uint64_t*)block_start,
+                                                         stride);
+  }
+};
+
+#ifdef USE_SIMD
+/* =================================================== */
+/* ================ 256 bit version ================== */
+template <> struct bit_word<256> {
+  constexpr static size_t WORD_SIZE = 256;
+  constexpr static size_t BIT_POW = 8;
+
+  union {
+    uint8_t u8[32];
+    uint64_t u64[4];
+    __m256i m256;
+  };
+
+  inline constexpr bit_word<256>() : m256(__m256i{}) {}
+  inline constexpr bit_word<256>(__m256i v) : m256(v) {}
+  inline bit_word<256>(uint64_t v) : m256{_mm256_set_epi64x(0, 0, 0, v)} {}
+  inline bit_word<256>(int64_t v)
+      : m256{_mm256_set_epi64x(-(v < 0), -(v < 0), -(v < 0), v)} {}
+  inline bit_word<256>(int v)
+      : m256{_mm256_set_epi64x(-(v < 0), -(v < 0), -(v < 0), v)} {}
+
+  inline operator bool() const {
+    return bool(u64[0] | u64[1] | u64[2] | u64[3]);
+  }
+  inline operator int64_t() const {
+    auto words = to_u64_array();
+    // x86 is little endian default
+    int64_t result = int64_t(words[0]);
+    uint64_t expected = result < 0 ? uint64_t(-1) : uint64_t(0);
+    if (words[1] != expected || words[2] != expected || words[3] != expected) {
+      throw std::runtime_error("int64_t overflow");
+    }
+    return result;
+  }
+  inline operator int() const { return int64_t(*this); }
+  inline operator uint64_t() const {
+    if (u64[1] || u64[2] || u64[3]) {
+      throw std::runtime_error("uint64_t overflow");
+    }
+    return u64[0];
+  }
+
+  inline bit_word<256>& operator^=(const bit_word<256>& other) {
+    m256 = _mm256_xor_si256(m256, other.m256);
+    return *this;
+  }
+
+  inline bit_word<256>& operator&=(const bit_word<256>& other) {
+    m256 = _mm256_and_si256(m256, other.m256);
+    return *this;
+  }
+
+  inline bit_word<256>& operator|=(const bit_word<256>& other) {
+    m256 = _mm256_or_si256(m256, other.m256);
+    return *this;
+  }
+
+  inline bit_word<256> operator^(const bit_word<256>& other) const {
+    return bit_word<256>(_mm256_xor_si256(m256, other.m256));
+  }
+
+  inline bit_word<256> operator&(const bit_word<256>& other) const {
+    return bit_word<256>(_mm256_and_si256(m256, other.m256));
+  }
+
+  inline bit_word<256> operator|(const bit_word<256>& other) const {
+    return bit_word<256>(_mm256_or_si256(m256, other.m256));
+  }
+
+  inline bit_word<256> andnot(const bit_word<256>& other) const {
+    return bit_word<256>(_mm256_andnot_si256(m256, other.m256));
+  }
+
+  // convert bit word to string
+  operator std::string() const {
+    std::stringstream ss;
+    ss << *this;
+    return ss.str();
+  }
+
+  std::array<uint64_t, 4> to_u64_array() const {
+    return std::array<uint64_t, 4>{u64[0], u64[1], u64[2], u64[3]};
+  }
+
+  inline bit_word<256> shift(int offset) const {
+    auto array = to_u64_array();
+    while (offset <= -64) {
+      array[0] = array[1];
+      array[1] = array[2];
+      array[2] = array[3];
+      array[3] = 0;
+      offset += 64;
+    }
+
+    while (offset >= 64) {
+      array[3] = array[2];
+      array[2] = array[1];
+      array[1] = array[0];
+      array[0] = 0;
+      offset -= 64;
+    }
+
+    __m256i low2high;
+    __m256i high2low;
+    if (offset < 0) {
+      low2high = _mm256_set_epi64x(0, array[3], array[2], array[1]);
+      high2low = _mm256_set_epi64x(array[3], array[2], array[1], array[0]);
+      offset += 64;
+    } else {
+      low2high = _mm256_set_epi64x(array[3], array[2], array[1], array[0]);
+      high2low = _mm256_set_epi64x(array[2], array[1], array[0], 0);
+    }
+
+    uint64_t mask = (uint64_t{1} << offset) - 1;
+    low2high = _mm256_slli_epi64(low2high, offset);
+    high2low = _mm256_srli_epi64(high2low, 64 - offset);
+    // for offset < 0, only w[1] in lower 64 bits is used
+    low2high = _mm256_and_si256(low2high, _mm256_set1_epi64x(~mask));
+    // for offset > 0, only w[0] in upper 64 bits is used
+    high2low = _mm256_and_si256(high2low, _mm256_set1_epi64x(mask));
+    return _mm256_or_si256(low2high, high2low);
+  }
+
+  inline uint16_t count() const {
+    return count_uint64_bits(u64[0]) + count_uint64_bits(u64[1]) +
+           count_uint64_bits(u64[2]) + count_uint64_bits(u64[3]);
+  }
+
+  static void* aligned_malloc(size_t bits) {
+    return _mm_malloc(bits, sizeof(__m256i));
+  }
+
+  static void aligned_free(void* ptr) { _mm_free(ptr); }
+
+  template <uint64_t shift>
+  static void inplace_transpose_256_step(__m256i mask, __m256i* data,
+                                         size_t stride) {
+    for (std::size_t k = 0; k < 256; k++) {
+      if (k & shift)
+        continue;
+
+      __m256i& x = data[stride * k];
+      __m256i& y = data[stride * (k + shift)];
+      __m256i a = _mm256_and_si256(x, mask);
+      __m256i b = _mm256_andnot_si256(mask, x);
+      __m256i c = _mm256_and_si256(y, mask);
+      __m256i d = _mm256_andnot_si256(mask, y);
+
+      x = _mm256_or_si256(a, _mm256_slli_epi64(c, shift));
+      y = _mm256_or_si256(_mm256_srli_epi64(b, shift), d);
+    }
+  }
+
+  static void inplace_transpose_64_and_128_step(bit_word<256>* data,
+                                                size_t stride) {
+    uint64_t* u64_ptr = (uint64_t*)data;
+    stride <<= 2;
+    for (std::size_t k = 0; k < 64; k++) {
+      std::swap(u64_ptr[stride * (k + 64 * 0) + 1],
+                u64_ptr[stride * (k + 64 * 1) + 0]);
+      std::swap(u64_ptr[stride * (k + 64 * 0) + 2],
+                u64_ptr[stride * (k + 64 * 2) + 0]);
+      std::swap(u64_ptr[stride * (k + 64 * 0) + 3],
+                u64_ptr[stride * (k + 64 * 3) + 0]);
+      std::swap(u64_ptr[stride * (k + 64 * 1) + 2],
+                u64_ptr[stride * (k + 64 * 2) + 1]);
+      std::swap(u64_ptr[stride * (k + 64 * 1) + 3],
+                u64_ptr[stride * (k + 64 * 3) + 1]);
+      std::swap(u64_ptr[stride * (k + 64 * 2) + 3],
+                u64_ptr[stride * (k + 64 * 3) + 2]);
+    }
+  }
+
+  static void inplace_transpose_square(bit_word<256>* data, size_t stride) {
+    inplace_transpose_256_step<1>(_mm256_set1_epi8(0x55), (__m256i*)data,
+                                  stride);
+    inplace_transpose_256_step<2>(_mm256_set1_epi8(0x33), (__m256i*)data,
+                                  stride);
+    inplace_transpose_256_step<4>(_mm256_set1_epi8(0x0F), (__m256i*)data,
+                                  stride);
+    inplace_transpose_256_step<8>(_mm256_set1_epi16(0x00FF), (__m256i*)data,
+                                  stride);
+    inplace_transpose_256_step<16>(_mm256_set1_epi32(0x0000FFFF),
+                                   (__m256i*)data, stride);
+    inplace_transpose_256_step<32>(_mm256_set1_epi64x(0x00000000FFFFFFFFull),
+                                   (__m256i*)data, stride);
+  }
+};
+#endif
+
+#endif
diff --git a/src/qfvm_clifford/circuit.h b/src/qfvm_clifford/circuit.h
new file mode 100644
index 0000000..958bad6
--- /dev/null
+++ b/src/qfvm_clifford/circuit.h
@@ -0,0 +1,179 @@
+#ifndef CIRCUIT_H_
+#define CIRCUIT_H_
+
+#include "span_ref.h"
+#include <algorithm>
+#include <cstddef>
+#include <string>
+#include <vector>
+
+// gate instruction, include gate name, target qubits and arguments
+struct circuit_instruction {
+  std::string gate;
+  span_ref<const size_t> targets;
+  span_ref<const double> args;
+
+  circuit_instruction() = delete;
+  circuit_instruction(std::string gate, span_ref<const size_t> targets,
+                      span_ref<const double> args = {})
+      : gate(gate), targets(targets), args(args) {}
+
+  // TODO: add validation for instruction
+  void validate() {}
+};
+
+inline std::ostream& operator<<(std::ostream& os,
+                                const circuit_instruction& instr) {
+  os << instr.gate << " "
+     << "targets: ";
+  for (auto& target : instr.targets) {
+    os << target << " ";
+  }
+
+  if (instr.args.size() > 0) {
+    os << "args: ";
+    for (auto& arg : instr.args) {
+      os << arg << " ";
+    }
+  }
+  os << "\n";
+  return os;
+}
+
+// quantum circuit, include a list of gate instructions, and buffer for targets
+// and args
+struct quantum_circuit {
+  std::vector<circuit_instruction> instr_list;
+  monotonic_buffer<size_t> targets_buf;
+  monotonic_buffer<double> args_buf;
+
+  quantum_circuit() : instr_list(), targets_buf(), args_buf() {}
+  quantum_circuit(const std::string& gate, const std::vector<size_t>& targets,
+                  const std::vector<double>& args = {})
+      : instr_list(), targets_buf(), args_buf() {
+    _append(gate, targets, args);
+  }
+
+  // copy constructor
+  quantum_circuit(const quantum_circuit& other)
+      : instr_list(other.instr_list),
+        targets_buf(other.targets_buf.total_allocated()),
+        args_buf(other.args_buf.total_allocated()) {
+
+    // take copy of targets and args
+    for (auto& instr : instr_list) {
+      instr.targets = targets_buf.take_copy(instr.targets);
+      instr.args = args_buf.take_copy(instr.args);
+    }
+  };
+
+  // move constructor
+  quantum_circuit(quantum_circuit&& other) noexcept
+      : instr_list(std::move(other.instr_list)),
+        targets_buf(std::move(other.targets_buf)),
+        args_buf(std::move(other.args_buf)) {}
+
+  // copy assignment: deep-copy via the copy constructor, then move it in.
+  // The self-assignment guard matters: replacing targets_buf/args_buf in
+  // place while our own spans still point into them would (presumably, given
+  // an owning monotonic_buffer -- TODO confirm) read released storage.
+  quantum_circuit& operator=(const quantum_circuit& other) {
+    if (this != &other) {
+      quantum_circuit deep_copy(other);
+      *this = std::move(deep_copy);
+    }
+    return *this;
+  }
+
+  // move assignment
+  quantum_circuit& operator=(quantum_circuit&& other) noexcept {
+    instr_list = std::move(other.instr_list);
+    targets_buf = std::move(other.targets_buf);
+    args_buf = std::move(other.args_buf);
+    return *this;
+  }
+
+  // Append other's instructions to this circuit (self-append is allowed).
+  quantum_circuit& operator+=(const quantum_circuit& other) {
+    span_ref<const circuit_instruction> other_instrs(other.instr_list);
+    // Self-append: snapshot first, since a range insert from a vector into
+    // itself is undefined once the insert reallocates. Buffers are untouched.
+    if (&other == this) {
+      const std::vector<circuit_instruction> snapshot(instr_list);
+      instr_list.insert(instr_list.end(), snapshot.begin(), snapshot.end());
+      return *this;
+    }
+    // Distinct circuit: copy targets/args into our own buffers.
+    for (auto& instr : other_instrs) {
+      auto instr_targets = targets_buf.take_copy(instr.targets);
+      auto instr_args = args_buf.take_copy(instr.args);
+      instr_list.push_back({instr.gate, instr_targets, instr_args});
+    }
+    return *this;
+  }
+
+  // concatenate two quantum circuits
+  quantum_circuit operator+(const quantum_circuit& other) {
+    quantum_circuit result(*this);
+    result += other;
+    return result;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const quantum_circuit& qc) {
+    for (auto& instr : qc.instr_list) {
+      os << instr;
+    }
+    return os;
+  }
+
+  operator std::string() const { return str(); }
+
+  std::string str() const {
+    std::stringstream ss;
+    ss << *this;
+    return ss.str();
+  }
+
+  quantum_circuit& append(const std::string& gate,
+                          const std::vector<size_t>& targets,
+                          const std::vector<double>& args = {}) {
+
+    _append(gate, targets, args);
+
+    return *this;
+  }
+
+  quantum_circuit& _append(const std::string& gate,
+                           span_ref<const size_t> targets,
+                           span_ref<const double> args) {
+    circuit_instruction instr(gate, targets, args);
+    instr.validate();
+
+    // TODO: fuse some instr
+    instr.targets = targets_buf.take_copy(targets);
+    instr.args = args_buf.take_copy(args);
+    instr_list.push_back(instr);
+
+    return *this;
+  }
+
+  size_t max_qubit() const {
+    size_t max_qubit = 0;
+    for (auto& instr : instr_list) {
+      for (auto& target : instr.targets) {
+        max_qubit = std::max(max_qubit, target);
+      }
+    }
+
+    return max_qubit;
+  }
+
+  template <typename func>
+  void for_each_circuit_instruction(const func& callback) const {
+    for (auto& instr : instr_list) {
+      callback(instr);
+    }
+  }
+};
+
+#endif
diff --git a/src/qfvm_clifford/clifford_simulator.h b/src/qfvm_clifford/clifford_simulator.h
new file mode 100644
index 0000000..cc01cf6
--- /dev/null
+++ b/src/qfvm_clifford/clifford_simulator.h
@@ -0,0 +1,185 @@
+#ifndef SIMULATOR_H_
+#define SIMULATOR_H_
+
+#include "circuit.h"
+#include "span_ref.h"
+#include "table.h"
+#include "tableau.h"
+#include <cstddef>
+#include <ios>
+#include <ostream>
+#include <tuple>
+
+// Store measurement results.
+struct measurement_record {
+  // one entry: (qubit, cbit, result) exactly as passed to record()
+  using result = std::tuple<size_t, size_t, bool>;
+  std::vector<result> storage;
+
+  // append one measurement outcome
+  void record(size_t qubit, size_t cbit, bool result) {
+    storage.emplace_back(qubit, cbit, result);
+  }
+  // Element access by reference: writes through the non-const overload reach
+  // the stored entry, and the const overload no longer copies the tuple.
+  // No bounds checking is performed.
+  auto& operator[](size_t index) { return storage[index]; }
+  const auto& operator[](size_t index) const { return storage[index]; }
+  // iteration over the stored entries, in recording order
+  auto begin() { return storage.begin(); }
+  auto end() { return storage.end(); }
+  auto begin() const { return storage.begin(); }
+  auto end() const { return storage.end(); }
+  auto cbegin() const { return storage.cbegin(); }
+  auto cend() const { return storage.cend(); }
+  // number of recorded outcomes
+  auto size() const { return storage.size(); }
+  // drop every recorded outcome
+  void clear() { storage.clear(); }
+};
+
+// Quantum circuit simulator. Owns the tableau being evolved, the record of
+// measurement results and the random number generator used by error gates
+// and measurements.
+template <size_t word_size> struct circuit_simulator {
+
+  tableau<word_size> sim_tableau;
+  measurement_record sim_record;
+  std::mt19937_64 rng;
+
+  // constructor
+  // The number of qubits is specified by num_qubits. The random number
+  // generator, seeded with `seed`, is used to generate random numbers for
+  // error gates and measurements.
+  explicit circuit_simulator(size_t num_qubits, size_t seed = 42)
+      : sim_tableau(num_qubits), sim_record(), rng() {
+    rng.seed(seed);
+  }
+
+  // Apply a gate that needs no random numbers. The callable stored at index
+  // `vec_size - 1` of `f` is selected with std::get and invoked with the
+  // tableau followed by vec[0], ..., vec[vec_size - 1].
+  template <size_t vec_size, size_t... S>
+  result unpack_vector(auto f, tableau<word_size>& t, auto& vec,
+                       std::index_sequence<S...>) {
+
+    // TODO: maybe can be better
+    return std::get<vec_size - 1>(f)(t, vec[S]...);
+  }
+
+  // Entry point for gates that need no random numbers: checks that `vec`
+  // holds exactly `vec_size` targets (std::runtime_error otherwise), then
+  // expands them into individual arguments.
+  template <size_t vec_size>
+  result unpack_vector(auto f, tableau<word_size>& t, auto& vec) {
+    if (vec.size() != vec_size)
+      throw std::runtime_error("wrong number of qubits for gate");
+    return unpack_vector<vec_size>(f, t, vec,
+                                   std::make_index_sequence<vec_size>());
+  }
+
+  // Apply a collapsing gate: same expansion as above, but the callable is
+  // the gate_type::COLLAPSING_GATE entry of `f` and additionally receives
+  // the random number generator.
+  template <size_t... S>
+  result unpack_vector(auto f, tableau<word_size>& t, auto& vec,
+                       std::mt19937_64& rng, std::index_sequence<S...>) {
+
+    // TODO: maybe can be better
+    return std::get<gate_type::COLLAPSING_GATE>(f)(t, rng, vec[S]...);
+  }
+
+  // Entry point for collapsing gates: checks the number of targets
+  // (std::runtime_error otherwise), then expands them.
+  template <size_t vec_size>
+  result unpack_vector(auto f, tableau<word_size>& t, auto& vec,
+                       std::mt19937_64& rng) {
+    if (vec.size() != vec_size)
+      throw std::runtime_error("wrong number of qubits for gate");
+    return unpack_vector(f, t, vec, rng, std::make_index_sequence<vec_size>());
+  }
+
+  // Run a whole quantum circuit. If the circuit addresses a qubit index the
+  // tableau does not have yet, the tableau is expanded first; every
+  // instruction is then executed through do_circuit_instruction.
+  void do_circuit(const quantum_circuit& qc) {
+    // max_qubit() walks the whole circuit, so evaluate it only once
+    const size_t max_qubit = qc.max_qubit();
+    if (max_qubit >= sim_tableau.num_qubits) {
+      // Qubit index max_qubit requires max_qubit + 1 qubits. The first
+      // argument of expand is taken to be the new qubit count, as the
+      // warning below states (confirm against tableau::expand).
+      const size_t needed_qubits = max_qubit + 1;
+      sim_tableau.expand(needed_qubits, 1.0);
+      std::cerr << "WARNING: expanding tableau to " << needed_qubits
+                << " qubits\n";
+    }
+
+    // dispatch every instruction according to its gate type
+    qc.for_each_circuit_instruction(
+        [this](const circuit_instruction& ci) { do_circuit_instruction(ci); });
+  }
+
+  // Execute a single quantum circuit instruction.
+  // The gate is looked up in gate_map under "<gate>_gate" and dispatched on
+  // its registered type:
+  //  - single/two qubit gates are applied to the tableau;
+  //  - collapsing gates are applied with the random number generator; when
+  //    they report a value it is recorded for qubit targets[0] and classical
+  //    bit args[0];
+  //  - error gates are applied with probability args[0].
+  // Throws std::runtime_error for an unknown gate, a wrong number of targets
+  // or a missing/invalid argument. The argument of a measurement is checked
+  // after the gate has run, because collapsing gates that report no value
+  // (resets) do not need one.
+  void do_circuit_instruction(const circuit_instruction& ci) {
+
+    // single lookup instead of contains() followed by operator[]
+    const auto entry = gate_map<word_size>.find(ci.gate + "_gate");
+    if (entry == gate_map<word_size>.end()) {
+      throw std::runtime_error("unknown gate");
+    }
+
+    auto pair = entry->second;
+
+    if (SINGLE_QUBIT_GATE == pair.first) {
+      unpack_vector<1>(pair.second, sim_tableau, ci.targets);
+    } else if (TWO_QUBIT_GATE == pair.first) {
+      unpack_vector<2>(pair.second, sim_tableau, ci.targets);
+    } else if (COLLAPSING_GATE == pair.first) {
+      auto record = unpack_vector<1>(pair.second, sim_tableau, ci.targets, rng);
+      if (record.has_value()) {
+        if (ci.args.size() == 0)
+          throw std::runtime_error(
+              "measurement gate requires a classical bit argument");
+        sim_record.record(ci.targets[0], static_cast<size_t>(ci.args[0]),
+                          record.value());
+      }
+    } else if (ERROR_QUBIT_GATE == pair.first) {
+      if (ci.args.size() == 0)
+        throw std::runtime_error("error gate requires a probability argument");
+      const double probability = ci.args[0];
+      // std::bernoulli_distribution requires 0 <= p <= 1; the negated form
+      // also rejects NaN
+      if (!(probability >= 0.0 && probability <= 1.0))
+        throw std::runtime_error("error gate probability must be in [0, 1]");
+      std::bernoulli_distribution d(probability);
+      if (d(rng))
+        unpack_vector<1>(pair.second, sim_tableau, ci.targets);
+    } else {
+      throw std::runtime_error("unknown gate");
+    }
+  }
+
+  // Sample the quantum circuit num_samples times; the tableau is reset to
+  // identity before each run. sim_record is not cleared in between, so the
+  // results of all runs accumulate in it.
+  void sample(const quantum_circuit& qc, size_t num_samples) {
+    for (size_t i = 0; i < num_samples; i++) {
+      reset_tableau();
+      do_circuit(qc);
+    }
+  }
+
+  // reset the tableau to identity
+  void reset_tableau() { sim_tableau.reset(); }
+
+  // z-basis reset
+  // NOTE(review): measure_all calls the tableau member m_gate(rng, qubit);
+  // confirm that tableau really offers reset(rng, qubit) and that this should
+  // not be reset_gate(rng, qubit).
+  void reset(size_t qubit) { sim_tableau.reset(rng, qubit); }
+
+  // // x-basis reset
+  // void reset_x(size_t qubit) { sim_tableau.reset_x(rng, qubit); }
+
+  // // y-basis reset
+  // void reset_y(size_t qubit) { sim_tableau.reset_y(rng, qubit); }
+
+  // measurement record size
+  size_t record_size() const { return sim_record.size(); }
+
+  // get a copy of the measurement record
+  auto current_measurement_record() const { return sim_record; }
+
+  // Measure every qubit of the tableau with m_gate and return the outcomes in
+  // a fresh record (sim_record is left untouched). Each outcome is stored
+  // with the qubit index used as classical bit index as well.
+  auto measure_all() {
+    measurement_record mr;
+    for (size_t i = 0; i < sim_tableau.num_qubits; i++) {
+      auto record = sim_tableau.m_gate(rng, i);
+      if (record.has_value())
+        mr.record(i, i, record.value());
+    }
+    return mr;
+  }
+};
+
+#endif
diff --git a/src/qfvm_clifford/gate_list.h b/src/qfvm_clifford/gate_list.h
new file mode 100644
index 0000000..3d74a18
--- /dev/null
+++ b/src/qfvm_clifford/gate_list.h
@@ -0,0 +1,246 @@
+#include "gate_macro.h"
+#include "tableau.h"
+#include <type_traits>
+
+// Hadamard: exchanges the distabilizer and stabilizer entries of the qubit.
+// Like every gate in this list that reports no value, it returns an empty
+// result.
+SINGLE_QUBIT_GATE(h, {
+  t.distabilizer[qubit].swap(t.stabilizer[qubit]);
+  return {};
+})
+
+// Identity: leaves the tableau untouched.
+SINGLE_QUBIT_GATE(i, { return {}; })
+
+// Pauli X: toggles the sign of the qubit's stabilizer entry.
+SINGLE_QUBIT_GATE(x, {
+  t.stabilizer[qubit].sign ^= true;
+  return {};
+})
+
+// Pauli Y: toggles the sign of both the distabilizer and stabilizer entries.
+SINGLE_QUBIT_GATE(y, {
+  t.distabilizer[qubit].sign ^= true;
+  t.stabilizer[qubit].sign ^= true;
+  return {};
+})
+
+// Pauli Z: toggles the sign of the qubit's distabilizer entry.
+SINGLE_QUBIT_GATE(z, {
+  t.distabilizer[qubit].sign ^= true;
+  return {};
+})
+
+// X error: same tableau update as the x gate, but registered through
+// ERROR_QUBIT_GATE so that it is classified as an error gate.
+ERROR_QUBIT_GATE(x_error, {
+  t.stabilizer[qubit].sign ^= true;
+  return {};
+})
+
+// Y error: same tableau update as the y gate, registered as an error gate.
+ERROR_QUBIT_GATE(y_error, {
+  t.distabilizer[qubit].sign ^= true;
+  t.stabilizer[qubit].sign ^= true;
+  return {};
+})
+
+// Z error: same tableau update as the z gate, registered as an error gate.
+ERROR_QUBIT_GATE(z_error, {
+  t.distabilizer[qubit].sign ^= true;
+  return {};
+})
+
+// H_YZ: multiplies the stabilizer entry by the distabilizer entry (through
+// mul_ignore_anti_commute), then applies the z gate to the qubit.
+SINGLE_QUBIT_GATE(h_yz, {
+  t.stabilizer[qubit].mul_ignore_anti_commute(t.distabilizer[qubit]);
+  t.z_gate(qubit);
+  return {};
+})
+
+// S dagger: multiplies the distabilizer entry by the stabilizer entry
+// (through mul_ignore_anti_commute).
+SINGLE_QUBIT_GATE(s_dag, {
+  t.distabilizer[qubit].mul_ignore_anti_commute(t.stabilizer[qubit]);
+  return {};
+})
+
+// S: implemented as s_dag followed by the z gate.
+SINGLE_QUBIT_GATE(s, {
+  t.s_dag_gate(qubit);
+  t.z_gate(qubit);
+  return {};
+})
+
+// Controlled-NOT. Arguments: control (qubit1), target (qubit2).
+// The target's stabilizer entry is multiplied by the control's, and the
+// control's distabilizer entry is multiplied by the target's.
+TWO_QUBIT_GATE(cnot, {
+  t.stabilizer[qubit2] *= t.stabilizer[qubit1];
+  t.distabilizer[qubit1] *= t.distabilizer[qubit2];
+  return {};
+})
+
+// Alias of cnot (identical body). Arguments: control (qubit1), target
+// (qubit2).
+TWO_QUBIT_GATE(cx, {
+  t.stabilizer[qubit2] *= t.stabilizer[qubit1];
+  t.distabilizer[qubit1] *= t.distabilizer[qubit2];
+  return {};
+})
+
+// Swap: exchanges both the stabilizer and the distabilizer entries of the
+// two qubits.
+TWO_QUBIT_GATE(swap, {
+  t.stabilizer[qubit1].swap(t.stabilizer[qubit2]);
+  t.distabilizer[qubit1].swap(t.distabilizer[qubit2]);
+  return {};
+})
+
+// Z-basis measurement. Projects each target qubit into |0> or |1> and reports
+// its value (false=|0>, true=|1>).
+// The qubit is only collapsed when its Z outcome is not already
+// deterministic; the reported value is the sign of the qubit's stabilizer
+// entry.
+COLLAPSING_GATE(m, {
+  if (!t.is_deterministic_z(qubit)) {
+
+    tableau_trans<word_size> t_trans(t);
+    t.collapse_qubit_along_z(t_trans, qubit, rng);
+  }
+
+  return t.stabilizer.signs[qubit];
+})
+
+// Z-basis measurement. Projects each target qubit into |0> or |1> and reports
+// its value (false=|0>, true=|1>).
+// Alias of m (identical body).
+COLLAPSING_GATE(measure, {
+  if (!t.is_deterministic_z(qubit)) {
+
+    tableau_trans<word_size> t_trans(t);
+    t.collapse_qubit_along_z(t_trans, qubit, rng);
+  }
+
+  return t.stabilizer.signs[qubit];
+})
+
+// Z-basis measurement. Projects each target qubit into |0> or |1> and reports
+// its value (false=|0>, true=|1>).
+// Alias of m (identical body).
+COLLAPSING_GATE(mz, {
+  if (!t.is_deterministic_z(qubit)) {
+
+    tableau_trans<word_size> t_trans(t);
+    t.collapse_qubit_along_z(t_trans, qubit, rng);
+  }
+
+  return t.stabilizer.signs[qubit];
+})
+
+// Y-basis measurement. Projects each target qubit into |i> or |-i> and reports
+// its value (false=|i>, true=|-i>).
+// The qubit is only collapsed when its Y outcome is not already
+// deterministic; the reported value is the sign of eval_y_obs(qubit).
+COLLAPSING_GATE(my, {
+  if (!t.is_deterministic_y(qubit)) {
+
+    // H_YZ exchanges the Y and Z axes, so a Z-basis collapse sandwiched
+    // between two h_yz gates is a Y-basis collapse (the plain Hadamard would
+    // exchange X and Z instead).
+    t.h_yz_gate(qubit);
+    {
+      // t_trans lives in its own scope so that it is destroyed before the
+      // second basis change (presumably it restores the tableau layout on
+      // destruction -- confirm against tableau_trans).
+      tableau_trans<word_size> t_trans(t);
+      t.collapse_qubit_along_z(t_trans, qubit, rng);
+    }
+    t.h_yz_gate(qubit);
+  }
+
+  return t.eval_y_obs(qubit).sign;
+})
+
+// X-basis measurement. Projects each target qubit into |+> or |-> and reports
+// its value (false=|+>, true=|->).
+// The qubit is only collapsed when its X outcome is not already
+// deterministic; the reported value is the sign of the qubit's distabilizer
+// entry.
+COLLAPSING_GATE(mx, {
+  if (!t.is_deterministic_x(qubit)) {
+
+    // The Hadamard exchanges the X and Z axes, so a Z-basis collapse
+    // sandwiched between two h gates is an X-basis collapse (H_YZ would
+    // exchange Y and Z instead).
+    t.h_gate(qubit);
+    {
+      // t_trans lives in its own scope so that it is destroyed before the
+      // second basis change (presumably it restores the tableau layout on
+      // destruction -- confirm against tableau_trans).
+      tableau_trans<word_size> t_trans(t);
+      t.collapse_qubit_along_z(t_trans, qubit, rng);
+    }
+    t.h_gate(qubit);
+  }
+
+  return t.distabilizer.signs[qubit];
+})
+
+// Z-basis reset. Forces each target qubit into the |0> state by silently
+// measuring it in the Z basis and applying an X gate if it ended up in the |1>
+// state.
+// Reports no value (returns an empty result), so nothing is recorded for it.
+COLLAPSING_GATE(r, {
+  // Collapse the qubits to be reset.
+  if (!t.is_deterministic_z(qubit)) {
+
+    tableau_trans<word_size> t_trans(t);
+    t.collapse_qubit_along_z(t_trans, qubit, rng);
+  }
+
+  // Force the collapsed qubits into the ground state.
+  // Both signs are cleared unconditionally rather than by a conditional X.
+  t.distabilizer.signs[qubit] = false;
+  t.stabilizer.signs[qubit] = false;
+
+  return {};
+})
+
+// Z-basis reset. Forces each target qubit into the |0> state by silently
+// measuring it in the Z basis and applying an X gate if it ended up in the |1>
+// state.
+// Alias of r (identical body).
+COLLAPSING_GATE(reset, {
+  // Collapse the qubits to be reset.
+  if (!t.is_deterministic_z(qubit)) {
+
+    tableau_trans<word_size> t_trans(t);
+    t.collapse_qubit_along_z(t_trans, qubit, rng);
+  }
+
+  // Force the collapsed qubits into the ground state.
+  // Both signs are cleared unconditionally rather than by a conditional X.
+  t.distabilizer.signs[qubit] = false;
+  t.stabilizer.signs[qubit] = false;
+
+  return {};
+})
+
+// // Z-basis reset. Forces each target qubit into the |0> state by silently
+// // measuring it in the Z basis and applying an X gate if it ended up in the
+// |1>
+// // state.
+// COLLAPSING_GATE(rz, {
+//   // Collapse the qubits to be reset.
+//   if (!t.is_deterministic_z(qubit)) {
+
+//     tableau_trans<word_size> t_trans(t);
+//     t.collapse_qubit_along_z(t_trans, qubit, rng);
+//   }
+
+//   // Force the collapsed qubits into the ground state.
+//   t.distabilizer.signs[qubit] = false;
+//   t.stabilizer.signs[qubit] = false;
+
+//   return {};
+// })
+
+// // X-basis reset. Forces each target qubit into the |+> state by silently
+// // measuring it in the X basis and applying a Z gate if it ended up in the
+// |->
+// // state.
+// COLLAPSING_GATE(rx, {
+//   // Collapse the qubits to be reset.
+//   if (!t.is_deterministic_x(qubit)) {
+
+//     t.h_yz_gate(qubit);
+//     tableau_trans<word_size> t_trans(t);
+//     t.collapse_qubit_along_z(t_trans, qubit, rng);
+//     t.h_yz_gate(qubit);
+//   }
+
+//   // Force the collapsed qubits into the ground state.
+//   t.distabilizer.signs[qubit] = false;
+//   t.stabilizer.signs[qubit] = false;
+
+//   return {};
+// })
+
+// // Y-basis reset. Forces each target qubit into the |i> state by silently
+// // measuring it in the Y basis and applying an X gate if it ended up in the
+// |-i>
+// // state.
+// COLLAPSING_GATE(ry, {
+//   // Collapse the qubits to be reset.
+//   if (!t.is_deterministic_y(qubit)) {
+
+//     t.h_gate(qubit);
+//     tableau_trans<word_size> t_trans(t);
+//     t.collapse_qubit_along_z(t_trans, qubit, rng);
+//     t.h_gate(qubit);
+//   }
+
+//   // Force the collapsed qubits into the ground state.
+//   t.distabilizer.signs[qubit] = false;
+//   t.stabilizer.signs[qubit] = false;
+//   t.stabilizer.signs[qubit] ^= t.eval_y_obs(qubit).sign;
+
+//   return {};
+// })
+
+#undef SINGLE_QUBIT_GATE
+#undef TWO_QUBIT_GATE
+#undef COLLAPSING_GATE
+#undef ERROR_QUBIT_GATE
diff --git a/src/qfvm_clifford/gate_macro.h b/src/qfvm_clifford/gate_macro.h
new file mode 100644
index 0000000..4544bea
--- /dev/null
+++ b/src/qfvm_clifford/gate_macro.h
@@ -0,0 +1,65 @@
+#include <functional>
+
+// Free-function registration. With FUNCTION_REGISTRATION defined, every gate
+// macro expands to a function template named <GATE_NAME>_gate that receives
+// the tableau as `t`, followed by the target qubit(s); collapsing gates also
+// receive the random number generator as `rng`. The brace-enclosed gate body
+// written at the use site is passed through __VA_ARGS__ (variadic, so commas
+// inside the body do not split it) and becomes the function body.
+#ifdef FUNCTION_REGISTRATION
+#define SINGLE_QUBIT_GATE(GATE_NAME, ...)                                      \
+  template <size_t word_size>                                                  \
+  result GATE_NAME##_gate(tableau<word_size>& t, const size_t qubit)           \
+      __VA_ARGS__
+
+#define TWO_QUBIT_GATE(GATE_NAME, ...)                                         \
+  template <size_t word_size>                                                  \
+  result GATE_NAME##_gate(tableau<word_size>& t, const size_t qubit1,          \
+                          const size_t qubit2) __VA_ARGS__
+
+#define COLLAPSING_GATE(GATE_NAME, ...)                                        \
+  template <size_t word_size>                                                  \
+  result GATE_NAME##_gate(tableau<word_size>& t, std::mt19937_64& rng,         \
+                          const size_t qubit) __VA_ARGS__
+
+// same signature as SINGLE_QUBIT_GATE
+#define ERROR_QUBIT_GATE(GATE_NAME, ...)                                       \
+  template <size_t word_size>                                                  \
+  result GATE_NAME##_gate(tableau<word_size>& t, const size_t qubit)           \
+      __VA_ARGS__
+#endif
+
+// Member-function registration. With STRUCT_FUNCTION_REGISTRATION defined,
+// every gate macro expands to a member function named <GATE_NAME>_gate. The
+// expansion binds `t` to *this as a tableau<word_size>&, so it is meant to be
+// used inside that class, and the gate bodies can be shared with the
+// free-function form, which also calls the tableau `t`.
+#ifdef STRUCT_FUNCTION_REGISTRATION
+#define SINGLE_QUBIT_GATE(GATE_NAME, ...)                                      \
+  result GATE_NAME##_gate(const size_t qubit) {                                \
+    tableau<word_size>& t = *this;                                             \
+    __VA_ARGS__                                                                \
+  }
+
+#define TWO_QUBIT_GATE(GATE_NAME, ...)                                         \
+  result GATE_NAME##_gate(const size_t qubit1, const size_t qubit2) {          \
+    tableau<word_size>& t = *this;                                             \
+    __VA_ARGS__                                                                \
+  }
+
+#define COLLAPSING_GATE(GATE_NAME, ...)                                        \
+  result GATE_NAME##_gate(std::mt19937_64& rng, const size_t qubit) {          \
+    tableau<word_size>& t = *this;                                             \
+    __VA_ARGS__                                                                \
+  }
+
+// same signature as SINGLE_QUBIT_GATE
+#define ERROR_QUBIT_GATE(GATE_NAME, ...)                                       \
+  result GATE_NAME##_gate(const size_t qubit) {                                \
+    tableau<word_size>& t = *this;                                             \
+    __VA_ARGS__                                                                \
+  }
+#endif
+
+// Gate-map registration. With GATE_MAP_REGISTRATION defined, every gate macro
+// expands to one braced initializer entry followed by a comma:
+//   {"<GATE_NAME>_gate" (via STRINGIZE), {<gate kind>, <GATE_NAME>_gate<word_size>}},
+// The gate body is ignored. Inside its own definition a macro name is not
+// expanded again, so SINGLE_QUBIT_GATE etc. in the entry denote ordinary
+// identifiers (presumably the gate_type enumerators -- confirm where the map
+// is declared). STRINGIZE is not defined in this header.
+#ifdef GATE_MAP_REGISTRATION
+#define SINGLE_QUBIT_GATE(GATE_NAME, ...)                                      \
+  {STRINGIZE(GATE_NAME##_gate),                                                \
+             {SINGLE_QUBIT_GATE, GATE_NAME##_gate<word_size>}},
+
+#define TWO_QUBIT_GATE(GATE_NAME, ...)                                         \
+  {STRINGIZE(GATE_NAME##_gate), {TWO_QUBIT_GATE, GATE_NAME##_gate<word_size>}},
+
+#define COLLAPSING_GATE(GATE_NAME, ...)                                        \
+  {STRINGIZE(GATE_NAME##_gate), {COLLAPSING_GATE, GATE_NAME##_gate<word_size>}},
+
+#define ERROR_QUBIT_GATE(GATE_NAME, ...)                                       \
+  {STRINGIZE(GATE_NAME##_gate),                                                \
+             {ERROR_QUBIT_GATE, GATE_NAME##_gate<word_size>}},
+#endif
diff --git a/src/qfvm_clifford/packed_bit_word.h b/src/qfvm_clifford/packed_bit_word.h
new file mode 100644
index 0000000..5cb3fab
--- /dev/null
+++ b/src/qfvm_clifford/packed_bit_word.h
@@ -0,0 +1,350 @@
+#ifndef PACKED_BIT_WORD_H_
+#define PACKED_BIT_WORD_H_
+
+#include "bit.h"
+#include "bit_word.h"
+#include "packed_bit_word_slice.h"
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <random>
+#include <sstream>
+
+// Pack bit_words, allocated with alignment and padding enabling SIMD
+// instructions. Due to padding, the smallest tableaus are 256 bits. For
+// performance tableau_element does not store the "intended" size, only the
+// padded size.
+template <size_t word_size> struct packed_bit_word {
+  // number of bit_word<word_size> elements in the owned buffer
+  size_t num_bit_words;
+  // three views of the same owned buffer
+  union {
+    uint8_t* u8;
+    uint64_t* u64;
+    bit_word<word_size>* bw;
+  };
+
+  // allocate a zero-filled buffer able to hold num_bits bits
+  explicit packed_bit_word(size_t num_bits)
+      : num_bit_words(bits_to_word_padded<word_size>(num_bits)),
+        u64(malloc_aligned_padded(num_bits)) {}
+
+  // release the buffer; nothing to do for a moved-from (null) object
+  ~packed_bit_word() {
+    if (nullptr != u64) {
+      bit_word<word_size>::aligned_free(u64);
+      u64 = nullptr;
+      num_bit_words = 0;
+    }
+  }
+
+  // copy constructor, deep copy
+  packed_bit_word(const packed_bit_word& other)
+      : num_bit_words(other.num_bit_words),
+        u64(malloc_aligned_padded(other.num_bit_words * word_size)) {
+    memcpy(u8, other.u8, other.num_bit_words * word_size / 8);
+  }
+
+  // deep copy of the words referenced by a slice
+  packed_bit_word(const packed_bit_word_slice<word_size>& other)
+      : num_bit_words(other.num_bit_words),
+        u64(malloc_aligned_padded(other.num_bit_words * word_size)) {
+    memcpy(u8, other.u8, other.num_bit_words * word_size / 8);
+  }
+
+  // move constructor, is not allowed to throw generally
+  // takes over the buffer and leaves `other` empty (null pointer, 0 words)
+  packed_bit_word(packed_bit_word&& other) noexcept
+      : num_bit_words(other.num_bit_words), u64(other.u64) {
+    other.u64 = nullptr;
+    other.num_bit_words = 0;
+  }
+
+  // copy assignment, deep copy
+  packed_bit_word& operator=(const packed_bit_word& other) {
+    // self-assignment would memcpy a range onto itself
+    if (this == &other)
+      return *this;
+    return *this = packed_bit_word_slice<word_size>(other);
+  }
+
+  // deep copy from a slice; the buffer is reused when the sizes match
+  packed_bit_word<word_size>&
+  operator=(const packed_bit_word_slice<word_size>& other) {
+    if (num_bit_words == other.num_bit_words) {
+      // avoid re-allocating memory
+      packed_bit_word_slice<word_size>(*this) = other;
+      return *this;
+    }
+
+    // different size: rebuild the object in place from the slice
+    this->~packed_bit_word();
+    new (this) packed_bit_word(other);
+    return *this;
+  }
+
+  // move assignment
+  // The parameter is a non-const rvalue reference so that the move
+  // constructor is selected below: a const rvalue reference would bind to
+  // the copy constructor, which deep-copies and allocates inside a noexcept
+  // function. Self-assignment is a no-op.
+  packed_bit_word& operator=(packed_bit_word&& other) noexcept {
+    if (this != &other) {
+      this->~packed_bit_word();
+      new (this) packed_bit_word(std::move(other));
+    }
+    return *this;
+  }
+
+  // equality: same number of words and identical bytes
+  bool operator==(const packed_bit_word<word_size>& other) const {
+    return num_bit_words == other.num_bit_words &&
+           memcmp(bw, other.bw, word_size * num_bit_words / 8) == 0;
+  }
+
+  bool operator==(const packed_bit_word_slice<word_size>& other) const {
+    return num_bit_words == other.num_bit_words &&
+           memcmp(bw, other.bw, word_size * num_bit_words / 8) == 0;
+  }
+
+  // inequality
+  bool operator!=(const packed_bit_word<word_size>& other) const {
+    return !(*this == other);
+  }
+
+  bool operator!=(const packed_bit_word_slice<word_size>& other) const {
+    return !(*this == other);
+  }
+
+  // convert packed_bit_word to packed_bit_word_slice (non-owning view of the
+  // whole buffer)
+  operator packed_bit_word_slice<word_size>() {
+    return packed_bit_word_slice<word_size>(bw, num_bit_words);
+  }
+
+  operator const packed_bit_word_slice<word_size>() const {
+    return packed_bit_word_slice<word_size>(bw, num_bit_words);
+  }
+
+  // index operation, `index` is a bit index
+  bit operator[](size_t index) { return bit(u64, index); }
+  const bit operator[](size_t index) const { return bit(u64, index); }
+
+  // compound assignment; all of these modify *this through a slice view and
+  // return a reference to *this
+  packed_bit_word<word_size>&
+  operator^=(const packed_bit_word<word_size>& other) {
+    packed_bit_word_slice<word_size>(*this) ^=
+        packed_bit_word_slice<word_size>(other);
+    return *this;
+  }
+
+  packed_bit_word<word_size>&
+  operator&=(const packed_bit_word<word_size>& other) {
+    packed_bit_word_slice<word_size>(*this) &=
+        packed_bit_word_slice<word_size>(other);
+    return *this;
+  }
+
+  packed_bit_word<word_size>&
+  operator|=(const packed_bit_word<word_size>& other) {
+    packed_bit_word_slice<word_size>(*this) |=
+        packed_bit_word_slice<word_size>(other);
+    return *this;
+  }
+
+  // Multi-word addition, u64[0] being the least significant word. The carry
+  // is tracked separately so that it propagates through words that overflow
+  // only because of the incoming carry; a carry out of the last word is
+  // discarded. Works on an empty buffer and when `other` is *this.
+  packed_bit_word<word_size>&
+  operator+=(const packed_bit_word<word_size>& other) {
+    size_t num_u64 = (num_bit_words * word_size) >> 6;
+    uint64_t carry = 0;
+    for (size_t i = 0; i < num_u64; ++i) {
+      const uint64_t addend = other.u64[i];
+      uint64_t sum = u64[i] + addend;
+      // unsigned wrap-around: the sum is smaller than an operand
+      uint64_t carry_out = sum < addend;
+      sum += carry;
+      carry_out |= sum < carry;
+      u64[i] = sum;
+      carry = carry_out;
+    }
+    return *this;
+  }
+
+  // compare operation, delegated to the slice comparison
+  bool operator<(const packed_bit_word<word_size>& other) const {
+    return packed_bit_word_slice<word_size>(*this) <
+           packed_bit_word_slice<word_size>(other);
+  }
+
+  // shift operator
+  packed_bit_word<word_size>& operator>>=(int offset) {
+    packed_bit_word_slice<word_size>(*this) >>= offset;
+    return *this;
+  }
+
+  packed_bit_word<word_size>& operator<<=(int offset) {
+    packed_bit_word_slice<word_size>(*this) <<= offset;
+    return *this;
+  }
+
+  // convert packed_bit_word to string
+  operator std::string() const {
+    std::stringstream ss;
+    ss << *this;
+    return ss.str();
+  }
+
+  // swap the contents of two packed_bit_word
+  // `other` is taken by reference so that it really receives the previous
+  // contents of *this.
+  void swap(packed_bit_word<word_size>& other) {
+    packed_bit_word_slice<word_size>(*this).swap(
+        packed_bit_word_slice<word_size>(other));
+  }
+
+  // Overload for arguments that cannot be modified (const objects,
+  // temporaries, values converted from a slice): *this receives the contents
+  // of `other`, which is left unchanged because the exchange is done with a
+  // private copy.
+  void swap(const packed_bit_word<word_size>& other) {
+    packed_bit_word<word_size> copy(other);
+    swap(copy);
+  }
+
+  // slice operator, offset and num_bit_words should be the number of bit words
+  inline packed_bit_word_slice<word_size> slice(size_t offset,
+                                                size_t num_bit_words) {
+    return packed_bit_word_slice<word_size>(bw + offset, num_bit_words);
+  }
+
+  // slice operator, offset and num_bit_words should be the number of bit words
+  inline const packed_bit_word_slice<word_size>
+  slice(size_t offset, size_t num_bit_words) const {
+    return packed_bit_word_slice<word_size>(bw + offset, num_bit_words);
+  }
+
+  // determine the list of bit_word is not all zero
+  bool is_not_all_zero() const {
+    return packed_bit_word_slice<word_size>(*this).is_not_all_zero();
+  }
+
+  // generate random packed_bit_word
+  static packed_bit_word<word_size> random(size_t num_bits,
+                                           std::mt19937_64& rng) {
+    packed_bit_word<word_size> result(num_bits);
+    result.randomize(num_bits, rng);
+    return result;
+  }
+
+  // Overwrite the lowest num_bits bits with random data; bits above num_bits
+  // keep their value. num_bits is not checked against the buffer size.
+  void randomize(size_t num_bits, std::mt19937_64& rng) {
+    auto num_u64 = num_bits >> 6;
+    for (size_t k = 0; k < num_u64; ++k)
+      u64[k] = rng();
+
+    // partially covered last word: only replace the bits below the mask
+    auto remaining_bits = num_bits & 63;
+    if (remaining_bits) {
+      auto mask = (uint64_t(1) << remaining_bits) - 1;
+      u64[num_u64] &= ~mask;
+      u64[num_u64] |= rng() & mask;
+    }
+  }
+
+  // Copy the lowest num_bits bits of `other` into *this; bits above num_bits
+  // keep their value. num_bits is not checked against either buffer size.
+  void truncated_overwrite_from(packed_bit_word_slice<word_size>& other,
+                                size_t num_bits) {
+    size_t num_u8 = num_bits >> 3;
+    memcpy(u8, other.u8, num_u8);
+    // partially covered last byte: only replace the bits below the mask
+    auto remaining_bits = num_bits & 7;
+    if (remaining_bits) {
+      auto mask = (uint8_t(1) << remaining_bits) - 1;
+      u8[num_u8] &= ~mask;
+      u8[num_u8] |= other.u8[num_u8] & mask;
+    }
+  }
+
+  // count the number of 1s
+  size_t count() { return packed_bit_word_slice<word_size>(*this).count(); }
+
+  // malloc aligned padded: returns a zero-filled, aligned buffer
+  // NOTE(review): the value of bits_to_bits_padded is used as a byte count;
+  // if it returns a number of bits (as its name suggests) the buffer is 8x
+  // larger than needed -- confirm against its definition.
+  uint64_t* malloc_aligned_padded(size_t bits) {
+    size_t num_u8 = bits_to_bits_padded<word_size>(bits);
+    void* result = bit_word<word_size>::aligned_malloc(num_u8);
+    memset(result, 0, num_u8);
+    return reinterpret_cast<uint64_t*>(result);
+  }
+};
+
+// bit operations
+
+// Word-wise XOR of two slices with the same number of words (asserted),
+// returned as a newly allocated packed_bit_word.
+template <size_t word_size>
+packed_bit_word<word_size>
+operator^(const packed_bit_word_slice<word_size>& left,
+          const packed_bit_word_slice<word_size>& right) {
+  assert(left.num_bit_words == right.num_bit_words);
+  // the constructor takes a size in bits, so convert the word count; passing
+  // the word count itself would allocate a result that is too small
+  packed_bit_word<word_size> result(left.num_bit_words * word_size);
+  packed_bit_word_slice<word_size>(result).for_each_word(
+      left, right, [](auto& a, auto& b, auto& c) { a = b ^ c; });
+  return result;
+}
+
+// The overloads below view their packed_bit_word operands as slices and
+// forward to the slice/slice overload.
+template <size_t word_size>
+packed_bit_word<word_size> operator^(const packed_bit_word<word_size>& left,
+                                     const packed_bit_word<word_size>& right) {
+  return packed_bit_word_slice<word_size>(left) ^
+         packed_bit_word_slice<word_size>(right);
+}
+
+template <size_t word_size>
+packed_bit_word<word_size>
+operator^(const packed_bit_word<word_size>& left,
+          const packed_bit_word_slice<word_size>& right) {
+  return packed_bit_word_slice<word_size>(left) ^ right;
+}
+
+template <size_t word_size>
+packed_bit_word<word_size>
+operator^(const packed_bit_word_slice<word_size>& left,
+          const packed_bit_word<word_size>& right) {
+  return left ^ packed_bit_word_slice<word_size>(right);
+}
+
+// Word-wise AND of two slices with the same number of words (asserted),
+// returned as a newly allocated packed_bit_word.
+template <size_t word_size>
+packed_bit_word<word_size>
+operator&(const packed_bit_word_slice<word_size>& left,
+          const packed_bit_word_slice<word_size>& right) {
+  assert(left.num_bit_words == right.num_bit_words);
+  // the constructor takes a size in bits, so convert the word count; passing
+  // the word count itself would allocate a result that is too small
+  packed_bit_word<word_size> result(left.num_bit_words * word_size);
+  packed_bit_word_slice<word_size>(result).for_each_word(
+      left, right, [](auto& a, auto& b, auto& c) { a = b & c; });
+  return result;
+}
+
+// The overloads below view their packed_bit_word operands as slices and
+// forward to the slice/slice overload.
+template <size_t word_size>
+packed_bit_word<word_size> operator&(const packed_bit_word<word_size>& left,
+                                     const packed_bit_word<word_size>& right) {
+  return packed_bit_word_slice<word_size>(left) &
+         packed_bit_word_slice<word_size>(right);
+}
+
+template <size_t word_size>
+packed_bit_word<word_size>
+operator&(const packed_bit_word<word_size>& left,
+          const packed_bit_word_slice<word_size>& right) {
+  return packed_bit_word_slice<word_size>(left) & right;
+}
+
+template <size_t word_size>
+packed_bit_word<word_size>
+operator&(const packed_bit_word_slice<word_size>& left,
+          const packed_bit_word<word_size>& right) {
+  return left & packed_bit_word_slice<word_size>(right);
+}
+
+// Word-wise OR of two slices with the same number of words (asserted),
+// returned as a newly allocated packed_bit_word.
+template <size_t word_size>
+packed_bit_word<word_size>
+operator|(const packed_bit_word_slice<word_size>& left,
+          const packed_bit_word_slice<word_size>& right) {
+  assert(left.num_bit_words == right.num_bit_words);
+  // the constructor takes a size in bits, so convert the word count; passing
+  // the word count itself would allocate a result that is too small
+  packed_bit_word<word_size> result(left.num_bit_words * word_size);
+  packed_bit_word_slice<word_size>(result).for_each_word(
+      left, right, [](auto& a, auto& b, auto& c) { a = b | c; });
+  return result;
+}
+
+// The overloads below view their packed_bit_word operands as slices and
+// forward to the slice/slice overload.
+template <size_t word_size>
+packed_bit_word<word_size> operator|(const packed_bit_word<word_size>& left,
+                                     const packed_bit_word<word_size>& right) {
+  return packed_bit_word_slice<word_size>(left) |
+         packed_bit_word_slice<word_size>(right);
+}
+
+template <size_t word_size>
+packed_bit_word<word_size>
+operator|(const packed_bit_word<word_size>& left,
+          const packed_bit_word_slice<word_size>& right) {
+  return packed_bit_word_slice<word_size>(left) | right;
+}
+
+template <size_t word_size>
+packed_bit_word<word_size>
+operator|(const packed_bit_word_slice<word_size>& left,
+          const packed_bit_word<word_size>& right) {
+  return left | packed_bit_word_slice<word_size>(right);
+}
+
+// Print every bit of the padded buffer, lowest index first: '_' for a clear
+// bit and '1' for a set bit. The word is taken by const reference; taking it
+// by value deep-copied the whole buffer on every print.
+template <size_t word_size>
+std::ostream& operator<<(std::ostream& os,
+                         const packed_bit_word<word_size>& word) {
+  for (size_t i = 0; i < word.num_bit_words * word_size; ++i)
+    os << "_1"[word[i]];
+
+  return os;
+}
+
+#endif
diff --git a/src/qfvm_clifford/packed_bit_word_slice.h b/src/qfvm_clifford/packed_bit_word_slice.h
new file mode 100644
index 0000000..6041225
--- /dev/null
+++ b/src/qfvm_clifford/packed_bit_word_slice.h
@@ -0,0 +1,307 @@
+#ifndef PACKED_BIT_WORD_SLICE_H_
+#define PACKED_BIT_WORD_SLICE_H_
+
+#include "bit.h"
+#include "bit_word.h"
+#include "utils.h"
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <ostream>
+#include <random>
+#include <sstream>
+
+// Non-owning reference to a run of `num_bit_words` bit_word<word_size> values.
+// Copy-constructing a slice aliases the same storage, whereas assigning to a
+// slice copies the referenced bits into the storage it already points at.
+template <size_t word_size> struct packed_bit_word_slice {
+  // number of bit_words covered by this slice (fixed once constructed)
+  const size_t num_bit_words;
+
+  // three typed views (bytes, 64-bit words, bit_words) of the same storage
+  union {
+    uint8_t* u8;
+    uint64_t* u64;
+    bit_word<word_size>* bw;
+  };
+
+  // bw: first bit_word of the slice, num_bit_words: how many words it spans
+  packed_bit_word_slice(bit_word<word_size>* bw, size_t num_bit_words)
+      : num_bit_words(num_bit_words), bw(bw) {}
+
+  // assignment operator: copies this->num_bit_words words of bits from other
+  // into the referenced storage (the slice itself is never re-seated)
+  packed_bit_word_slice<word_size>&
+  operator=(const packed_bit_word_slice<word_size>& other) {
+    memcpy(u8, other.u8, num_bit_words * word_size / 8);
+    return *this;
+  }
+
+  // word-wise in-place exclusive-or with other
+  packed_bit_word_slice<word_size>&
+  operator^=(const packed_bit_word_slice<word_size>& other) {
+    for_each_word(other, [](auto& a, auto& b) { a ^= b; });
+    return *this;
+  }
+
+  // word-wise in-place and with other
+  packed_bit_word_slice<word_size>&
+  operator&=(const packed_bit_word_slice<word_size>& other) {
+    for_each_word(other, [](auto& a, auto& b) { a &= b; });
+    return *this;
+  }
+
+  // word-wise in-place or with other
+  packed_bit_word_slice<word_size>&
+  operator|=(const packed_bit_word_slice<word_size>& other) {
+    for_each_word(other, [](auto& a, auto& b) { a |= b; });
+    return *this;
+  }
+
+  // multi-word addition: both slices are read as unsigned integers whose least
+  // significant 64 bits are u64[0]; the carry out of the top word is dropped.
+  // The carry is tracked explicitly so that it is not lost when adding the
+  // incoming carry itself wraps a word around to zero.
+  packed_bit_word_slice<word_size>&
+  operator+=(const packed_bit_word_slice<word_size>& other) {
+    const size_t num_u64 = (num_bit_words * word_size) >> 6;
+    uint64_t carry = 0;
+    for (size_t i = 0; i < num_u64; ++i) {
+      // read the addend first so that `x += x` (aliased storage) stays correct
+      const uint64_t addend = other.u64[i];
+      uint64_t sum = u64[i] + addend;
+      // unsigned wrap-around in either addition signals a carry; at most one
+      // of the two additions can wrap, so the carry is always 0 or 1
+      uint64_t carry_out = sum < addend;
+      sum += carry;
+      carry_out |= sum < carry;
+      u64[i] = sum;
+      carry = carry_out;
+    }
+    return *this;
+  }
+
+  // shift towards lower bit indices by `offset` bits (bit i + offset moves to
+  // bit i), filling the vacated high bits with zeros; offset must be >= 0
+  packed_bit_word_slice<word_size>& operator>>=(int offset) {
+    uint64_t incoming_word;
+    uint64_t current_word;
+
+    if (0 == offset)
+      return *this;
+
+    // move whole u64 words down while at least 64 bits remain to be shifted
+    while (64 <= offset) {
+      incoming_word = 0;
+      for (int w = ((num_bit_words * word_size) >> 6) - 1; w >= 0; w--) {
+        current_word = u64[w];
+        u64[w] = incoming_word;
+        incoming_word = current_word;
+      }
+      offset -= 64;
+    }
+
+    if (0 == offset)
+      return *this;
+
+    // remaining shift of 1..63 bits, walking from the highest word downwards
+    incoming_word = 0;
+    for (int w = ((num_bit_words * word_size) >> 6) - 1; w >= 0; w--) {
+      current_word = u64[w];
+      // move right
+      u64[w] >>= offset;
+      // add high bits
+      u64[w] |= incoming_word << (64 - offset);
+      // update next incoming word
+      incoming_word = current_word & ((uint64_t(1) << offset) - 1);
+    }
+
+    return *this;
+  }
+
+  // shift towards higher bit indices by `offset` bits (bit i moves to bit
+  // i + offset), filling the vacated low bits with zeros; offset must be >= 0
+  packed_bit_word_slice<word_size>& operator<<=(int offset) {
+    uint64_t incoming_word;
+    uint64_t current_word;
+
+    if (0 == offset)
+      return *this;
+
+    // move whole u64 words up while at least 64 bits remain to be shifted
+    while (64 <= offset) {
+      incoming_word = 0;
+      for (uint64_t w = 0; w < (num_bit_words * word_size) >> 6; w++) {
+        current_word = u64[w];
+        u64[w] = incoming_word;
+        incoming_word = current_word;
+      }
+      offset -= 64;
+    }
+
+    if (0 == offset)
+      return *this;
+
+    // remaining shift of 1..63 bits, walking from the lowest word upwards
+    incoming_word = 0;
+    for (uint64_t w = 0; w < (num_bit_words * word_size) >> 6; w++) {
+      current_word = u64[w];
+      // move left
+      u64[w] <<= offset;
+      // add low bits
+      u64[w] |= incoming_word;
+      // update next incoming word
+      incoming_word = current_word >> (64 - offset);
+    }
+
+    return *this;
+  }
+
+  // equality operator: same number of words and identical bits
+  bool operator==(const packed_bit_word_slice<word_size>& other) const {
+    return num_bit_words == other.num_bit_words &&
+           0 == memcmp(bw, other.bw, num_bit_words * word_size / 8);
+  }
+
+  bool operator!=(const packed_bit_word_slice<word_size>& other) const {
+    return !(*this == other);
+  }
+
+  // index operator: reference to the bit at position `index`
+  inline bit operator[](size_t index) { return bit(u8, index); }
+
+  inline const bit operator[](size_t index) const { return bit(u8, index); }
+
+  // convert packed bit word slice to string (same text as operator<<)
+  operator std::string() const {
+    std::stringstream ss;
+    ss << *this;
+    return ss.str();
+  }
+
+  // exchange the referenced bits with those referenced by other, word by word
+  void swap(packed_bit_word_slice<word_size> other) {
+    for_each_word(other, [](auto& a, auto& b) { std::swap(a, b); });
+  }
+
+  // slice operator, offset and num_bit_words should be the number of bit words
+  inline packed_bit_word_slice<word_size> slice(size_t offset,
+                                                size_t num_bit_words) {
+    return packed_bit_word_slice<word_size>(bw + offset, num_bit_words);
+  }
+
+  inline const packed_bit_word_slice<word_size>
+  slice(size_t offset, size_t num_bit_words) const {
+    return packed_bit_word_slice<word_size>(bw + offset, num_bit_words);
+  }
+
+  // true if at least one bit in the slice is set
+  bool is_not_all_zero() const {
+    bit_word<word_size> res{};
+    for_each_word([&res](auto& a) { res |= a; });
+    return bool(res);
+  }
+
+  // overwrite the lowest num_bits bits with random data drawn from rng; when
+  // num_bits is not a multiple of 64 the higher bits of the last touched u64
+  // are left unchanged
+  void randomize(size_t num_bits, std::mt19937_64& rng) {
+    auto num_u64 = num_bits >> 6;
+    for (size_t k = 0; k < num_u64; ++k)
+      u64[k] = rng();
+
+    auto remaining_bits = num_bits & 63;
+    if (remaining_bits) {
+      auto mask = (uint64_t(1) << remaining_bits) - 1;
+      u64[num_u64] &= ~mask;
+      u64[num_u64] |= rng() & mask;
+    }
+  }
+
+  // copy the lowest num_bits bits from other into this slice; bits at index
+  // num_bits and above are left unchanged
+  void truncated_overwrite_from(packed_bit_word_slice<word_size> other,
+                                size_t num_bits) {
+    size_t num_u8 = num_bits >> 3;
+    memcpy(u8, other.u8, num_u8);
+    auto remaining_bits = num_bits & 7;
+    if (remaining_bits) {
+      auto mask = (uint8_t(1) << remaining_bits) - 1;
+      u8[num_u8] &= ~mask;
+      u8[num_u8] |= other.u8[num_u8] & mask;
+    }
+  }
+
+  // count the number of 1s over all u64 words of the slice; the template
+  // parameter W is not used by the body and must be given explicitly
+  template <size_t W> size_t count() const {
+    auto end = u64 + (num_bit_words * word_size >> 6);
+    size_t result = 0;
+    for (const uint64_t* p = u64; p != end; p++) {
+      result += count_uint64_bits(*p);
+    }
+    return result;
+  }
+
+  // The for_each_word overloads call f once per bit_word of this slice, in
+  // increasing order, passing the word at the same position of every given
+  // slice. Only this->num_bit_words bounds the loop, so the other slices must
+  // cover at least as many words. The words are passed as mutable references
+  // even though the functions are const: the slice is a reference type.
+  template <typename func> inline void for_each_word(func f) const {
+    auto* bw_start = bw;
+    auto* bw_end = bw + num_bit_words;
+    while (bw_start != bw_end) {
+      f(*bw_start);
+      ++bw_start;
+    }
+  }
+
+  template <typename func>
+  inline void for_each_word(packed_bit_word_slice<word_size> other,
+                            func f) const {
+    auto* bw_start = bw;
+    auto* bw_end = bw + num_bit_words;
+    auto* other_bw_start = other.bw;
+    while (bw_start != bw_end) {
+      f(*bw_start, *other_bw_start);
+      ++bw_start;
+      ++other_bw_start;
+    }
+  }
+
+  template <typename func>
+  inline void for_each_word(packed_bit_word_slice<word_size> other1,
+                            packed_bit_word_slice<word_size> other2,
+                            func f) const {
+    auto* bw_start = bw;
+    auto* bw_end = bw + num_bit_words;
+    auto* other1_bw_start = other1.bw;
+    auto* other2_bw_start = other2.bw;
+    while (bw_start != bw_end) {
+      f(*bw_start, *other1_bw_start, *other2_bw_start);
+      ++bw_start;
+      ++other1_bw_start;
+      ++other2_bw_start;
+    }
+  }
+
+  template <typename func>
+  inline void for_each_word(packed_bit_word_slice<word_size> other1,
+                            packed_bit_word_slice<word_size> other2,
+                            packed_bit_word_slice<word_size> other3,
+                            func f) const {
+    auto* bw_start = bw;
+    auto* bw_end = bw + num_bit_words;
+    auto* other1_bw_start = other1.bw;
+    auto* other2_bw_start = other2.bw;
+    auto* other3_bw_start = other3.bw;
+    while (bw_start != bw_end) {
+      f(*bw_start, *other1_bw_start, *other2_bw_start, *other3_bw_start);
+      ++bw_start;
+      ++other1_bw_start;
+      ++other2_bw_start;
+      ++other3_bw_start;
+    }
+  }
+
+  template <typename func>
+  inline void for_each_word(packed_bit_word_slice<word_size> other1,
+                            packed_bit_word_slice<word_size> other2,
+                            packed_bit_word_slice<word_size> other3,
+                            packed_bit_word_slice<word_size> other4,
+                            func f) const {
+    auto* bw_start = bw;
+    auto* bw_end = bw + num_bit_words;
+    auto* other1_bw_start = other1.bw;
+    auto* other2_bw_start = other2.bw;
+    auto* other3_bw_start = other3.bw;
+    auto* other4_bw_start = other4.bw;
+    while (bw_start != bw_end) {
+      f(*bw_start, *other1_bw_start, *other2_bw_start, *other3_bw_start,
+        *other4_bw_start);
+      ++bw_start;
+      ++other1_bw_start;
+      ++other2_bw_start;
+      ++other3_bw_start;
+      ++other4_bw_start;
+    }
+  }
+};
+
+// Stream output for a packed bit word slice: one character per bit, starting
+// at bit 0, with '_' standing for 0 and '1' standing for 1.
+template <size_t word_size>
+std::ostream& operator<<(std::ostream& os,
+                         const packed_bit_word_slice<word_size>& word) {
+  const size_t total_bits = word.num_bit_words * word_size;
+  size_t pos = 0;
+  while (pos < total_bits) {
+    os << "_1"[word[pos]];
+    ++pos;
+  }
+
+  return os;
+}
+
+#endif
diff --git a/src/qfvm_clifford/pauli.h b/src/qfvm_clifford/pauli.h
new file mode 100644
index 0000000..4e7604a
--- /dev/null
+++ b/src/qfvm_clifford/pauli.h
@@ -0,0 +1,176 @@
+#ifndef PAULI_H_
+#define PAULI_H_
+
+#include "bit.h"
+#include "packed_bit_word.h"
+#include "pauli_slice.h"
+#include "table.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <ostream>
+#include <random>
+#include <string>
+
+// a pauli string is a signed product of pauli operators (I, X, Y, Z) on n
+// qubits; this type stores its x and z bits in packed_bit_word members
+template <size_t word_size> struct pauli_string {
+  // the length of the pauli string
+  size_t num_qubits;
+
+  // whether the pauli string is negative: true if negative, false if positive
+  bool sign;
+
+  // the paulis in the pauli string, paulis are encoded by xz (I=00, X=10, Y=11,
+  // Z=01)
+  packed_bit_word<word_size> xs, zs;
+
+  // positive pauli string on n qubits
+  // NOTE(review): presumably packed_bit_word(n) starts zeroed, which makes
+  // this the all-identity string — confirm against packed_bit_word
+  explicit pauli_string(size_t n) : num_qubits(n), sign(false), xs(n), zs(n) {}
+
+  // parse a string such as "+XYZ" or "-IZ" (see from_cstr); taking a const
+  // reference also accepts const strings and temporaries
+  explicit pauli_string(const std::string& str)
+      : num_qubits(0), sign(false), xs(0), zs(0) {
+    *this = from_cstr(str.c_str());
+  }
+
+  // copy constructor
+  pauli_string(const pauli_string<word_size>& other)
+      : num_qubits(other.num_qubits), sign(other.sign), xs(other.xs),
+        zs(other.zs) {}
+
+  // copy the contents referenced by a slice into a new owning pauli string
+  pauli_string(const pauli_string_slice<word_size>& other)
+      : num_qubits(other.num_qubits), sign(other.sign), xs(other.xs),
+        zs(other.zs) {}
+
+  // move constructor
+  pauli_string(pauli_string<word_size>&& other) noexcept
+      : num_qubits(other.num_qubits), sign(other.sign), xs(std::move(other.xs)),
+        zs(std::move(other.zs)) {}
+
+  // copy assignment
+  pauli_string<word_size>& operator=(const pauli_string<word_size>& other) {
+    this->num_qubits = other.num_qubits, this->sign = other.sign;
+    this->xs = other.xs, this->zs = other.zs;
+
+    return *this;
+  }
+
+  // copy the contents referenced by a slice into this pauli string
+  pauli_string<word_size>&
+  operator=(const pauli_string_slice<word_size>& other) {
+    this->num_qubits = other.num_qubits, this->sign = other.sign;
+    this->xs = other.xs, this->zs = other.zs;
+
+    return *this;
+  }
+
+  // move assignment: destroys the current value and move-constructs in place.
+  // The self-assignment guard is required, otherwise `x = std::move(x)` would
+  // destroy x and then move-construct it from its own destroyed state.
+  pauli_string<word_size>& operator=(pauli_string<word_size>&& other) {
+    if (this != &other) {
+      this->~pauli_string();
+      new (this) pauli_string<word_size>(std::move(other));
+    }
+
+    return *this;
+  }
+
+  // equality operator
+  bool operator==(const pauli_string<word_size>& other) const {
+    return num_qubits == other.num_qubits && bool(sign) == bool(other.sign) &&
+           xs == other.xs && zs == other.zs;
+  }
+
+  bool operator==(const pauli_string_slice<word_size>& other) const {
+    return num_qubits == other.num_qubits && bool(sign) == bool(other.sign) &&
+           xs == other.xs && zs == other.zs;
+  }
+
+  bool operator!=(const pauli_string<word_size>& other) const {
+    return !(*this == other);
+  }
+
+  bool operator!=(const pauli_string_slice<word_size>& other) const {
+    return !(*this == other);
+  }
+
+  // convert to pauli string slice; the slice refers to this object's sign, xs
+  // and zs, so it must not outlive this pauli string
+  operator const pauli_string_slice<word_size>() const {
+    return pauli_string_slice<word_size>(num_qubits, bit((void*)&sign, 0), xs,
+                                         zs);
+  }
+
+  operator pauli_string_slice<word_size>() {
+    return pauli_string_slice<word_size>(num_qubits, bit(&sign, 0), xs, zs);
+  }
+
+  // convert pauli string to string
+  std::string str() const { return std::string(*this); }
+
+  // same text as produced by operator<<: a sign character followed by one
+  // letter per qubit
+  operator std::string() const {
+    std::stringstream ss;
+    ss << *this;
+    return ss.str();
+  }
+
+  // generate a random pauli string: random x bits, z bits and sign
+  static pauli_string<word_size> random(size_t num_qubits,
+                                        std::mt19937_64& rng) {
+    auto result = pauli_string<word_size>(num_qubits);
+    result.xs.randomize(num_qubits, rng);
+    result.zs.randomize(num_qubits, rng);
+    result.sign ^= rng() & 1;
+    return result;
+  }
+
+  // parse char one by one: func(i) yields the pauli letter of qubit i, which
+  // must be one of 'I', 'X', 'Y', 'Z'
+  // throws std::invalid_argument naming the offending character otherwise
+  static pauli_string<word_size>
+  parse_cstr(size_t num_qubits, bool sign,
+             const std::function<char(size_t)>& func) {
+    pauli_string<word_size> ret(num_qubits);
+    ret.sign = sign;
+
+    for (size_t i = 0; i < num_qubits; i++) {
+      // query the callback only once per qubit
+      const char c = func(i);
+      bool x, z;
+      switch (c) {
+      case 'X':
+        x = true, z = false;
+        break;
+      case 'Y':
+        x = true, z = true;
+        break;
+      case 'Z':
+        x = false, z = true;
+        break;
+      case 'I':
+        x = false, z = false;
+        break;
+      default:
+        // append the character itself; std::to_string(char) would print its
+        // numeric code instead
+        throw std::invalid_argument(
+            std::string("can not parse character: ") + c);
+      }
+
+      ret.xs.u64[i / 64] ^= (uint64_t)x << (i & 63);
+      ret.zs.u64[i / 64] ^= (uint64_t)z << (i & 63);
+    }
+
+    return ret;
+  }
+
+  // convert c style string to pauli string; an optional leading '+' or '-'
+  // gives the sign, the remaining characters are the pauli letters
+  static pauli_string<word_size> from_cstr(const char* cstr) {
+    // default is positive
+    auto sign = cstr[0] == '-';
+    if ('-' == cstr[0] || '+' == cstr[0])
+      cstr++;
+
+    return parse_cstr(strlen(cstr), sign, [&](size_t i) { return cstr[i]; });
+  }
+};
+
+// Writes the sign ('+' or '-') followed by one letter per qubit. The letter is
+// looked up in "IXZY" with index x + 2 * z (I=00, X: x only, Z: z only, Y: both).
+template <size_t word_size>
+std::ostream& operator<<(std::ostream& os, const pauli_string<word_size>& str) {
+  if (str.sign) {
+    os << '-';
+  } else {
+    os << '+';
+  }
+
+  size_t qubit = 0;
+  while (qubit < str.num_qubits) {
+    os << "IXZY"[str.xs[qubit] | (str.zs[qubit] << 1)];
+    qubit++;
+  }
+  return os;
+}
+
+#endif
diff --git a/src/qfvm_clifford/pauli_slice.h b/src/qfvm_clifford/pauli_slice.h
new file mode 100644
index 0000000..eef0cc0
--- /dev/null
+++ b/src/qfvm_clifford/pauli_slice.h
@@ -0,0 +1,145 @@
+#ifndef PAULI_SLICE_H_
+#define PAULI_SLICE_H_
+
+#include "bit.h"
+#include "packed_bit_word_slice.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+#include <sstream>
+
+// reference to a slice of a pauli string: the sign bit and the x/z bit slices
+// refer to storage held elsewhere
+template <size_t word_size> struct pauli_string_slice {
+  size_t num_qubits;
+  bit sign;
+  packed_bit_word_slice<word_size> xs, zs;
+
+  // num_qubits should be consistent with the number of padded bit_words, that
+  // means, (num_qubits + word_size - 1) / word_size == xs.num_bit_words ==
+  // zs.num_bit_words
+  pauli_string_slice(size_t num_qubits, bit sign,
+                     packed_bit_word_slice<word_size> xs,
+                     packed_bit_word_slice<word_size> zs)
+      : num_qubits(num_qubits), sign(sign), xs(xs), zs(zs) {}
+
+  // assign operator: member-wise assignment from other
+  pauli_string_slice<word_size>&
+  operator=(const pauli_string_slice<word_size>& other) {
+    num_qubits = other.num_qubits;
+    sign = other.sign;
+    xs = other.xs;
+    zs = other.zs;
+
+    return *this;
+  }
+
+  // multiply by a commuting pauli string (in place, other on the right)
+  pauli_string_slice<word_size>&
+  operator*=(const pauli_string_slice<word_size>& other) {
+    auto res = inplace_right_mul(other);
+    // must commute: an odd exponent of i means the operands anti-commute
+    assert((res & 1) == 0);
+    // bit 1 of the exponent stands for a factor of -1: fold it into the sign
+    // with an exclusive-or
+    sign ^= res & 2;
+    return *this;
+  }
+
+  // multiply by a pauli string, ignoring anti-commuting terms
+  pauli_string_slice<word_size>&
+  mul_ignore_anti_commute(const pauli_string_slice<word_size>& other) {
+    auto res = inplace_right_mul(other);
+    // bit 1 of the exponent stands for a factor of -1: fold it into the sign
+    // with an exclusive-or
+    sign ^= res & 2;
+    return *this;
+  }
+
+  // equality operator: same length, same sign and identical x/z bits.
+  // The operand is another pauli_string_slice; a packed_bit_word_slice has no
+  // num_qubits/sign/xs/zs members to compare against.
+  bool operator==(const pauli_string_slice<word_size>& other) const {
+    return num_qubits == other.num_qubits &&
+           bool(sign) == bool(other.sign) && xs == other.xs && zs == other.zs;
+  }
+
+  bool operator!=(const pauli_string_slice<word_size>& other) const {
+    return !(*this == other);
+  }
+
+  // convert pauli string to string
+  operator std::string() const {
+    std::stringstream ss;
+    ss << *this;
+    return ss.str();
+  }
+
+  // swap operator, swap signs, x matrix and z matrix
+  void swap(pauli_string_slice<word_size> other) {
+    sign.swap(other.sign);
+    xs.swap(other.xs);
+    zs.swap(other.zs);
+  }
+
+  // Multiplies this pauli string by `other` in place (xs ^= other.xs,
+  // zs ^= other.zs) and returns the phase picked up by the product.
+  // Intuitively, this function returns the exponent to which the imaginary
+  // number i is raised when the Pauli matrices represented by x1z1 and x2z2
+  // are multiplied together. For example, if x1 = z2 = 0 and z1 = x2 = 1 then
+  // x1z1 and x2z2 represent Z and X, respectively.
+  // Multiplying Z and X together gives ZX = iY. Since the exponent on i is 1,
+  // the result of this function is 1.
+  // The sign of `other` is folded into bit 1 of the result; the sign of this
+  // slice is not modified here.
+  // Returns:
+  //  0 if the product is 1
+  //  1 if the product is i
+  //  2 if the product is -1
+  //  3 if the product is -i
+  uint8_t
+  inplace_right_mul(const pauli_string_slice<word_size>& other) noexcept {
+    bit_word<word_size> count1{};
+    bit_word<word_size> count2{};
+
+    xs.for_each_word(
+        zs, other.xs, other.zs,
+        [&count1, &count2](auto& x1, auto& z1, auto& x2, auto& z2) {
+          // positions where the two paulis anti-commute
+          auto x1z2 = x1 & z2;
+          auto anti_commutes = (x2 & z1) ^ x1z2;
+
+          // update left side pauli
+          x1 ^= x2;
+          z1 ^= z2;
+
+          // accumulate anti-commutation (+i or -i) counts; count1/count2 act
+          // as the low/high bit of a per-position counter
+          count2 ^= (count1 ^ x1 ^ z1 ^ x1z2) & anti_commutes;
+          count1 ^= anti_commutes;
+        });
+
+    // combine final anti-commutation phase tally (mod 4)
+    auto s = count1.count();
+    s ^= count2.count() << 1;
+    s ^= other.sign << 1;
+    return s & 3;
+  }
+
+  // determines if the pauli string commutes with the given pauli string: true
+  // when the number of anti-commuting positions is even
+  bool commutes(const pauli_string_slice<word_size>& other) const noexcept {
+    // iterate over the shorter of the two strings
+    if (num_qubits > other.num_qubits)
+      return other.commutes(*this);
+
+    bit_word<word_size> count{};
+    xs.for_each_word(zs, other.xs, other.zs,
+                     [&count](auto& x1, auto& z1, auto& x2, auto& z2) {
+                       count ^= (x1 & z2) ^ (x2 & z1);
+                     });
+    return (count.count() & 1) == 0;
+  }
+};
+
+// Writes the sign character ('+' or '-') followed by one letter per qubit,
+// chosen from "IXZY" with index x + 2 * z.
+template <size_t word_size>
+std::ostream& operator<<(std::ostream& os,
+                         const pauli_string_slice<word_size> str) {
+  os << "+-"[str.sign];
+
+  size_t qubit = 0;
+  while (qubit < str.num_qubits) {
+    os << "IXZY"[str.xs[qubit] | (str.zs[qubit] << 1)];
+    qubit++;
+  }
+
+  return os;
+}
+
+#endif
diff --git a/src/qfvm_clifford/span_ref.h b/src/qfvm_clifford/span_ref.h
new file mode 100644
index 0000000..8214087
--- /dev/null
+++ b/src/qfvm_clifford/span_ref.h
@@ -0,0 +1,331 @@
+#ifndef SPAN_REF_H_
+#define SPAN_REF_H_
+
+#include <algorithm>
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <new>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+// span_ref<T> is a [ptr_start, ptr_end) view over contiguous items that it
+// does not own.
+//
+// Unlike the std::span class introduced in C++20, equality and ordering are
+// defined on *the content being pointed to*, not on the pointer values: two
+// span_refs compare equal when the ranges they point to hold matching items,
+// whether or not the pointers are identical. In that sense the class behaves
+// like a *reference* rather than a pointer.
+
+template <typename T> struct span_ref {
+  T* ptr_start;
+  T* ptr_end;
+
+  // Empty range.
+  span_ref() : ptr_start(nullptr), ptr_end(nullptr) {}
+
+  // Range [begin, end).
+  span_ref(T* begin, T* end) : ptr_start(begin), ptr_end(end) {}
+
+  // Implicit conversions.
+
+  // A pointer to one item becomes a range of length one.
+  span_ref(T* singleton) : ptr_start(singleton), ptr_end(singleton + 1) {}
+
+  // From a span_ref over the non-const item type.
+  span_ref(const span_ref<typename std::remove_const<T>::type>& other)
+      : ptr_start(other.ptr_start), ptr_end(other.ptr_end) {}
+
+  // From vectors: the range covers all of the vector's items.
+  span_ref(std::vector<T>& items)
+      : ptr_start(items.data()), ptr_end(items.data() + items.size()) {}
+
+  span_ref(const std::vector<typename std::remove_const<T>::type>& items)
+      : ptr_start(items.data()), ptr_end(items.data() + items.size()) {}
+
+  // From arrays: the range covers all of the array's items.
+  template <size_t K>
+  span_ref(std::array<T, K>& items)
+      : ptr_start(items.data()), ptr_end(items.data() + items.size()) {}
+
+  template <size_t K>
+  span_ref(const std::array<typename std::remove_const<T>::type, K>& items)
+      : ptr_start(items.data()), ptr_end(items.data() + items.size()) {}
+
+  // Sub-range [start_offset, end_offset), both measured from ptr_start.
+  span_ref sub(size_t start_offset, size_t end_offset) const {
+    T* const sub_begin = ptr_start + start_offset;
+    T* const sub_end = ptr_start + end_offset;
+    return span_ref<T>(sub_begin, sub_end);
+  }
+
+  // Number of items in the range.
+  size_t size() const { return ptr_end - ptr_start; }
+
+  const T* begin() const { return ptr_start; }
+
+  const T* end() const { return ptr_end; }
+
+  const T& back() const { return ptr_end[-1]; }
+
+  const T& front() const { return ptr_start[0]; }
+
+  bool empty() const { return ptr_start == ptr_end; }
+
+  T* begin() { return ptr_start; }
+
+  T* end() { return ptr_end; }
+
+  T& back() { return ptr_end[-1]; }
+
+  T& front() { return ptr_start[0]; }
+
+  const T& operator[](size_t index) const { return *(ptr_start + index); }
+
+  T& operator[](size_t index) { return *(ptr_start + index); }
+
+  // Content equality: same length and pairwise equal items.
+  bool operator==(const span_ref<const T>& other) const {
+    const size_t count = size();
+    if (count != other.size()) {
+      return false;
+    }
+    for (size_t idx = 0; idx != count; ++idx) {
+      if (ptr_start[idx] != other[idx]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool
+  operator==(const span_ref<typename std::remove_const<T>::type>& other) const {
+    const span_ref<const T> lhs(ptr_start, ptr_end);
+    const span_ref<const T> rhs(other.ptr_start, other.ptr_end);
+    return lhs == rhs;
+  }
+
+  bool operator!=(const span_ref<const T>& other) const {
+    const bool same = (*this == other);
+    return !same;
+  }
+
+  bool
+  operator!=(const span_ref<typename std::remove_const<T>::type>& other) const {
+    const bool same = (*this == other);
+    return !same;
+  }
+
+  // Text form, as produced by the stream output operator.
+  std::string str() const {
+    std::stringstream buffer;
+    buffer << *this;
+    return buffer.str();
+  }
+
+  // Lexicographic ordering: the first differing item decides; if one range is
+  // a prefix of the other, the shorter range is the smaller one.
+  bool operator<(const span_ref<const T>& other) const {
+    const size_t mine = size();
+    const size_t theirs = other.size();
+    const size_t shared = mine < theirs ? mine : theirs;
+    for (size_t idx = 0; idx != shared; ++idx) {
+      const T& a = (*this)[idx];
+      const T& b = other[idx];
+      if (a != b) {
+        return a < b;
+      }
+    }
+    return mine < theirs;
+  }
+
+  bool
+  operator<(const span_ref<typename std::remove_const<T>::type>& other) const {
+    const span_ref<const T> lhs(ptr_start, ptr_end);
+    const span_ref<const T> rhs(other.ptr_start, other.ptr_end);
+    return lhs < rhs;
+  }
+};
+
+// Wrapper around an iterable object: streaming the wrapper prints the items
+// of the iterable with a separator string between them.
+template <typename t_iter> struct seperator;
+
+/// Holds a reference to the range to print and the separator text to put
+/// between consecutive items.
+template <typename t_iter> struct seperator {
+  const t_iter& iter;
+  const char* sep;
+
+  /// Text form, as produced by the stream output operator.
+  std::string str() const {
+    std::stringstream buffer;
+    buffer << *this;
+    return buffer.str();
+  }
+};
+
+// Wraps `v` so that streaming it prints its items joined by `sep` (", " by
+// default). The wrapper only stores a reference to `v`.
+template <typename t_iter>
+seperator<t_iter> seperate(const t_iter& v, const char* sep = ", ") {
+  seperator<t_iter> wrapped{v, sep};
+  return wrapped;
+}
+
+// Prints each item of the wrapped range, writing the separator before every
+// item except the first.
+template <typename t_iter>
+std::ostream& operator<<(std::ostream& out, const seperator<t_iter>& v) {
+  bool need_sep = false;
+  for (const auto& item : v.iter) {
+    if (need_sep) {
+      out << v.sep;
+    }
+    need_sep = true;
+    out << item;
+  }
+  return out;
+}
+
+// Prints a span_ref as "span_ref{a, b, c}".
+template <typename T>
+std::ostream& operator<<(std::ostream& out, const span_ref<T>& v) {
+  return out << "span_ref{" << seperate(v) << "}";
+}
+
+// A memory resource that efficiently accumulates data incrementally.
+
+// There are three important types of "region" involved: the tail region, the
+// current region, and old regions.
+
+// The tail is used for adding contiguous data to the buffer in an incremental
+// manner. When the tail exceeds the available storage, more memory is allocated
+// and the tail is copied into it to maintain contiguity. The tail can be
+// discarded or committed at any time. Discarding allows reusing the covered
+// memory when writing the next tail. Committing permanently preserves the data
+// (until clearing or deconstructing the monotonic buffer) and ensures that it
+// will not move so that pointers to it can be stored.
+
+// The current region is a continuous block of memory where the tail is being
+// written. If the tail grows beyond this region and triggers an allocation,
+// then this current region becomes an old region, while newly allocated memory
+// becomes the new current region. Each subsequent current region will have at
+// least double size compared to its predecessor.
+
+// Old regions are finalized memory segments that are retained until the buffer
+// is cleared or destroyed.
+
+// some ref: https://zhuanlan.zhihu.com/p/96089089
+// Append-only storage for items of type T, built from malloc'ed regions that
+// are only released by clear() or by the destructor.
+// NOTE(review): items live in raw malloc'ed memory and are never constructed
+// or destroyed individually, so T is presumably expected to be trivially
+// copyable — confirm against the callers.
+template <typename T> struct monotonic_buffer {
+
+  // Contiguous memory that is being appended to, but has not yet been
+  // committed.
+  span_ref<T> tail;
+
+  // The current contiguous memory region with a mix of committed, staged, and
+  // unused memory.
+  span_ref<T> cur;
+
+  // Old contiguous memory regions that have been committed and now need to be
+  // kept.
+  std::vector<span_ref<T>> old_areas;
+
+  // Constructs an empty monotonic buffer.
+  monotonic_buffer() : tail(), cur(), old_areas() {}
+
+  // Constructs an empty monotonic buffer with initial capacity for its current
+  // region.
+  monotonic_buffer(size_t reserve) { ensure_available(reserve); }
+
+  // Forgets every region without freeing it (used after the regions have been
+  // handed over to another buffer).
+  void _soft_clear() {
+    cur.ptr_start = nullptr;
+    cur.ptr_end = nullptr;
+    tail.ptr_start = nullptr;
+    tail.ptr_end = nullptr;
+    old_areas.clear();
+  }
+
+  // Frees every region. The pointers are left dangling, so the caller must
+  // overwrite them or end the object's lifetime afterwards.
+  void _hard_clear() {
+    for (auto old : old_areas) {
+      free(old.ptr_start);
+    }
+    if (nullptr != cur.ptr_start) {
+      free(cur.ptr_start);
+    }
+  }
+
+  ~monotonic_buffer() { _hard_clear(); }
+
+  // Takes over the regions of `other`, leaving it empty.
+  monotonic_buffer(monotonic_buffer&& other) noexcept
+      : tail(other.tail), cur(other.cur),
+        old_areas(std::move(other.old_areas)) {
+    other._soft_clear();
+  }
+
+  monotonic_buffer(const monotonic_buffer& other) = delete;
+
+  // Frees the regions held so far and takes over those of `other`.
+  monotonic_buffer& operator=(monotonic_buffer&& other) noexcept {
+    // Self-move must be a no-op: otherwise the regions would be freed and
+    // then forgotten, leaving this buffer with dangling committed data.
+    if (this == &other) {
+      return *this;
+    }
+    _hard_clear();
+    cur = other.cur;
+    tail = other.tail;
+    old_areas = std::move(other.old_areas);
+    other._soft_clear();
+    return *this;
+  }
+
+  // Invalidates all previous data and resets the class into a clean state.
+  // Happens to keep the current contiguous memory region and free old regions.
+  void clear() {
+    for (auto old : old_areas) {
+      free(old.ptr_start);
+    }
+    old_areas.clear();
+    tail.ptr_end = tail.ptr_start = cur.ptr_start;
+  }
+
+  // Returns the size of memory allocated and held by this monotonic buffer (in
+  // units of sizeof(T)).
+  size_t total_allocated() const {
+    size_t result = cur.size();
+    for (auto old : old_areas) {
+      result += old.size();
+    }
+    return result;
+  }
+
+  // Appends and commits data.
+  // Requires the tail to be empty, to avoid bugs where previously staged data
+  // is committed.
+  span_ref<T> take_copy(span_ref<const T> data) {
+    assert(tail.size() == 0);
+    append_tail(data);
+    return commit_tail();
+  }
+
+  // Adds a staged data item.
+  void append_tail(T item) {
+    ensure_available(1);
+    *tail.ptr_end = item;
+    tail.ptr_end++;
+  }
+
+  // Adds staged data.
+  void append_tail(span_ref<const T> data) {
+    ensure_available(data.size());
+    std::copy(data.begin(), data.end(), tail.ptr_end);
+    tail.ptr_end += data.size();
+  }
+
+  // Throws away staged data, so its memory can be re-used.
+  void discard_tail() { tail.ptr_end = tail.ptr_start; }
+
+  // Changes staged data into committed data that will be kept until the buffer
+  // is cleared or destroyed. Returns the range that was committed.
+  span_ref<T> commit_tail() {
+    span_ref<T> result(tail);
+    tail.ptr_start = tail.ptr_end;
+    return result;
+  }
+
+  // Ensures it is possible to stage at least `min_required` more items without
+  // more reallocations.
+  // Throws std::bad_alloc if a new region is needed and cannot be allocated;
+  // the buffer is left unchanged in that case.
+  void ensure_available(size_t min_required) {
+    size_t available = cur.ptr_end - tail.ptr_end;
+    if (available >= min_required) {
+      return;
+    }
+
+    // The staged tail is copied into the new region, so the region must hold
+    // the tail *plus* the requested room; it also at least doubles in size.
+    size_t tail_size = tail.size();
+    size_t alloc_count = std::max(tail_size + min_required, cur.size() << 1);
+
+    // Reserve the bookkeeping slot first: after this, push_back cannot throw,
+    // so a failure can no longer happen between allocating and recording.
+    old_areas.reserve(old_areas.size() + 1);
+
+    T* fresh = (T*)malloc(alloc_count * sizeof(T));
+    if (nullptr == fresh) {
+      throw std::bad_alloc();
+    }
+
+    if (nullptr != cur.ptr_start) {
+      old_areas.push_back(cur);
+    }
+
+    // Staged data is not complete yet; keep it contiguous by copying it to the
+    // new larger memory region.
+    if (tail_size) {
+      std::move(tail.ptr_start, tail.ptr_end, fresh);
+    }
+
+    cur = {fresh, fresh + alloc_count};
+    tail = {fresh, fresh + tail_size};
+  }
+};
+
+#endif
diff --git a/src/qfvm_clifford/table.h b/src/qfvm_clifford/table.h
new file mode 100644
index 0000000..e06fba4
--- /dev/null
+++ b/src/qfvm_clifford/table.h
@@ -0,0 +1,347 @@
+#ifndef TABLE_H_
+#define TABLE_H_
+
+#include "bit.h"
+#include "bit_word.h"
+#include "packed_bit_word.h"
+#include "packed_bit_word_slice.h"
+#include "utils.h"
+#include <cstddef>
+#include <cstdint>
+#include <ostream>
+#include <random>
+#include <sstream>
+#include <string>
+
+// row-major table, padded and aligned to make table more efficient
+// major represents the row (not contiguous in memory), minor represents the
+// column (contiguous in memory), the smallest table is word_size * word_size
+template <size_t word_size> struct table {
+  size_t num_bit_words_major;
+  size_t num_bit_words_minor;
+
+  packed_bit_word<word_size> data;
+
+  table(size_t min_bits_major, size_t min_bits_minor)
+      : num_bit_words_major(bits_to_word_padded<word_size>(min_bits_major)),
+        num_bit_words_minor(bits_to_word_padded<word_size>(min_bits_minor)),
+        data(bits_to_bits_padded<word_size>(min_bits_major) *
+             bits_to_bits_padded<word_size>(min_bits_minor)) {}
+
+  // index operator, major_index is the row index
+  inline packed_bit_word_slice<word_size> operator[](size_t major_index) {
+    return data.slice(major_index * num_bit_words_minor, num_bit_words_minor);
+  }
+
+  // index operator (const), major_index is the row index
+  inline const packed_bit_word_slice<word_size>
+  operator[](size_t major_index) const {
+    return data.slice(major_index * num_bit_words_minor, num_bit_words_minor);
+  }
+
+  // equality operator
+  bool operator==(const table<word_size>& other) const {
+    return num_bit_words_major == other.num_bit_words_major &&
+           num_bit_words_minor == other.num_bit_words_minor &&
+           data == other.data;
+  }
+
+  bool operator!=(const table<word_size>& other) const {
+    return !(*this == other);
+  }
+
+  // convert the table to a string
+  std::string str() const { return std::string(*this); }
+
+  // for better printing
+  std::string str(size_t n) const {
+    std::stringstream ss;
+    for (size_t i = 0; i < n; i++) {
+      if (i)
+        ss << "\n";
+      for (size_t j = 0; j < n; j++)
+        ss << "_1"[(*this)[i][j]];
+    }
+
+    return ss.str();
+  }
+
+  operator std::string() const {
+    std::stringstream ss;
+    ss << *this;
+    return ss.str();
+  }
+
+  // Flat offset into data.bw of one bit_word. major_index selects a block of
+  // rows (presumably word_size rows, if BIT_POW == log2(word_size)),
+  // major_index_sub the row inside it, minor_index the word within that row.
+  inline size_t get_index_bit_word(const size_t major_index,
+                                   const size_t minor_index,
+                                   const size_t major_index_sub) const {
+    auto index =
+        (major_index << bit_word<word_size>::BIT_POW) + major_index_sub;
+    return index * num_bit_words_minor + minor_index;
+  }
+
+  // transpose the table
+  table<word_size> transpose() const {
+    table<word_size> result(num_bit_words_minor * word_size,
+                            num_bit_words_major * word_size);
+
+    for (size_t major_word = 0; major_word < num_bit_words_major;
+         major_word++) {
+      for (size_t minor_word = 0; minor_word < num_bit_words_minor;
+           minor_word++) {
+        for (size_t major_word_sub = 0; major_word_sub < word_size;
+             major_word_sub++) {
+          size_t src_index =
+              get_index_bit_word(major_word, minor_word, major_word_sub);
+          size_t dst_index =
+              result.get_index_bit_word(minor_word, major_word, major_word_sub);
+          result.data.bw[dst_index] = data.bw[src_index];
+        }
+      }
+    }
+
+    // transpose the bit word block, the shape of the block is (word_size,
+    // word_size)
+    for (size_t major_word = 0; major_word < result.num_bit_words_major;
+         major_word++) {
+      for (size_t minor_word = 0; minor_word < result.num_bit_words_minor;
+           minor_word++) {
+        size_t block_start =
+            result.get_index_bit_word(major_word, minor_word, 0);
+        bit_word<word_size>::inplace_transpose_square(
+            result.data.bw + block_start, result.num_bit_words_minor);
+      }
+    }
+
+    return result;
+  }
+
+  // inplace transpose, only for square matrix
+  table<word_size>& inplace_transpose() {
+
+    // transpose the bit word block, the shape of the block is (word_size,
+    // word_size)
+    for (size_t major_word = 0; major_word < num_bit_words_major;
+         major_word++) {
+      for (size_t minor_word = 0; minor_word < num_bit_words_minor;
+           minor_word++) {
+        size_t block_start = get_index_bit_word(major_word, minor_word, 0);
+        bit_word<word_size>::inplace_transpose_square(data.bw + block_start,
+                                                      num_bit_words_minor);
+      }
+    }
+
+    // transpose the table
+    for (size_t major_word = 0; major_word < num_bit_words_major; major_word++)
+      for (size_t minor_word = major_word + 1; minor_word < num_bit_words_minor;
+           minor_word++)
+        for (size_t major_word_sub = 0; major_word_sub < word_size;
+             major_word_sub++)
+          std::swap(data.bw[get_index_bit_word(major_word, minor_word,
+                                               major_word_sub)],
+                    data.bw[get_index_bit_word(minor_word, major_word,
+                                               major_word_sub)]);
+
+    return *this;
+  }
+
+  // square matrix multiplication (assuming row indexing)
+  table<word_size> square_matrix_mul(const table<word_size>& right,
+                                     size_t n) const {
+    auto tmp = right.transpose();
+
+    table<word_size> result(n, n);
+    for (std::size_t i = 0; i < n; i++) {
+      for (std::size_t j = 0; j < n; j++) {
+        bit_word<word_size> accumulater{};
+        (*this)[i].for_each_word(
+            tmp[j], [&](auto& a, auto& b) { accumulater ^= a & b; });
+        result[i][j] = accumulater.count() & 1;
+      }
+    }
+
+    return result;
+  }
+
+  // inverse (over GF(2)) of the leading n x n block, assumed lower triangular
+  table<word_size> inverse_for_lower_triangular_matrix(size_t n) const {
+    table<word_size> result = table<word_size>::identity(n);
+    packed_bit_word<word_size> tmp(num_bit_words_minor * word_size);
+
+    for (size_t i = 0; i < n; i++) {
+      tmp = (*this)[i];
+      // for each set bit j < i of the working row, XOR in row j (and result j)
+      for (size_t j = 0; j < i; j++) {
+        if (tmp[j]) {
+          tmp ^= (*this)[j];
+          result[i] ^= result[j];
+        }
+      }
+    }
+
+    return result;
+  }
+
+  // concatenate four tables
+  static table<word_size>
+  concatenate_four(size_t n, const table<word_size>& upper_left,
+                   const table<word_size>& upper_right,
+                   const table<word_size>& lower_left,
+                   const table<word_size>& lower_right) {
+    table<word_size> result(n << 1, n << 1);
+    for (size_t i = 0; i < n; i++) {
+      for (size_t j = 0; j < n; j++) {
+        result[i][j] = upper_left[i][j];
+        result[i][j + n] = upper_right[i][j];
+        result[i + n][j] = lower_left[i][j];
+        result[i + n][j + n] = lower_right[i][j];
+      }
+    }
+
+    return result;
+  }
+
+  // generate identity table
+  static table<word_size> identity(size_t n) {
+    table<word_size> result(n, n);
+    for (size_t i = 0; i < n; i++)
+      result[i][i] = true;
+
+    return result;
+  }
+
+  // generate random table
+  static table<word_size> random(size_t random_bits_major,
+                                 size_t random_bits_minor,
+                                 std::mt19937_64& rng) {
+    table<word_size> result(random_bits_major, random_bits_minor);
+    for (size_t major = 0; major < random_bits_major; major++)
+      result[major].randomize(random_bits_minor, rng);
+
+    return result;
+  }
+
+  // Sample from the quantum Mallows distribution, generate a bit string h and a
+  // permutation S
+  // TODO: fix -Wstringop-overflow in test when n is 1
+  static inline std::pair<std::vector<bool>, std::vector<size_t>>
+  sample_quantum_mallows(size_t n, std::mt19937_64& rng) {
+    auto r_dis = std::uniform_real_distribution<double>(0, 1);
+    std::vector<bool> h;
+    std::vector<size_t> S;
+    std::vector<size_t> A;
+    for (size_t i = 0; i < n; i++)
+      A.push_back(i);
+    for (size_t i = 0; i < n; i++) {
+      auto m = A.size();
+      auto r = r_dis(rng);
+      auto eps = pow(4, -int(m));
+      // kd is expected in [0, 2m); r == 0 yields 2m (or +inf once eps
+      // underflows), so clamp it before the reflection below can wrap around.
+      auto kd = -ceil(log2(r + (1 - r) * eps));
+      auto k = kd < double(2 * m) ? size_t(kd) : 2 * m - 1;
+      h.push_back(k < m);
+      if (k >= m)
+        k = 2 * m - k - 1;
+      S.push_back(A[k]);
+      A.erase(A.begin() + k);
+    }
+    return {h, S};
+  }
+
+  // Samples a random valid stabilizer tableau.
+  // reference: Generation of random Clifford operators in
+  // https://arxiv.org/pdf/2003.09412.pdf
+  static table<word_size> random_valid_stabilizer_table(size_t n,
+                                                        std::mt19937_64& rng) {
+    auto h_S = sample_quantum_mallows(n, rng);
+
+    const auto& h = h_S.first;
+    const auto& S = h_S.second;
+
+    table<word_size> symmetric(n, n);
+    for (size_t i = 0; i < n; i++) {
+      symmetric[i].randomize(i + 1, rng);
+      for (size_t j = 0; j < i; j++)
+        symmetric[j][i] = symmetric[i][j];
+    }
+
+    table<word_size> symmetric_m(n, n);
+    for (size_t i = 0; i < n; i++) {
+      symmetric_m[i].randomize(i + 1, rng);
+      symmetric_m[i][i] &= h[i];
+      for (size_t j = 0; j < i; j++) {
+        bool b = h[i] && h[j];
+        b |= h[i] > h[j] && S[i] < S[j];
+        b |= h[i] < h[j] && S[i] > S[j];
+        symmetric_m[i][j] &= b;
+        symmetric_m[j][i] = symmetric_m[i][j];
+      }
+    }
+
+    auto lower = table<word_size>::identity(n);
+    for (size_t i = 0; i < n; i++)
+      lower[i].randomize(i, rng);
+
+    auto lower_m = table<word_size>::identity(n);
+    for (size_t i = 0; i < n; i++) {
+      lower_m[i].randomize(i, rng);
+      for (size_t j = 0; j < i; j++) {
+        bool b = h[i] < h[j];
+        b |= h[i] && h[j] && S[i] > S[j];
+        b |= !h[i] && !h[j] && S[i] < S[j];
+        lower_m[i][j] &= b;
+      }
+    }
+
+    // a normalized probability distribution, P_n(h, S) is the fraction of
+    // n-qubit Clifford operators U such that the canonical form of U defined in
+    // Theorem 1 contains a layer of h gates labeled by h and a qubit
+    // permutation S.
+    auto prod = symmetric.square_matrix_mul(lower, n);
+    auto prod_m = symmetric_m.square_matrix_mul(lower_m, n);
+
+    auto inv = lower.inverse_for_lower_triangular_matrix(n);
+    auto inv_m = lower_m.inverse_for_lower_triangular_matrix(n);
+
+    inv.inplace_transpose();
+    inv_m.inplace_transpose();
+
+    // the first n columns represent Pauli operators Fx_iF^{-1} (ignoring the
+    // phase) and the last n columns represent Fz_iF^{−1} . Stabilizer tableau
+    // of the Hadamard stage and qubit permutation layers in the canonical form
+    auto fused = table<word_size>::concatenate_four(
+        n, lower, table<word_size>(n, n), prod, inv);
+    auto fused_m = table<word_size>::concatenate_four(
+        n, lower_m, table<word_size>(n, n), prod_m, inv_m);
+
+    table<word_size> u(2 * n, 2 * n);
+    for (size_t i = 0; i < n; i++) {
+      u[i] = fused[S[i]];
+      u[i + n] = fused[S[i] + n];
+    }
+
+    // hadamards
+    for (size_t i = 0; i < n; i++)
+      if (h[i])
+        u[i].swap(u[i + n]);
+
+    return fused_m.square_matrix_mul(u, 2 * n);
+  }
+};
+
+template <size_t word_size>
+std::ostream& operator<<(std::ostream& os, const table<word_size>& t) {
+  // one line per padded row: there are num_bit_words_major * word_size rows
+  for (size_t i = 0; i < t.num_bit_words_major * word_size; i++) {
+    if (i)
+      os << "\n";
+    os << t[i];
+  }
+  return os;
+}
+
+#endif
diff --git a/src/qfvm_clifford/tableau.h b/src/qfvm_clifford/tableau.h
new file mode 100644
index 0000000..7cb880e
--- /dev/null
+++ b/src/qfvm_clifford/tableau.h
@@ -0,0 +1,434 @@
+#ifndef TABLEAU_H_
+#define TABLEAU_H_
+
+#include "gate_macro.h"
+#include "packed_bit_word.h"
+#include "pauli.h"
+#include "pauli_slice.h"
+#include "table.h"
+#include <algorithm>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <functional>
+#include <math.h>
+#include <optional>
+#include <random>
+#include <string>
+#include <unordered_map>
+#include <variant>
+#include <vector>
+
+// tableau trans is the transpose of the tableau
+template <size_t word_size> struct tableau_trans;
+
+// tableau is the main class of the clifford simulator
+template <size_t word_size> struct tableau;
+
+// quantum gate type
+// SINGLE_QUBIT_GATE: single qubit gate
+// TWO_QUBIT_GATE: two qubit gate
+// COLLAPSING_GATE: measurement gate
+// ERROR_QUBIT_GATE: error gate, which has some probability to apply the gate
+enum gate_type {
+  SINGLE_QUBIT_GATE,
+  TWO_QUBIT_GATE,
+  COLLAPSING_GATE,
+  ERROR_QUBIT_GATE
+};
+
+// quantum gate function return type
+// std::nullopt: the return value of a non-measurement quantum gate
+// bool: the return value of a measurement quantum gate (COLLAPSING_GATE)
+using result = std::optional<bool>;
+
+// quantum gate function type
+// A variant covering every quantum gate function signature, used as the value
+// type stored in the gate map. The first alternative is for single qubit gates
+// and error qubit gates, the second is for two qubit gates, and the third is
+// for collapsing gates.
+template <size_t word_size>
+using func_type = std::variant<
+    std::function<result(tableau<word_size>& t, const size_t qubit)>,
+    std::function<result(tableau<word_size>& t, const size_t qubit1,
+                         const size_t qubit2)>,
+    std::function<result(tableau<word_size>& t, std::mt19937_64& rng,
+                         const size_t qubit)>>;
+
+// the implementation of the quantum gate function
+#define FUNCTION_REGISTRATION
+#include "gate_list.h"
+#undef FUNCTION_REGISTRATION
+
+// the gate map, which is used to store all the quantum gate function, the index
+// is the gate name
+template <size_t word_size>
+std::unordered_map<std::string, std::pair<gate_type, func_type<word_size>>>
+    gate_map = {
+#define GATE_MAP_REGISTRATION
+#include "gate_list.h"
+#undef GATE_MAP_REGISTRATION
+};
+
+// Inner struct of the tableau, which is used to store the distabilizer and
+// stabilizer of the tableau.
+//
+// Reference: https://arxiv.org/pdf/quant-ph/0406196.pdf
+template <size_t word_size> struct _tableau {
+  size_t num_qubits;
+
+  // The stabilizer tableau is represented by two bit tables, one for X and one
+  // for Z.
+  table<word_size> xs_t, zs_t;
+
+  // The signs of the tableau
+  packed_bit_word<word_size> signs;
+
+  // constructor, n is the number of qubits
+  _tableau(size_t n) : num_qubits(n), xs_t(n, n), zs_t(n, n), signs(n) {}
+
+  // index operator
+  pauli_string_slice<word_size> operator[](size_t qubit) {
+    // if padding, the number of real words is not the same with the number of
+    // padded words
+    size_t num_words = (num_qubits + word_size - 1) / word_size;
+    return pauli_string_slice<word_size>(num_qubits, signs[qubit],
+                                         xs_t[qubit].slice(0, num_words),
+                                         zs_t[qubit].slice(0, num_words));
+  }
+
+  const pauli_string_slice<word_size> operator[](size_t qubit) const {
+    size_t num_words = (num_qubits + word_size - 1) / word_size;
+    return pauli_string_slice<word_size>(num_qubits, signs[qubit],
+                                         xs_t[qubit].slice(0, num_words),
+                                         zs_t[qubit].slice(0, num_words));
+  }
+};
+
+// A Clifford operation is a unitary quantum operation that conjugates Pauli
+// products into Pauli products. C is Clifford if, for all pauli products P, it
+// is the case that C^*PC is also a Pauli product. In fact, a Clifford operation
+// can be uniquely identified (up to global phase) by how it conjugates Pauli
+// products.
+// A stabilizer tableau is a representation of a Clifford operation
+// that simply directly stores how the Clifford operation conjugates each
+// generator of the Pauli group.
+template <size_t word_size> struct tableau {
+
+  size_t num_qubits;
+
+  // n distabilizer generators which are Pauli operators that together with the
+  // stabilizer generators generate the full Pauli group
+  _tableau<word_size> distabilizer;
+  _tableau<word_size> stabilizer;
+
+  // constructor
+  explicit tableau(size_t num_qubits)
+      : num_qubits(num_qubits), distabilizer(num_qubits),
+        stabilizer(num_qubits) {
+    // Initialize identity elements along the diagonal. The state is |0...0>
+    for (size_t q = 0; q < num_qubits; q++) {
+      distabilizer.xs_t[q][q] = true;
+      stabilizer.zs_t[q][q] = true;
+    }
+  }
+
+  // convert tableau to string
+  operator std::string() const {
+    std::stringstream ss;
+    ss << *this;
+    return ss.str();
+  }
+
+// quantum gate function registration
+#define STRUCT_FUNCTION_REGISTRATION
+#include "gate_list.h"
+#undef STRUCT_FUNCTION_REGISTRATION
+
+  std::string str() const { return std::string(*this); }
+
+  // return identity tableau, that means the state is |0...0>
+  static tableau<word_size> identity(size_t num_qubits) {
+    return tableau<word_size>(num_qubits);
+  }
+
+  void reset() { *this = tableau<word_size>(num_qubits); }
+  void reset(std::mt19937_64& rng, size_t qubit) { r_gate(rng, qubit); }
+  // void reset_x(std::mt19937_64& rng, size_t qubit) { rx_gate(rng, qubit); }
+  // void reset_y(std::mt19937_64& rng, size_t qubit) { ry_gate(rng, qubit); }
+
+  // expand current tableau to new_num_qubits
+  // args:
+  //   new_num_qubits: the new number of qubits
+  //   resize_pad_factor: the resize pad factor, which means leave more space
+  //   for future storage of more quantum bits.
+  void expand(size_t new_num_qubits, double resize_pad_factor) {
+
+    assert(new_num_qubits >= num_qubits);
+    assert(resize_pad_factor >= 1);
+
+    if (new_num_qubits <= distabilizer.xs_t.num_bit_words_major * word_size) {
+      size_t old_num_qubits = num_qubits;
+      num_qubits = new_num_qubits;
+      distabilizer.num_qubits = new_num_qubits;
+      stabilizer.num_qubits = new_num_qubits;
+
+      // Initialize identity elements along the diagonal.
+      for (size_t k = old_num_qubits; k < new_num_qubits; k++) {
+        distabilizer.xs_t[k][k] = true;
+        stabilizer.zs_t[k][k] = true;
+      }
+
+      return;
+    }
+
+    size_t old_num_bit_words = distabilizer.xs_t.num_bit_words_major;
+    size_t old_num_qubits = num_qubits;
+    tableau<word_size> old_state = std::move(*this);
+    *this = tableau<word_size>((size_t)(new_num_qubits * resize_pad_factor));
+    this->num_qubits = new_num_qubits;
+    this->distabilizer.num_qubits = new_num_qubits;
+    this->stabilizer.num_qubits = new_num_qubits;
+
+    // Copy stored state back into new larger space.
+    auto partial_copy = [=](packed_bit_word_slice<word_size> dst,
+                            packed_bit_word_slice<word_size> src) {
+      dst.slice(0, old_num_bit_words) = src;
+    };
+    partial_copy(distabilizer.signs, old_state.distabilizer.signs);
+    partial_copy(stabilizer.signs, old_state.stabilizer.signs);
+    for (size_t k = 0; k < old_num_qubits; k++) {
+      partial_copy(distabilizer[k].xs, old_state.distabilizer[k].xs);
+      partial_copy(distabilizer[k].zs, old_state.distabilizer[k].zs);
+      partial_copy(stabilizer[k].xs, old_state.stabilizer[k].xs);
+      partial_copy(stabilizer[k].zs, old_state.stabilizer[k].zs);
+    }
+  }
+
+  // transpose each table of the tableau
+  void inplace_transpose() {
+    stabilizer.xs_t.inplace_transpose();
+    stabilizer.zs_t.inplace_transpose();
+    distabilizer.xs_t.inplace_transpose();
+    distabilizer.zs_t.inplace_transpose();
+  }
+
+  // Clifford state measurements only have three probabilities: (p0, p1) = (0.5,
+  // 0.5), (1, 0), or (0, 1). The random case happens if there is a row
+  // anti-commuting with Z[qubit].
+  bool is_deterministic_z(size_t target_qubit) const {
+    return !stabilizer[target_qubit].xs.is_not_all_zero();
+  }
+
+  bool is_deterministic_x(size_t target_qubit) const {
+    return !distabilizer[target_qubit].xs.is_not_all_zero();
+  }
+
+  bool is_deterministic_y(size_t target_qubit) const {
+    return distabilizer[target_qubit].xs == stabilizer[target_qubit].xs;
+  }
+
+  pauli_string<word_size> eval_y_obs(size_t qubit) const {
+    pauli_string<word_size> result = distabilizer[qubit];
+    uint8_t log_i = pauli_string_slice<word_size>(result).inplace_right_mul(
+        stabilizer[qubit]);
+    log_i++;
+    assert((log_i & 1) == 0);
+    if (log_i & 2) {
+      result.sign ^= true;
+    }
+    return result;
+  }
+
+  // collapse the qubit along z axis
+  // args:
+  //   t_trans: the transpose of the tableau
+  //   target_qubit: the target qubit
+  //   rng: the random number generator
+  size_t collapse_qubit_along_z(tableau_trans<word_size>& t_trans,
+                                size_t target_qubit, std::mt19937_64& rng) {
+
+    size_t pivot = 0;
+
+    // search for any generator that anti-commutes with the measurement
+    // observable
+    while (pivot < num_qubits &&
+           !t_trans.t.stabilizer.xs_t[pivot][target_qubit])
+      pivot++;
+
+    // Such a pivot does not exist. In this case the outcome is determinate, so
+    // measuring the state will not change it; the only task is to determine
+    // whether 0 or 1 is observed.
+    if (pivot == num_qubits)
+      return SIZE_MAX;
+
+    // perform partial gaussian elimination over the stabilizer generators that
+    // anti-commute with the measurement. do this by introducing
+    // no-effect-because-control-is-zero CNOT at the beginning of time.
+    for (size_t k = pivot + 1; k < num_qubits; k++)
+      if (t_trans.t.stabilizer.xs_t[k][target_qubit])
+        t_trans.cnot_gate(pivot, k);
+
+    // swap the non-isolated anti-commuting stabilizer generator for one that
+    // commutes with the measurement
+    if (t_trans.t.stabilizer.zs_t[pivot][target_qubit]) {
+      t_trans.h_yz_gate(pivot);
+    } else {
+      t_trans.h_gate(pivot);
+    }
+
+    // assign measure result
+    bool result_if_measured = rng() & 1;
+    if (stabilizer.signs[target_qubit] != result_if_measured) {
+      t_trans.x_gate(pivot);
+    };
+
+    return pivot;
+  }
+
+  // random valid stabilizer tableau
+  // reference: https://arxiv.org/abs/2003.09412
+  static tableau<word_size>
+  random_valid_stabilizer_tableau(size_t num_qubits, std::mt19937_64& rng) {
+    auto raw = table<word_size>::random_valid_stabilizer_table(num_qubits, rng);
+    tableau<word_size> result(num_qubits);
+    for (size_t row = 0; row < num_qubits; row++) {
+      for (size_t col = 0; col < num_qubits; col++) {
+        result.distabilizer[row].xs[col] = raw[row][col];
+        result.distabilizer[row].zs[col] = raw[row][col + num_qubits];
+        result.stabilizer[row].xs[col] = raw[row + num_qubits][col];
+        result.stabilizer[row].zs[col] =
+            raw[row + num_qubits][col + num_qubits];
+      }
+    }
+
+    result.distabilizer.signs.randomize(num_qubits, rng);
+    result.stabilizer.signs.randomize(num_qubits, rng);
+    return result;
+  }
+
+  // check whether the tableau satisfy the invariants, the tableau need to
+  // preserve commutativity
+  // everything must commute, except for X_k anticommuting with Z_k for each k.
+  bool satisfy_invariants() const {
+    for (size_t q1 = 0; q1 < num_qubits; q1++) {
+      auto x1 = distabilizer[q1];
+      auto z1 = stabilizer[q1];
+
+      if (x1.commutes(z1))
+        return false;
+
+      for (size_t q2 = q1 + 1; q2 < num_qubits; q2++) {
+        auto x2 = distabilizer[q2];
+        auto z2 = stabilizer[q2];
+
+        if (!x1.commutes(x2) || !x1.commutes(z2) || !z1.commutes(x2) ||
+            !z1.commutes(z2))
+          return false;
+      }
+    }
+
+    return true;
+  }
+};
+
+template <size_t word_size>
+std::ostream& operator<<(std::ostream& out, const tableau<word_size>& t) {
+  out << "+-";
+  for (size_t k = 0; k < t.num_qubits; k++) {
+    out << "xz-";
+  }
+  out << "+\n|";
+  for (size_t k = 0; k < t.num_qubits; k++) {
+    out << ' ' << "+-"[t.distabilizer[k].sign] << "+-"[t.stabilizer[k].sign];
+  }
+  for (size_t q = 0; q < t.num_qubits; q++) {
+    out << " |\n|";
+    for (size_t k = 0; k < t.num_qubits; k++) {
+      out << ' '
+          << "IXZY"[t.distabilizer[k].xs[q] | t.distabilizer[k].zs[q] << 1]
+          << "IXZY"[t.stabilizer[k].xs[q] | t.stabilizer[k].zs[q] << 1];
+    }
+  }
+  out << " |";
+  return out;
+}
+
+// Holds a reference to a tableau: transposes it at construction and, after
+// some computation, transposes it back at destruction
+template <size_t word_size> struct tableau_trans {
+  // reference to the tableau
+  tableau<word_size>& t;
+
+  // constructor
+  explicit tableau_trans(tableau<word_size>& t_in) : t(t_in) {
+    t.inplace_transpose();
+  };
+  tableau_trans() = delete;
+
+  // copy and move constructors (deleted)
+  tableau_trans(const tableau_trans<word_size>& t) = delete;
+  tableau_trans(tableau_trans<word_size>&& t) = delete;
+
+  // destructor
+  ~tableau_trans() { t.inplace_transpose(); }
+
+  // Iterates over the Paulis in a row of the tableau.
+  //
+  // args
+  //   q: The row to iterate over.
+  //   body: A function taking X, Z, and SIGN words. The X and Z words are
+  //   chunks of xz-encoded Paulis from the row. The SIGN word is the
+  //   corresponding chunk of sign bits from the sign row.
+  template <typename FUNC>
+  inline void for_each_trans_obs(const size_t q, FUNC body) {
+    for (size_t k = 0; k < 2; k++) {
+      _tableau<word_size>& h = k == 0 ? t.distabilizer : t.stabilizer;
+      pauli_string_slice<word_size> p = h[q];
+      p.xs.for_each_word(p.zs, h.signs, body);
+    }
+  }
+
+  template <typename FUNC>
+  inline void for_each_trans_obs(const size_t q1, const size_t q2, FUNC body) {
+    for (size_t k = 0; k < 2; k++) {
+      _tableau<word_size>& h = k == 0 ? t.distabilizer : t.stabilizer;
+      pauli_string_slice<word_size> p1 = h[q1];
+      pauli_string_slice<word_size> p2 = h[q2];
+      p1.xs.for_each_word(p1.zs, p2.xs, p2.zs, h.signs, body);
+    }
+  }
+
+  tableau_trans<word_size>& x_gate(const size_t qubit) {
+    for_each_trans_obs(qubit, [](auto& x, auto& z, auto& s) { s ^= z; });
+    return *this;
+  }
+
+  tableau_trans<word_size>& h_gate(const size_t qubit) {
+    for_each_trans_obs(qubit, [](auto& x, auto& z, auto& s) {
+      std::swap(x, z);
+      s ^= x & z;
+    });
+    return *this;
+  }
+
+  tableau_trans<word_size>& h_yz_gate(const size_t qubit) {
+    for_each_trans_obs(qubit, [](auto& x, auto& z, auto& s) {
+      s ^= z.andnot(x);
+      x ^= z;
+    });
+    return *this;
+  }
+
+  tableau_trans<word_size>& cnot_gate(const size_t control_qubit,
+                                      const size_t target_qubit) {
+    for_each_trans_obs(control_qubit, target_qubit,
+                       [](auto& cx, auto& cz, auto& tx, auto& tz, auto& s) {
+                         s ^= (cz ^ tx).andnot(cx & tz);
+                         cz ^= tz;
+                         tx ^= cx;
+                       });
+    return *this;
+  }
+};
+#endif
diff --git a/src/qfvm_clifford/utils.h b/src/qfvm_clifford/utils.h
new file mode 100644
index 0000000..3ec0a27
--- /dev/null
+++ b/src/qfvm_clifford/utils.h
@@ -0,0 +1,40 @@
+#ifndef TABLEAU_ELEMENT_UTILS_H_
+#define TABLEAU_ELEMENT_UTILS_H_
+
+#include <cstddef>
+#include <cstdint>
+
+template <size_t word_size> constexpr size_t bits_to_bits_padded(size_t bits) {
+  return (bits + (word_size - 1)) & ~(word_size - 1);
+}
+
+template <size_t word_size> constexpr size_t bits_to_word_padded(size_t bits) {
+  return bits_to_bits_padded<word_size>(bits) / word_size;
+}
+
+// Population count of a 64-bit value using the parallel bit-counting trick:
+// https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+inline uint8_t count_uint64_bits(uint64_t value) {
+  const uint64_t m1 = 0x5555555555555555ull, m2 = 0x3333333333333333ull;
+  const uint64_t m4 = 0x0F0F0F0F0F0F0F0Full, h01 = 0x0101010101010101ull;
+  const uint64_t pairs = value - ((value >> 1) & m1);
+  const uint64_t nibbles = (pairs & m2) + ((pairs >> 2) & m2);
+  const uint64_t bytes = (nibbles + (nibbles >> 4)) & m4;
+  return static_cast<uint8_t>((bytes * h01) >> 56);
+}
+
+// Concatenate preprocessor tokens A and B without expanding macro definitions
+// (however, if invoked from a macro, macro arguments are expanded).
+#define PPCAT_NX(A, B) A##B
+
+// Concatenate preprocessor tokens A and B after macro-expanding them.
+#define PPCAT(A, B) PPCAT_NX(A, B)
+
+// Turn A into a string literal without expanding macro definitions (however, if
+// invoked from a macro, macro arguments are expanded).
+#define STRINGIZE_NX(A) #A
+
+// Turn A into a string literal after macro-expanding it.
+#define STRINGIZE(A) STRINGIZE_NX(A)
+
+#endif
diff --git a/src/qfvm_gpu/apply_gate_custate.cuh b/src/qfvm_gpu/apply_gate_custate.cuh
index 2d9c2e7..10a0513 100644
--- a/src/qfvm_gpu/apply_gate_custate.cuh
+++ b/src/qfvm_gpu/apply_gate_custate.cuh
@@ -3,44 +3,42 @@
 #include <custatevec.h>
 #include <helper_custatevec.hpp>
 
-
-void apply_gate_custate(cuDoubleComplex *psi_d, QuantumOperator &op, int n)
-{   
-         
-    //get information form op
-    auto pos = op.positions();
-    const int nTargets  = op.targe_num();
-    const int nControls = op.control_num();
-    const int adjoint   = 0;
-
-    vector<int> targets{pos.begin()+nControls, pos.end()};
-    vector<int> controls{pos.begin(), pos.begin()+nControls};
-
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = 
-    reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-
-    // custatevec handle initialization
-    custatevecHandle_t handle;
-    custatevecCreate(&handle) ;
-    void* extraWorkspace = nullptr;
-    size_t extraWorkspaceSizeInBytes = 0;
-
-    // check the size of external workspace
-    custatevecApplyMatrixGetWorkspaceSize(
-                handle, CUDA_C_64F, n, mat, CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW,
-                adjoint, nTargets, nControls, CUSTATEVEC_COMPUTE_64F, &extraWorkspaceSizeInBytes) ;
-
-    // allocate external workspace if necessary
-    if (extraWorkspaceSizeInBytes > 0)
-        cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes);
-
-    custatevecApplyMatrix(
-                    handle, psi_d, CUDA_C_64F, n, mat, CUDA_C_64F,
-                    CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, targets.data(), nTargets, controls.data(), nullptr,
-                    nControls, CUSTATEVEC_COMPUTE_64F, extraWorkspace, extraWorkspaceSizeInBytes);
-
-                      // destroy handle
-    custatevecDestroy(handle);    
+// Applies op's matrix to the state vector psi_d (n index bits) via cuStateVec.
+void apply_gate_custate(cuDoubleComplex* psi_d, QuantumOperator& op, int n) {
+  // get information from op
+  auto pos = op.positions();
+  const int nTargets = op.targe_num();
+  const int nControls = op.control_num();
+  const int adjoint = 0;
+  vector<int> targets{pos.begin() + nControls, pos.end()};
+  vector<int> controls{pos.begin(), pos.begin() + nControls};
+
+  auto mat_temp = op.mat();
+  cuDoubleComplex* mat = reinterpret_cast<cuDoubleComplex*>(mat_temp.data());
+
+  // custatevec handle initialization
+  custatevecHandle_t handle;
+  custatevecCreate(&handle);
+  void* extraWorkspace = nullptr;
+  size_t extraWorkspaceSizeInBytes = 0;
+
+  // check the size of external workspace
+  custatevecApplyMatrixGetWorkspaceSize(
+      handle, CUDA_C_64F, n, mat, CUDA_C_64F, CUSTATEVEC_MATRIX_LAYOUT_ROW,
+      adjoint, nTargets, nControls, CUSTATEVEC_COMPUTE_64F,
+      &extraWorkspaceSizeInBytes);
+
+  // allocate external workspace if necessary
+  if (extraWorkspaceSizeInBytes > 0)
+    cudaMalloc(&extraWorkspace, extraWorkspaceSizeInBytes);
+
+  custatevecApplyMatrix(handle, psi_d, CUDA_C_64F, n, mat, CUDA_C_64F,
+                        CUSTATEVEC_MATRIX_LAYOUT_ROW, adjoint, targets.data(),
+                        nTargets, controls.data(), nullptr, nControls,
+                        CUSTATEVEC_COMPUTE_64F, extraWorkspace,
+                        extraWorkspaceSizeInBytes);
+
+  // destroy handle, then free the workspace (cudaFree(nullptr) is a no-op)
+  custatevecDestroy(handle);
+  cudaFree(extraWorkspace);
 }
-    
\ No newline at end of file
diff --git a/src/qfvm_gpu/apply_gate_gpu.cuh b/src/qfvm_gpu/apply_gate_gpu.cuh
index e2d7be0..5f9bfd5 100644
--- a/src/qfvm_gpu/apply_gate_gpu.cuh
+++ b/src/qfvm_gpu/apply_gate_gpu.cuh
@@ -1,411 +1,455 @@
 #pragma once
+#include <algorithm>
 #include <cuda_runtime.h>
 #include <helper_cuda.h>
 #include <types.hpp>
-#include <algorithm>
 #include <util.h>
 
-struct targeIndex
-{
-    size_t ind0;
-    size_t ind1;
+struct targeIndex {
+  size_t ind0;
+  size_t ind1;
 };
 
-
 __constant__ uint posv_d[50];
 __constant__ uint posv_sorted_d[50];
-__constant__ cuDoubleComplex mat_d_const[32*32]; //If target qubit < 5, use const memory; 
+__constant__ cuDoubleComplex
+    mat_d_const[32 * 32]; // gate matrix for up to 5 targets (32 x 32 entries)
 __constant__ uint mat_mask_d_const[32];
 
-
 //-------------Single-target gate-------------------------------
-template<class Func>
-__global__ void apply_one_targe_gate_kernel(cuDoubleComplex *psi_d, Func get_index, int rsize){
-    cuDoubleComplex mat00 = mat_d_const[0];
-    cuDoubleComplex mat01 = mat_d_const[1];
-    cuDoubleComplex mat10 = mat_d_const[2];
-    cuDoubleComplex mat11 = mat_d_const[3];
-
-    unsigned int gridSize = blockDim.x*gridDim.x;
-    for (int j = blockDim.x * blockIdx.x + threadIdx.x; j < rsize;j+= gridSize){
-        targeIndex ind = get_index(j);
-        cuDoubleComplex temp = psi_d[ind.ind0];
-        psi_d[ind.ind0] = cuCadd(cuCmul(mat00, psi_d[ind.ind0]), cuCmul(mat01, psi_d[ind.ind1]));
-        psi_d[ind.ind1] = cuCadd(cuCmul(mat10, temp), cuCmul(mat11, psi_d[ind.ind1]));
-    }
-}
-
-template<int ctrl_num>
-void apply_one_targe_gate_gpu(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){ 
-    //copy mat to device
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = 
-    reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-    checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, 4*sizeof(cuDoubleComplex)));
-    size_t rsize;
-    size_t offset;
-    size_t targe;
-    size_t control;
-    size_t setbit;
-    size_t poffset;
-    if (ctrl_num == 0){
-        targe = op.positions()[0];
-        offset = 1ll<<targe;
-        rsize = size>>1;
-        auto getind_func = [offset, targe] __device__ (size_t j)-> targeIndex {
-            size_t ind0 = (j&(offset-1)) | (j>>targe<<targe<<1);
-            size_t ind1 = ind0 + offset;
-            return {ind0, ind1};
-        };
-
-        size_t blockdim = rsize <= 1024 ? rsize : 1024;
-        size_t griddim = rsize / blockdim;
-        apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d,  getind_func, rsize);
-    }
-    else if(ctrl_num == 1){
-        control = op.positions()[0];
-        targe = op.positions()[1];
-        offset = 1ll<<targe;
-        setbit = 1ll<<control;
-        if (control>targe) {
-            control--;
-        }
-        poffset=1ll<<control;
-        rsize = size>>2;
-        auto getind_func = [control, targe, poffset, offset, setbit] __device__ (size_t j) -> targeIndex {
-            size_t ind0 = (j>>control<<(control+1))|(j&(poffset-1));
-            ind0 = (ind0 >> targe << (targe+1))|(ind0 &(offset-1))|setbit;
-            size_t ind1 = ind0 + offset;
-            return {ind0, ind1};
-        };
-        
-
-        size_t blockdim = rsize <= 1024 ? rsize : 1024;
-        size_t griddim = rsize / blockdim;
-       
-        apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d,  getind_func, rsize);      
-    }
-    else if(ctrl_num == 2){    
-        targe = op.positions().back();
-        offset = 1ll<<targe;
-        uint psize = op.positions().size();
-        rsize = size>>psize;
-
-        vector<pos_t> posv_sorted = op.positions();
-        std::sort(posv_sorted.begin(), posv_sorted.end());
-        //Copy pos to device
-        checkCudaErrors(cudaMemcpyToSymbol(posv_d, op.positions().data(), psize*sizeof(uint)));
-        checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
-
-        auto getind_func = [offset, psize] __device__ (size_t j)-> targeIndex{
-            size_t ind0 = j;
-            for (pos_t k=0;k < psize;k++)
-            {   
-                pos_t _pos = posv_sorted_d[k];
-                ind0 = (ind0&((1ll<<_pos)-1)) | (ind0>>_pos<<_pos<<1);
-            }
-            for (pos_t k=0;k < psize-1;k++){
-                ind0 |= 1ll<<posv_d[k];
-            }
-
-            size_t ind1 = ind0 + offset;
-            return {ind0, ind1};
-        };
-        size_t blockdim = rsize <= 1024 ? rsize : 1024;
-        size_t griddim = rsize / blockdim;
-        apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d, getind_func, rsize);
-    } 
+template <class Func> // Func: device callable, pair number j -> targeIndex
+__global__ void apply_one_targe_gate_kernel(cuDoubleComplex* psi_d,
+                                            Func get_index, int rsize) {
+  cuDoubleComplex mat00 = mat_d_const[0]; // 2x2 matrix, stored row by row
+  cuDoubleComplex mat01 = mat_d_const[1];
+  cuDoubleComplex mat10 = mat_d_const[2];
+  cuDoubleComplex mat11 = mat_d_const[3];
+  // Apply the matrix to every amplitude pair; a thread strides by gridSize.
+  unsigned int gridSize = blockDim.x * gridDim.x;
+  for (int j = blockDim.x * blockIdx.x + threadIdx.x; j < rsize;
+       j += gridSize) {
+    targeIndex ind = get_index(j);
+    cuDoubleComplex temp = psi_d[ind.ind0]; // row 1 needs the old ind0 value
+    psi_d[ind.ind0] =
+        cuCadd(cuCmul(mat00, psi_d[ind.ind0]), cuCmul(mat01, psi_d[ind.ind1]));
+    psi_d[ind.ind1] =
+        cuCadd(cuCmul(mat10, temp), cuCmul(mat11, psi_d[ind.ind1]));
+  }
 }
 
-template<int targe_num>
-__global__ void apply_2to4_targe_gate_kernel(cuDoubleComplex *psi_d,  uint ctrlnum, int psize){
-    constexpr uint matlen = 1<<targe_num;
-    uint block_length = blockDim.x;
-    size_t i = blockDim.x * blockIdx.x + threadIdx.x;
-    // Insert zeros
-    for(size_t k=0;k < psize;k++){
-        size_t _pos = posv_sorted_d[k];
-        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-    }
-    // Set control
-    for (size_t k=0; k < ctrlnum;k++){
-        i |= 1ll<<posv_d[k];
-    }
-
-    cuDoubleComplex psi_d_buffer[matlen];
-    for (int y = 0; y < matlen;++y){
-        psi_d_buffer[y] = {0., 0.};
-        for (int x = 0; x < matlen;++x){
-            psi_d_buffer[y] = cuCadd(psi_d_buffer[y], cuCmul(psi_d[i | mat_mask_d_const[x]], mat_d_const[y*matlen+x]));
-        }
+template <int ctrl_num>
+void apply_one_targe_gate_gpu(cuDoubleComplex* psi_d, QuantumOperator& op,
+                              size_t size) {
+  // copy mat to device
+  auto mat_temp = op.mat();
+  cuDoubleComplex* mat = reinterpret_cast<cuDoubleComplex*>(mat_temp.data());
+  checkCudaErrors(
+      cudaMemcpyToSymbol(mat_d_const, mat, 4 * sizeof(cuDoubleComplex)));
+  size_t rsize;
+  size_t offset;
+  size_t targe;
+  size_t control;
+  size_t setbit;
+  size_t poffset;
+  if (ctrl_num == 0) {
+    targe = op.positions()[0];
+    offset = 1ll << targe;
+    rsize = size >> 1;
+    auto getind_func = [offset, targe] __device__(size_t j) -> targeIndex {
+      size_t ind0 = (j & (offset - 1)) | (j >> targe << targe << 1);
+      size_t ind1 = ind0 + offset;
+      return {ind0, ind1};
+    };
+
+    size_t blockdim = rsize <= 1024 ? rsize : 1024;
+    size_t griddim = rsize / blockdim;
+    apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d, getind_func,
+                                                       rsize);
+  } else if (ctrl_num == 1) {
+    control = op.positions()[0];
+    targe = op.positions()[1];
+    offset = 1ll << targe;
+    setbit = 1ll << control;
+    if (control > targe) {
+      control--;
     }
-    for (int y = 0; y < matlen;++y) psi_d[i | mat_mask_d_const[y]] = psi_d_buffer[y];
-}
+    poffset = 1ll << control;
+    rsize = size >> 2;
+    auto getind_func = [control, targe, poffset, offset,
+                        setbit] __device__(size_t j) -> targeIndex {
+      size_t ind0 = (j >> control << (control + 1)) | (j & (poffset - 1));
+      ind0 = (ind0 >> targe << (targe + 1)) | (ind0 & (offset - 1)) | setbit;
+      size_t ind1 = ind0 + offset;
+      return {ind0, ind1};
+    };
+
+    size_t blockdim = rsize <= 1024 ? rsize : 1024;
+    size_t griddim = rsize / blockdim;
 
-template<int targe_num>
-void apply_2to4_targe_gate_gpu_const(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){
-    // uint targe_num = op.targe_num();
-    uint matlen = 1<<targe_num;
-    auto pos = op.positions();
-    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
-    vector<uint> targ_mask(matlen);
-    //create target mask
-    for (size_t m = 0; m < matlen;m++){
-        for (size_t j = 0; j < targe_num; j++){
-            if ((m>>j)&1){
-                auto mask_pos = targs[j];
-                targ_mask[m] |= 1ll<<mask_pos;
-            }
-        }
-    }
+    apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d, getind_func,
+                                                       rsize);
+  } else if (ctrl_num == 2) {
+    targe = op.positions().back();
+    offset = 1ll << targe;
+    uint psize = op.positions().size();
+    rsize = size >> psize;
 
     vector<pos_t> posv_sorted = op.positions();
-    uint psize = pos.size();
-    std::sort(posv_sorted.begin(),posv_sorted.end());
-    //Copy pos to device
-    checkCudaErrors(cudaMemcpyToSymbol(posv_d,  pos.data(), psize*sizeof(uint)));
-    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
-
-    //copy mat to const memory
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-
-    checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, matlen*matlen*sizeof(cuDoubleComplex)));
-    checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(), matlen*sizeof(uint)));
-    size_t rsize = size>>psize;
-    
-    uint max_thread_num = targe_num < 4 ? 1024 : 512;
-    size_t blockdim = rsize <= max_thread_num ? rsize : max_thread_num;
+    std::sort(posv_sorted.begin(), posv_sorted.end());
+    // Copy pos to device
+    checkCudaErrors(cudaMemcpyToSymbol(posv_d, op.positions().data(),
+                                       psize * sizeof(uint)));
+    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(),
+                                       psize * sizeof(uint)));
+
+    auto getind_func = [offset, psize] __device__(size_t j) -> targeIndex {
+      size_t ind0 = j;
+      for (pos_t k = 0; k < psize; k++) {
+        pos_t _pos = posv_sorted_d[k];
+        ind0 = (ind0 & ((1ll << _pos) - 1)) | (ind0 >> _pos << _pos << 1);
+      }
+      for (pos_t k = 0; k < psize - 1; k++) {
+        ind0 |= 1ll << posv_d[k];
+      }
+
+      size_t ind1 = ind0 + offset;
+      return {ind0, ind1};
+    };
+    size_t blockdim = rsize <= 1024 ? rsize : 1024;
     size_t griddim = rsize / blockdim;
-    apply_2to4_targe_gate_kernel<targe_num><<<griddim, blockdim>>>(psi_d, op.control_num(), psize);
+    apply_one_targe_gate_kernel<<<griddim, blockdim>>>(psi_d, getind_func,
+                                                       rsize);
+  }
 }
 
-
-
-// ------------Large target number gate---------------
-
-__global__ void apply_5_targe_gate_kernel_const(cuDoubleComplex *psi_d, uint ctrlnum, int psize, size_t size){
-    uint rsize = size>>psize;
-    uint targnum = psize-ctrlnum;
-    uint matlen = (1<<targnum);
-    uint block_length = blockDim.x;
-    size_t b = blockIdx.x; // < rsize
-    int idx = threadIdx.x;// 
-    int idy = threadIdx.y;// 
-    size_t i = b;
-    // Insert zeros
-    for(size_t k=0;k < psize;k++){
-        size_t _pos = posv_sorted_d[k];
-        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-    }
-    // Set control
-    for (size_t k=0; k < ctrlnum;k++){
-        i |= 1ll<<posv_d[k];
-    }
-
-    __syncthreads();
-    cuDoubleComplex v;
-    v = cuCmul(psi_d[ i | mat_mask_d_const[idx]], mat_d_const[idy*matlen+idx]);
-    for (int offset = block_length>>1;offset > 0;offset >>=1){
-        v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
-        v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
+template <int targe_num>
+__global__ void apply_2to4_targe_gate_kernel(cuDoubleComplex* psi_d,
+                                             uint ctrlnum, int psize) {
+  constexpr uint matlen = 1 << targe_num;
+  uint block_length = blockDim.x;
+  size_t i = blockDim.x * blockIdx.x + threadIdx.x;
+  // Insert zeros
+  for (size_t k = 0; k < psize; k++) {
+    size_t _pos = posv_sorted_d[k];
+    i = (i & ((1ll << _pos) - 1)) | (i >> _pos << _pos << 1);
+  }
+  // Set control
+  for (size_t k = 0; k < ctrlnum; k++) {
+    i |= 1ll << posv_d[k];
+  }
+
+  cuDoubleComplex psi_d_buffer[matlen];
+  for (int y = 0; y < matlen; ++y) {
+    psi_d_buffer[y] = {0., 0.};
+    for (int x = 0; x < matlen; ++x) {
+      psi_d_buffer[y] =
+          cuCadd(psi_d_buffer[y], cuCmul(psi_d[i | mat_mask_d_const[x]],
+                                         mat_d_const[y * matlen + x]));
     }
-    __syncthreads(); 
-    if (!idx) psi_d[ i | mat_mask_d_const[idy]] = v;
+  }
+  for (int y = 0; y < matlen; ++y)
+    psi_d[i | mat_mask_d_const[y]] = psi_d_buffer[y];
 }
 
-
-void apply_5_targe_gate_gpu_const(cuDoubleComplex *psi_d, QuantumOperator &op, size_t size){
-    uint targe_num = op.targe_num();
-    uint matlen = 1<<targe_num;
-    auto pos = op.positions();
-    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
-    vector<uint> targ_mask(matlen);
-    //create target mask
-    for (size_t m = 0; m < matlen;m++){
-        for (size_t j = 0; j < targe_num; j++){
-            if ((m>>j)&1){
-                auto mask_pos = targs[j];
-                targ_mask[m] |= 1ll<<mask_pos;
-            }
-        }
+template <int targe_num>
+void apply_2to4_targe_gate_gpu_const(cuDoubleComplex* psi_d,
+                                     QuantumOperator& op, size_t size) {
+  // uint targe_num = op.targe_num();
+  uint matlen = 1 << targe_num;
+  auto pos = op.positions();
+  auto targs = vector<pos_t>(pos.begin() + op.control_num(), pos.end());
+  vector<uint> targ_mask(matlen);
+  // create target mask
+  for (size_t m = 0; m < matlen; m++) {
+    for (size_t j = 0; j < targe_num; j++) {
+      if ((m >> j) & 1) {
+        auto mask_pos = targs[j];
+        targ_mask[m] |= 1ll << mask_pos;
+      }
     }
-
-    vector<pos_t> posv_sorted = op.positions();
-    uint psize = pos.size();
-    std::sort(posv_sorted.begin(),posv_sorted.end());
-    //Copy pos to device
-    checkCudaErrors(cudaMemcpyToSymbol(posv_d,  pos.data(), psize*sizeof(uint)));
-    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
-
-    //copy mat to const memory
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-
-    checkCudaErrors(cudaMemcpyToSymbol(mat_d_const, mat, matlen*matlen*sizeof(cuDoubleComplex)));
-    checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(), matlen*sizeof(uint)));
-    size_t rsize = size>>psize;
-    uint thread_num = matlen > 32 ? 32 : matlen;
-    dim3 blockdim = dim3(thread_num, thread_num);
-    apply_5_targe_gate_kernel_const<<<rsize, blockdim, thread_num*sizeof(cuDoubleComplex)>>>(psi_d, op.control_num(), psize, size);
+  }
+
+  vector<pos_t> posv_sorted = op.positions();
+  uint psize = pos.size();
+  std::sort(posv_sorted.begin(), posv_sorted.end());
+  // Copy pos to device
+  checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize * sizeof(uint)));
+  checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(),
+                                     psize * sizeof(uint)));
+
+  // copy mat to const memory
+  auto mat_temp = op.mat();
+  cuDoubleComplex* mat = reinterpret_cast<cuDoubleComplex*>(mat_temp.data());
+
+  checkCudaErrors(cudaMemcpyToSymbol(
+      mat_d_const, mat, matlen * matlen * sizeof(cuDoubleComplex)));
+  checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(),
+                                     matlen * sizeof(uint)));
+  size_t rsize = size >> psize;
+
+  uint max_thread_num = targe_num < 4 ? 1024 : 512;
+  size_t blockdim = rsize <= max_thread_num ? rsize : max_thread_num;
+  size_t griddim = rsize / blockdim;
+  apply_2to4_targe_gate_kernel<targe_num>
+      <<<griddim, blockdim>>>(psi_d, op.control_num(), psize);
 }
 
+// ------------Large target number gate---------------
 
-//For target number 6-10
-__global__ void apply_multi_targe_gate_kernel_shared(cuDoubleComplex *psi_d, uint ctrlnum, cuDoubleComplex *mat_d, uint *mat_mask_d, int psize, size_t size){
-    
-    uint rsize = size>>psize;
-    uint targnum = psize-ctrlnum;
-    uint matlen = (1<<targnum);
-    uint block_length = blockDim.x;
-    size_t b = blockIdx.x; // < rsize
-    int idx = threadIdx.x;// 
-    int idy = threadIdx.y;// 
-    size_t i = b;
-    // Insert zeros
-    for(size_t k=0;k < psize;k++){
-        size_t _pos = posv_sorted_d[k];
-        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-    }
-    // Set control
-    for (size_t k=0; k < ctrlnum;k++){
-        i |= 1ll<<posv_d[k];
-    }
-
-    __syncthreads();
-    __shared__ cuDoubleComplex local_sum[1024];
-    cuDoubleComplex v;
-    for (int y = idy; y < matlen;y+=blockDim.y){
-        local_sum[y] = {0, 0};
-        for (int x = idx; x < matlen;x+=blockDim.x){
-            v = cuCmul(psi_d[ i | mat_mask_d[x]], mat_d[y*matlen+x]);    
-            __syncthreads(); 
-            for (int offset = block_length>>1;offset > 0;offset >>=1){
-                v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
-                v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
-            }
-            __syncthreads();
-            if (!idx) local_sum[y] = cuCadd(local_sum[y], v);
-        }
-    }
+__global__ void apply_5_targe_gate_kernel_const(cuDoubleComplex* psi_d,
+                                                uint ctrlnum, int psize,
+                                                size_t size) {
+  uint rsize = size >> psize; // not used below
+  uint targnum = psize - ctrlnum;
+  uint matlen = (1 << targnum);
+  uint block_length = blockDim.x; // NOTE(review): shuffle loop assumes <= 32
+  size_t b = blockIdx.x; // block number, expanded into base index i below
+  int idx = threadIdx.x; // matrix column handled by this thread
+  int idy = threadIdx.y; // matrix row handled by this thread
+  size_t i = b;
+  // Insert a zero bit at each position listed in posv_sorted_d
+  for (size_t k = 0; k < psize; k++) {
+    size_t _pos = posv_sorted_d[k];
+    i = (i & ((1ll << _pos) - 1)) | (i >> _pos << _pos << 1);
+  }
+  // Set the control bits (the first ctrlnum entries of posv_d)
+  for (size_t k = 0; k < ctrlnum; k++) {
+    i |= 1ll << posv_d[k];
+  }
+  // v: one term of row idy; the shuffle loop sums the terms into idx == 0
+  __syncthreads();
+  cuDoubleComplex v;
+  v = cuCmul(psi_d[i | mat_mask_d_const[idx]], mat_d_const[idy * matlen + idx]);
+  for (int offset = block_length >> 1; offset > 0; offset >>= 1) {
+    v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
+    v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
+  }
+  __syncthreads(); // every read of psi_d above precedes the writes below
+  if (!idx)
+    psi_d[i | mat_mask_d_const[idy]] = v;
+}
 
-    for (int y = idy; y < matlen;y+=blockDim.y){
-    if (!idx) psi_d[ i | mat_mask_d[y]] = local_sum[y];
+void apply_5_targe_gate_gpu_const(cuDoubleComplex* psi_d, QuantumOperator& op,
+                                  size_t size) { // const-memory launcher
+  uint targe_num = op.targe_num();
+  uint matlen = 1 << targe_num;
+  auto pos = op.positions();
+  auto targs = vector<pos_t>(pos.begin() + op.control_num(), pos.end());
+  vector<uint> targ_mask(matlen);
+  // target mask: targ_mask[m] has bit targs[j] set for every set bit j of m
+  for (size_t m = 0; m < matlen; m++) {
+    for (size_t j = 0; j < targe_num; j++) {
+      if ((m >> j) & 1) {
+        auto mask_pos = targs[j];
+        targ_mask[m] |= 1ll << mask_pos;
+      }
     }
+  }
+
+  vector<pos_t> posv_sorted = op.positions();
+  uint psize = pos.size();
+  std::sort(posv_sorted.begin(), posv_sorted.end());
+  // Copy positions (original order and sorted) to the __constant__ arrays
+  checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize * sizeof(uint)));
+  checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(),
+                                     psize * sizeof(uint)));
+
+  // copy the gate matrix and the target masks to const memory
+  auto mat_temp = op.mat();
+  cuDoubleComplex* mat = reinterpret_cast<cuDoubleComplex*>(mat_temp.data());
+
+  checkCudaErrors(cudaMemcpyToSymbol(
+      mat_d_const, mat, matlen * matlen * sizeof(cuDoubleComplex)));
+  checkCudaErrors(cudaMemcpyToSymbol(mat_mask_d_const, targ_mask.data(),
+                                     matlen * sizeof(uint)));
+  size_t rsize = size >> psize; // number of blocks: size / 2^psize
+  uint thread_num = matlen > 32 ? 32 : matlen;
+  dim3 blockdim = dim3(thread_num, thread_num); // x: column, y: row
+  apply_5_targe_gate_kernel_const<<<rsize, blockdim,
+                                    thread_num * sizeof(cuDoubleComplex)>>>(
+      psi_d, op.control_num(), psize, size);
 }
 
-
-void apply_multi_targe_gate_gpu_shared(cuDoubleComplex *psi_d, QuantumOperator &op, cuDoubleComplex *mat_d, uint *mat_mask_d, size_t size){
-    uint targe_num = op.targe_num();
-    uint matlen = 1<<targe_num;
-    auto pos = op.positions();
-    uint psize = pos.size();
-    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
-    vector<uint> targ_mask(matlen);
-    //create target mask
-    for (size_t m = 0; m < matlen;m++){
-        for (size_t j = 0; j < targe_num; j++){
-            if ((m>>j)&1){
-                auto mask_pos = targs[j];
-                targ_mask[m] |= 1ll<<mask_pos;
-            }
-        }
+// For target number 6-10
+__global__ void apply_multi_targe_gate_kernel_shared(cuDoubleComplex* psi_d,
+                                                     uint ctrlnum,
+                                                     cuDoubleComplex* mat_d,
+                                                     uint* mat_mask_d,
+                                                     int psize, size_t size) {
+
+  uint rsize = size >> psize;
+  uint targnum = psize - ctrlnum;
+  uint matlen = (1 << targnum);
+  uint block_length = blockDim.x;
+  size_t b = blockIdx.x; // < rsize
+  int idx = threadIdx.x; //
+  int idy = threadIdx.y; //
+  size_t i = b;
+  // Insert zeros
+  for (size_t k = 0; k < psize; k++) {
+    size_t _pos = posv_sorted_d[k];
+    i = (i & ((1ll << _pos) - 1)) | (i >> _pos << _pos << 1);
+  }
+  // Set control
+  for (size_t k = 0; k < ctrlnum; k++) {
+    i |= 1ll << posv_d[k];
+  }
+
+  __syncthreads();
+  __shared__ cuDoubleComplex local_sum[1024];
+  cuDoubleComplex v;
+  for (int y = idy; y < matlen; y += blockDim.y) {
+    local_sum[y] = {0, 0};
+    for (int x = idx; x < matlen; x += blockDim.x) {
+      v = cuCmul(psi_d[i | mat_mask_d[x]], mat_d[y * matlen + x]);
+      __syncthreads();
+      for (int offset = block_length >> 1; offset > 0; offset >>= 1) {
+        v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
+        v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
+      }
+      __syncthreads();
+      if (!idx)
+        local_sum[y] = cuCadd(local_sum[y], v);
     }
+  }
 
-    vector<pos_t> posv_sorted = pos;
-    std::sort(posv_sorted.begin(),posv_sorted.end());
-    //Copy pos to device
-    checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint)));
-    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
-
-     
-    //copy mat to global memory
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-    checkCudaErrors(cudaMemcpy(mat_d, mat, matlen*matlen*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
-    checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(), matlen*sizeof(uint), cudaMemcpyHostToDevice));
-
-    size_t rsize = size>>psize;
-    uint thread_num = matlen > 32 ? 32 : matlen;
-    dim3 blockdim = dim3(thread_num, thread_num);
-
-    apply_multi_targe_gate_kernel_shared<<<rsize, blockdim>>>(psi_d, op.control_num(), mat_d, mat_mask_d, psize, size);
+  for (int y = idy; y < matlen; y += blockDim.y) {
+    if (!idx)
+      psi_d[i | mat_mask_d[y]] = local_sum[y];
+  }
 }
 
-//For target number > 10
-__global__ void apply_multi_targe_gate_kernel_global(cuDoubleComplex *psi_d, cuDoubleComplex *psi_d_copy, uint ctrlnum, cuDoubleComplex *mat_d, uint *mat_mask_d, int psize, size_t size){
-    uint rsize = size>>psize;
-    uint targnum = psize-ctrlnum;
-    uint matlen = (1<<targnum);
-    uint block_length = blockDim.x;
-    size_t b = blockIdx.x; // < rsize
-    int idx = threadIdx.x;// 
-    int idy = threadIdx.y;// 
-    size_t i = b;
-    // Insert zeros
-    for(size_t k=0;k < psize;k++){
-        size_t _pos = posv_sorted_d[k];
-        i = (i&((1ll<<_pos)-1)) | (i>>_pos<<_pos<<1);
-    }
-    // Set control
-    for (size_t k=0; k < ctrlnum;k++){
-        i |= 1ll<<posv_d[k];
+void apply_multi_targe_gate_gpu_shared(cuDoubleComplex* psi_d,
+                                       QuantumOperator& op,
+                                       cuDoubleComplex* mat_d, uint* mat_mask_d,
+                                       size_t size) { // kernel_shared launcher
+  uint targe_num = op.targe_num();
+  uint matlen = 1 << targe_num;
+  auto pos = op.positions();
+  uint psize = pos.size();
+  auto targs = vector<pos_t>(pos.begin() + op.control_num(), pos.end());
+  vector<uint> targ_mask(matlen);
+  // target mask: targ_mask[m] has bit targs[j] set for every set bit j of m
+  for (size_t m = 0; m < matlen; m++) {
+    for (size_t j = 0; j < targe_num; j++) {
+      if ((m >> j) & 1) {
+        auto mask_pos = targs[j];
+        targ_mask[m] |= 1ll << mask_pos;
+      }
     }
+  }
+
+  vector<pos_t> posv_sorted = pos;
+  std::sort(posv_sorted.begin(), posv_sorted.end());
+  // Copy positions (original order and sorted) to the __constant__ arrays
+  checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize * sizeof(uint)));
+  checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(),
+                                     psize * sizeof(uint)));
+
+  // copy matrix and masks into the caller's device buffers mat_d / mat_mask_d
+  auto mat_temp = op.mat();
+  cuDoubleComplex* mat = reinterpret_cast<cuDoubleComplex*>(mat_temp.data());
+  checkCudaErrors(cudaMemcpy(mat_d, mat,
+                             matlen * matlen * sizeof(cuDoubleComplex),
+                             cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(),
+                             matlen * sizeof(uint), cudaMemcpyHostToDevice));
+
+  size_t rsize = size >> psize; // number of blocks: size / 2^psize
+  uint thread_num = matlen > 32 ? 32 : matlen;
+  dim3 blockdim = dim3(thread_num, thread_num); // x: column, y: row
+  // the kernel strides rows over blockDim.y and columns over blockDim.x
+  apply_multi_targe_gate_kernel_shared<<<rsize, blockdim>>>(
+      psi_d, op.control_num(), mat_d, mat_mask_d, psize, size);
+}
 
-    __syncthreads();
-
-    cuDoubleComplex v;
-    cuDoubleComplex v_sum;
-    for (int y = idy; y < matlen;y+=blockDim.y){
-        v_sum = {0, 0};
-        for (int x = idx; x < matlen;x+=blockDim.x){
-            v = cuCmul(psi_d_copy[ i | mat_mask_d[x]], mat_d[y*matlen+x]);    
-            __syncthreads(); 
-            for (int offset = block_length>>1;offset > 0;offset >>=1){
-                v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
-                v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
-            }
-            __syncthreads();
-            if (!idx) v_sum = cuCadd(v_sum, v);
-        }
-        if (!idx) psi_d[ i | mat_mask_d[y]] = v_sum;
+// For target number > 10: reads psi_d_copy and writes the result to psi_d
+__global__ void apply_multi_targe_gate_kernel_global(
+    cuDoubleComplex* psi_d, cuDoubleComplex* psi_d_copy, uint ctrlnum,
+    cuDoubleComplex* mat_d, uint* mat_mask_d, int psize, size_t size) {
+  uint rsize = size >> psize; // not used below
+  uint targnum = psize - ctrlnum;
+  uint matlen = (1 << targnum);
+  uint block_length = blockDim.x; // NOTE(review): shuffle loop assumes <= 32
+  size_t b = blockIdx.x; // block number, expanded into base index i below
+  int idx = threadIdx.x; // first matrix column of this thread
+  int idy = threadIdx.y; // first matrix row of this thread
+  size_t i = b;
+  // Insert a zero bit at each position listed in posv_sorted_d
+  for (size_t k = 0; k < psize; k++) {
+    size_t _pos = posv_sorted_d[k];
+    i = (i & ((1ll << _pos) - 1)) | (i >> _pos << _pos << 1);
+  }
+  // Set the control bits (the first ctrlnum entries of posv_d)
+  for (size_t k = 0; k < ctrlnum; k++) {
+    i |= 1ll << posv_d[k];
+  }
+
+  __syncthreads();
+  // Row y times psi_d_copy; only idx == 0 accumulates and stores the row sum
+  cuDoubleComplex v;
+  cuDoubleComplex v_sum;
+  for (int y = idy; y < matlen; y += blockDim.y) {
+    v_sum = {0, 0};
+    for (int x = idx; x < matlen; x += blockDim.x) {
+      v = cuCmul(psi_d_copy[i | mat_mask_d[x]], mat_d[y * matlen + x]);
+      __syncthreads();
+      for (int offset = block_length >> 1; offset > 0; offset >>= 1) {
+        v.x += __shfl_down_sync(0xFFFFFFFF, v.x, offset);
+        v.y += __shfl_down_sync(0xFFFFFFFF, v.y, offset);
+      }
+      __syncthreads();
+      if (!idx)
+        v_sum = cuCadd(v_sum, v);
     }
+    if (!idx) // presumably psi_d_copy exists so this write cannot alter reads
+      psi_d[i | mat_mask_d[y]] = v_sum;
+  }
 }
 
-void apply_multi_targe_gate_gpu_global(cuDoubleComplex *psi_d, cuDoubleComplex *psi_d_copy, QuantumOperator &op, cuDoubleComplex *mat_d, uint *mat_mask_d, size_t size){
-    uint targe_num = op.targe_num();
-    uint matlen = 1<<targe_num;
-    auto pos = op.positions();
-    uint psize = pos.size();
-    auto targs = vector<pos_t>(pos.begin()+op.control_num(), pos.end());
-    vector<uint> targ_mask(matlen);
-    //create target mask
-    for (size_t m = 0; m < matlen;m++){
-        for (size_t j = 0; j < targe_num; j++){
-            if ((m>>j)&1){
-                auto mask_pos = targs[j];
-                targ_mask[m] |= 1ll<<mask_pos;
-            }
-        }
+void apply_multi_targe_gate_gpu_global(cuDoubleComplex* psi_d,
+                                       cuDoubleComplex* psi_d_copy,
+                                       QuantumOperator& op,
+                                       cuDoubleComplex* mat_d, uint* mat_mask_d,
+                                       size_t size) {
+  uint targe_num = op.targe_num();
+  uint matlen = 1 << targe_num;
+  auto pos = op.positions();
+  uint psize = pos.size();
+  auto targs = vector<pos_t>(pos.begin() + op.control_num(), pos.end());
+  vector<uint> targ_mask(matlen);
+  // create target mask
+  for (size_t m = 0; m < matlen; m++) {
+    for (size_t j = 0; j < targe_num; j++) {
+      if ((m >> j) & 1) {
+        auto mask_pos = targs[j];
+        targ_mask[m] |= 1ll << mask_pos;
+      }
     }
-
-    vector<pos_t> posv_sorted = pos;
-    std::sort(posv_sorted.begin(),posv_sorted.end());
-    //Copy pos to device
-    checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize*sizeof(uint)));
-    checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(), psize*sizeof(uint)));
-
-     
-    //copy mat to global memory
-    auto mat_temp = op.mat();
-    cuDoubleComplex *mat = reinterpret_cast<cuDoubleComplex *>(mat_temp.data());
-    checkCudaErrors(cudaMemcpy(mat_d, mat, matlen*matlen*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
-    checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(), matlen*sizeof(uint), cudaMemcpyHostToDevice));
-
-    size_t rsize = size>>psize;
-    uint thread_num = matlen > 32 ? 32 : matlen;
-    dim3 blockdim = dim3(thread_num, thread_num);
-
-    apply_multi_targe_gate_kernel_global<<<rsize, blockdim>>>(psi_d, psi_d_copy, op.control_num(), mat_d, mat_mask_d, psize, size);
-    checkCudaErrors(cudaMemcpy(psi_d_copy, psi_d, size*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice));
-}
\ No newline at end of file
+  }
+
+  vector<pos_t> posv_sorted = pos;
+  std::sort(posv_sorted.begin(), posv_sorted.end());
+  // Copy pos to device
+  checkCudaErrors(cudaMemcpyToSymbol(posv_d, pos.data(), psize * sizeof(uint)));
+  checkCudaErrors(cudaMemcpyToSymbol(posv_sorted_d, posv_sorted.data(),
+                                     psize * sizeof(uint)));
+
+  // copy mat to global memory
+  auto mat_temp = op.mat();
+  cuDoubleComplex* mat = reinterpret_cast<cuDoubleComplex*>(mat_temp.data());
+  checkCudaErrors(cudaMemcpy(mat_d, mat,
+                             matlen * matlen * sizeof(cuDoubleComplex),
+                             cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMemcpy(mat_mask_d, targ_mask.data(),
+                             matlen * sizeof(uint), cudaMemcpyHostToDevice));
+
+  size_t rsize = size >> psize;
+  uint thread_num = matlen > 32 ? 32 : matlen;
+  dim3 blockdim = dim3(thread_num, thread_num);
+
+  apply_multi_targe_gate_kernel_global<<<rsize, blockdim>>>(
+      psi_d, psi_d_copy, op.control_num(), mat_d, mat_mask_d, psize, size);
+  checkCudaErrors(cudaMemcpy(psi_d_copy, psi_d, size * sizeof(cuDoubleComplex),
+                             cudaMemcpyDeviceToDevice));
+}
diff --git a/src/qfvm_gpu/cuda_simulator.cuh b/src/qfvm_gpu/cuda_simulator.cuh
index 99ee5e4..d66424a 100644
--- a/src/qfvm_gpu/cuda_simulator.cuh
+++ b/src/qfvm_gpu/cuda_simulator.cuh
@@ -1,91 +1,87 @@
 #pragma once
+#include "apply_gate_gpu.cuh"
 #include "cuda_statevector.cuh"
 #include <circuit.hpp>
-#include <types.hpp>
 #include <statevector.hpp>
 #include <ticktock.h>
-#include "apply_gate_gpu.cuh"
+#include <types.hpp>
 
-void simulate_gpu(Circuit & circuit, CudaStateVector & psi_d){
-    size_t size = psi_d.size();
-    //initialize mat
-    cuDoubleComplex *mat_d;
-    uint *mat_mask_d;
-    CudaStateVector psi_d_copy{};
-    if (circuit.max_targe_num() > 5)
-    {
-        uint max_matlen = 1<<circuit.max_targe_num();
-        checkCudaErrors(cudaMalloc(&mat_d, (max_matlen*max_matlen)*sizeof(cuDoubleComplex)));
-        checkCudaErrors(cudaMalloc(&mat_mask_d, max_matlen*sizeof(cuDoubleComplex)));
-        if (circuit.max_targe_num() > 10){
-            psi_d_copy = psi_d;
-        }
+void simulate_gpu(Circuit& circuit, CudaStateVector& psi_d) {
+  size_t size = psi_d.size();  // forwarded to every apply_* call below
+  // Scratch device buffers; allocated only when max_targe_num() > 5
+  cuDoubleComplex* mat_d;  // stays uninitialized otherwise
+  uint* mat_mask_d;        // stays uninitialized otherwise
+  CudaStateVector psi_d_copy{};
+  if (circuit.max_targe_num() > 5) {
+    uint max_matlen = 1 << circuit.max_targe_num();  // 2^max_targe_num
+    checkCudaErrors(cudaMalloc(&mat_d, (max_matlen * max_matlen) *
+                                           sizeof(cuDoubleComplex)));
+    checkCudaErrors(  // NOTE(review): uint* sized with cuDoubleComplex
+        cudaMalloc(&mat_mask_d, max_matlen * sizeof(cuDoubleComplex)));
+    if (circuit.max_targe_num() > 10) {
+      psi_d_copy = psi_d;  // device-to-device duplicate for the global path
     }
+  }
 
-    //apply_gate
-    for (auto gate : circuit.gates()){
-        uint targnum = gate.targe_num();
-        uint ctrlnum = gate.control_num();
+  // Dispatch every gate to an apply_* wrapper chosen by its target count
+  for (auto gate : circuit.gates()) {  // each gate is copied by value
+    uint targnum = gate.targe_num();
+    uint ctrlnum = gate.control_num();
 
-        if (targnum  == 1){
-            if (ctrlnum == 0){ 
-                apply_one_targe_gate_gpu<0>(psi_d.data(), gate, size);
-                }
-            else if (ctrlnum == 1){
-                apply_one_targe_gate_gpu<1>(psi_d.data(), gate, size);
-                }
-            else{
-                 apply_one_targe_gate_gpu<2>(psi_d.data(), gate, size);
-                }
-        }
-        else if(targnum > 1){
-            if (targnum == 2){
-                apply_2to4_targe_gate_gpu_const<2>(psi_d.data(), gate, size);
-            }
-            else if(targnum == 3){
-                apply_2to4_targe_gate_gpu_const<3>(psi_d.data(), gate, size);
-            }
-            else if(targnum == 4){
-                apply_2to4_targe_gate_gpu_const<4>(psi_d.data(), gate, size);
-            }
-            else if (targnum == 5){
-                apply_5_targe_gate_gpu_const(psi_d.data(), gate, size);
-            }else if (targnum > 5 && targnum <= 10){
-                apply_multi_targe_gate_gpu_shared(psi_d.data(), gate, mat_d, mat_mask_d, size);
-            }
-            else{
-                apply_multi_targe_gate_gpu_global(psi_d.data(), psi_d_copy.data(), gate, mat_d, mat_mask_d, size);
-            }
-        }   
-        else{
-            throw "Invalid target number";
-        }
+    if (targnum == 1) {  // template argument follows ctrlnum: 0, 1, else 2
+      if (ctrlnum == 0) {
+        apply_one_targe_gate_gpu<0>(psi_d.data(), gate, size);
+      } else if (ctrlnum == 1) {
+        apply_one_targe_gate_gpu<1>(psi_d.data(), gate, size);
+      } else {
+        apply_one_targe_gate_gpu<2>(psi_d.data(), gate, size);
+      }
+    } else if (targnum > 1) {
+      if (targnum == 2) {  // 2..4 targets share one templated wrapper
+        apply_2to4_targe_gate_gpu_const<2>(psi_d.data(), gate, size);
+      } else if (targnum == 3) {
+        apply_2to4_targe_gate_gpu_const<3>(psi_d.data(), gate, size);
+      } else if (targnum == 4) {
+        apply_2to4_targe_gate_gpu_const<4>(psi_d.data(), gate, size);
+      } else if (targnum == 5) {
+        apply_5_targe_gate_gpu_const(psi_d.data(), gate, size);
+      } else if (targnum > 5 && targnum <= 10) {  // uses the scratch buffers
+        apply_multi_targe_gate_gpu_shared(psi_d.data(), gate, mat_d, mat_mask_d,
+                                          size);
+      } else {  // more than 10 targets: also needs the duplicate state
+        apply_multi_targe_gate_gpu_global(psi_d.data(), psi_d_copy.data(), gate,
+                                          mat_d, mat_mask_d, size);
+      }
+    } else {
+      throw "Invalid target number";  // targnum == 0; thrown as const char*
     }
+  }
 
-    //free source
-    if (circuit.max_targe_num() > 5){
-        checkCudaErrors(cudaFree(mat_d));
-        checkCudaErrors(cudaFree(mat_mask_d));
-    }
+  // Release the scratch buffers under the same condition they were allocated
+  if (circuit.max_targe_num() > 5) {
+    checkCudaErrors(cudaFree(mat_d));
+    checkCudaErrors(cudaFree(mat_mask_d));
+  }
 }
 
-void simulate_gpu(Circuit & circuit, StateVector<data_t> & state){
-    //initialize psi
-    state.set_num(circuit.qubit_num());
-    size_t size = state.size();
-    CudaStateVector psi_d(state);
-    
-    simulate_gpu(circuit, psi_d);
-    cudaDeviceSynchronize();
+void simulate_gpu(Circuit& circuit, StateVector<data_t>& state) {
+  // Configure the host state from the circuit's qubit count, then upload it
+  state.set_num(circuit.qubit_num());
+  size_t size = state.size();
+  CudaStateVector psi_d(state);
+
+  simulate_gpu(circuit, psi_d);
+  checkCudaErrors(cudaDeviceSynchronize());  // report failures, don't drop them
 
-    //copy back
-    complex<double>* psi = reinterpret_cast<complex<double>*>(psi_d.data());
-    checkCudaErrors(cudaMemcpy(state.data(), psi, size*sizeof(complex<double>), cudaMemcpyDeviceToHost));
-    psi=nullptr;
-}   
+  // Copy the result back into the caller's host buffer. cudaMemcpy takes a
+  // void* source, so the device pointer needs neither a cast nor an alias.
+  checkCudaErrors(cudaMemcpy(state.data(), psi_d.data(),
+                             size * sizeof(complex<double>),
+                             cudaMemcpyDeviceToHost));
+}
 
-StateVector<double> simulate_gpu(Circuit & circuit){
-    StateVector<double> state(circuit.qubit_num());
-    simulate_gpu(circuit, state);
-    return std::move(state);
-}
\ No newline at end of file
+StateVector<double> simulate_gpu(Circuit& circuit) {
+  StateVector<double> state(circuit.qubit_num());  // fresh host-side state
+  simulate_gpu(circuit, state);  // the overload above fills `state` in place
+  return state;  // no std::move: a plain return allows NRVO / implicit move
+}
diff --git a/src/qfvm_gpu/cuda_statevector.cuh b/src/qfvm_gpu/cuda_statevector.cuh
index 19e8cb4..ae088f6 100644
--- a/src/qfvm_gpu/cuda_statevector.cuh
+++ b/src/qfvm_gpu/cuda_statevector.cuh
@@ -1,58 +1,51 @@
 
 #pragma once
+#include <cuComplex.h>
 #include <cuda_runtime.h>
 #include <helper_cuda.h>
-#include <cuComplex.h>
 #include <statevector.hpp>
 
-class CudaStateVector
-{
-    protected:
-        uint num_;
-        size_t size_;
-        cuDoubleComplex *data_;
-
-    public:
-        //construct function
-        CudaStateVector(){checkCudaErrors(cudaMalloc(&data_, 0));}
-        CudaStateVector(CudaStateVector const &other);
-       
-        explicit CudaStateVector(StateVector<double> &sv);
-        ~CudaStateVector() {
-            checkCudaErrors(cudaFree(data_));
-        }
-
-        CudaStateVector &operator=(CudaStateVector const &other)
-        {   
-            num_ = other.num();
-            size_  = other.size();
-            checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex)));
-            checkCudaErrors(cudaMemcpy(data_, other.data(), size_*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice));
-            return *this;
-        }
-
-        cuDoubleComplex* data() const { return data_;}
-        size_t size() const { return size_; }
-        uint num() const { return num_; }
+class CudaStateVector {
+protected:
+  uint num_;               // copied from the source vector's num()
+  size_t size_;            // number of cuDoubleComplex elements in data_
+  cuDoubleComplex* data_;  // owning device pointer; freed by the destructor
+
+public:
+  // Empty vector: owns no buffer (cudaFree on a null pointer is a no-op)
+  CudaStateVector() : num_(0), size_(0), data_(nullptr) {}
+  CudaStateVector(CudaStateVector const& other);
+
+  explicit CudaStateVector(StateVector<double>& sv);
+  ~CudaStateVector() { checkCudaErrors(cudaFree(data_)); }
+  // Deep-copy assignment; releases the current buffer first so it cannot leak
+  CudaStateVector& operator=(CudaStateVector const& other) {
+    if (this == &other) return *this;  // self-copy would free the source
+    checkCudaErrors(cudaFree(data_));
+    num_ = other.num();
+    size_ = other.size();
+    checkCudaErrors(cudaMalloc(&data_, size_ * sizeof(cuDoubleComplex)));
+    checkCudaErrors(cudaMemcpy(data_, other.data(), size_ * sizeof(*data_),
+                               cudaMemcpyDeviceToDevice));
+    return *this;
+  }
+  cuDoubleComplex* data() const { return data_; }  // device pointer
+  size_t size() const { return size_; }
+  uint num() const { return num_; }
 };
 
-
-CudaStateVector::CudaStateVector(StateVector<double> &sv)
-:
-num_(sv.num()),
-size_(sv.size())
-{   
-    cuDoubleComplex *psi_h = reinterpret_cast<cuDoubleComplex*>(sv.data());
-    checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex)));
-    checkCudaErrors(cudaMemcpy(data_, psi_h, size_*sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
+CudaStateVector::CudaStateVector(StateVector<double>& sv)
+    : num_(sv.num()), size_(sv.size()) {
+  // Upload the host amplitudes into a freshly allocated device buffer
+  const size_t bytes = size_ * sizeof(cuDoubleComplex);
+  checkCudaErrors(cudaMalloc(&data_, bytes));
+  checkCudaErrors(cudaMemcpy(data_, sv.data(), bytes, cudaMemcpyHostToDevice));
 }
 
-CudaStateVector::CudaStateVector(CudaStateVector const &other)
-:
-num_(other.num()), 
-size_(other.size())
-{
-    checkCudaErrors(cudaMalloc(&data_, size_*sizeof(cuDoubleComplex)));
-    checkCudaErrors(cudaMemcpy(data_, other.data(), size_*sizeof(cuDoubleComplex), cudaMemcpyDeviceToDevice));
+CudaStateVector::CudaStateVector(CudaStateVector const& other)
+    : num_(other.num()), size_(other.size()) {
+  const size_t bytes = size_ * sizeof(cuDoubleComplex);  // deep-copy size
+  checkCudaErrors(cudaMalloc(&data_, bytes));
+  checkCudaErrors(
+      cudaMemcpy(data_, other.data(), bytes, cudaMemcpyDeviceToDevice));
 }
-
diff --git a/src/qfvm_gpu/cuda_utils/CudaTexture.h b/src/qfvm_gpu/cuda_utils/CudaTexture.h
index 0661d0d..cf21fae 100644
--- a/src/qfvm_gpu/cuda_utils/CudaTexture.h
+++ b/src/qfvm_gpu/cuda_utils/CudaTexture.h
@@ -1,39 +1,32 @@
 #pragma once
 
-
 #include "helper_cuda.h"
 #include <cuda_runtime.h>
 
-
 struct CudaTexture {
-    cudaTextureObject_t tex;
+  cudaTextureObject_t tex;  // handle created by the ctor, destroyed by the dtor
 
-    CudaTexture(CudaTexture const &) = delete;
-    CudaTexture(CudaTexture &&) = default;
-    CudaTexture &operator=(CudaTexture const &) = delete;
-    CudaTexture &operator=(CudaTexture &&) = default;
+  CudaTexture(CudaTexture const&) = delete;
+  CudaTexture(CudaTexture&&) = default;
+  CudaTexture& operator=(CudaTexture const&) = delete;
+  CudaTexture& operator=(CudaTexture&&) = default;
 
-    template <class T>
-    CudaTexture(T *dataDev, int width, int height) {
-        cudaTextureObject_t tex;
-        cudaResourceDesc resDesc;
-        memset(&resDesc, 0, sizeof(resDesc));
-        resDesc.resType = cudaResourceTypePitch2D;
-        resDesc.res.pitch2D.devPtr = dataDev;
-        resDesc.res.pitch2D.width = width;
-        resDesc.res.pitch2D.height = height;
-        resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
-        resDesc.res.pitch2D.pitchInBytes = width * sizeof(T);
-        cudaTextureDesc texDesc;
-        memset(&texDesc, 0, sizeof(texDesc));
-        checkCudaErrors(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
-    }
-
-    ~CudaTexture() {
-        checkCudaErrors(cudaDestroyTextureObject(tex));
-    }
-
-    constexpr operator cudaTextureObject_t() const {
-        return tex;
-    }
+  template <class T> CudaTexture(T* dataDev, int width, int height) {
+    // Fill the member `tex` directly; a local of that name would shadow it
+    cudaResourceDesc resDesc;
+    memset(&resDesc, 0, sizeof(resDesc));
+    resDesc.resType = cudaResourceTypePitch2D;
+    resDesc.res.pitch2D.devPtr = dataDev;
+    resDesc.res.pitch2D.width = width;
+    resDesc.res.pitch2D.height = height;
+    resDesc.res.pitch2D.desc = cudaCreateChannelDesc<T>();
+    resDesc.res.pitch2D.pitchInBytes = width * sizeof(T);  // no row padding
+    cudaTextureDesc texDesc;
+    memset(&texDesc, 0, sizeof(texDesc));  // all-zero texture description
+    checkCudaErrors(cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL));
+  }
+  // NOTE(review): defaulted moves copy `tex`, so both objects destroy it
+  ~CudaTexture() { checkCudaErrors(cudaDestroyTextureObject(tex)); }
+
+  constexpr operator cudaTextureObject_t() const { return tex; }
 };
diff --git a/src/qfvm_gpu/cuda_utils/helper_cuda.h b/src/qfvm_gpu/cuda_utils/helper_cuda.h
index 516dd57..18c16b2 100644
--- a/src/qfvm_gpu/cuda_utils/helper_cuda.h
+++ b/src/qfvm_gpu/cuda_utils/helper_cuda.h
@@ -34,16 +34,16 @@
 
 // CUDA Runtime error messages
 #ifdef __DRIVER_TYPES_H__
-static const char *_cudaGetErrorEnum(cudaError_t error) {
+static const char* _cudaGetErrorEnum(cudaError_t error) {  // runtime overload
   return cudaGetErrorName(error);
 }
 #endif
 
 #ifdef CUDA_DRIVER_API
 // CUDA Driver API errors
-static const char *_cudaGetErrorEnum(CUresult error) {
+static const char* _cudaGetErrorEnum(CUresult error) {  // driver overload
   static char unknown[] = "<unknown>";
-  const char *ret = NULL;
+  const char* ret = NULL;  // if still NULL after the lookup, use `unknown`
   cuGetErrorName(error, &ret);
   return ret ? ret : unknown;
 }
@@ -51,37 +51,37 @@ static const char *_cudaGetErrorEnum(CUresult error) {
 
 #ifdef CUBLAS_API_H_
 // cuBLAS API errors
-static const char *_cudaGetErrorEnum(cublasStatus_t error) {
+static const char* _cudaGetErrorEnum(cublasStatus_t error) {
   switch (error) {
-    case CUBLAS_STATUS_SUCCESS:
-      return "CUBLAS_STATUS_SUCCESS";
+  case CUBLAS_STATUS_SUCCESS:
+    return "CUBLAS_STATUS_SUCCESS";
 
-    case CUBLAS_STATUS_NOT_INITIALIZED:
-      return "CUBLAS_STATUS_NOT_INITIALIZED";
+  case CUBLAS_STATUS_NOT_INITIALIZED:
+    return "CUBLAS_STATUS_NOT_INITIALIZED";
 
-    case CUBLAS_STATUS_ALLOC_FAILED:
-      return "CUBLAS_STATUS_ALLOC_FAILED";
+  case CUBLAS_STATUS_ALLOC_FAILED:
+    return "CUBLAS_STATUS_ALLOC_FAILED";
 
-    case CUBLAS_STATUS_INVALID_VALUE:
-      return "CUBLAS_STATUS_INVALID_VALUE";
+  case CUBLAS_STATUS_INVALID_VALUE:
+    return "CUBLAS_STATUS_INVALID_VALUE";
 
-    case CUBLAS_STATUS_ARCH_MISMATCH:
-      return "CUBLAS_STATUS_ARCH_MISMATCH";
+  case CUBLAS_STATUS_ARCH_MISMATCH:
+    return "CUBLAS_STATUS_ARCH_MISMATCH";
 
-    case CUBLAS_STATUS_MAPPING_ERROR:
-      return "CUBLAS_STATUS_MAPPING_ERROR";
+  case CUBLAS_STATUS_MAPPING_ERROR:
+    return "CUBLAS_STATUS_MAPPING_ERROR";
 
-    case CUBLAS_STATUS_EXECUTION_FAILED:
-      return "CUBLAS_STATUS_EXECUTION_FAILED";
+  case CUBLAS_STATUS_EXECUTION_FAILED:
+    return "CUBLAS_STATUS_EXECUTION_FAILED";
 
-    case CUBLAS_STATUS_INTERNAL_ERROR:
-      return "CUBLAS_STATUS_INTERNAL_ERROR";
+  case CUBLAS_STATUS_INTERNAL_ERROR:
+    return "CUBLAS_STATUS_INTERNAL_ERROR";
 
-    case CUBLAS_STATUS_NOT_SUPPORTED:
-      return "CUBLAS_STATUS_NOT_SUPPORTED";
+  case CUBLAS_STATUS_NOT_SUPPORTED:
+    return "CUBLAS_STATUS_NOT_SUPPORTED";
 
-    case CUBLAS_STATUS_LICENSE_ERROR:
-      return "CUBLAS_STATUS_LICENSE_ERROR";
+  case CUBLAS_STATUS_LICENSE_ERROR:
+    return "CUBLAS_STATUS_LICENSE_ERROR";
   }
 
   return "<unknown>";
@@ -90,58 +90,58 @@ static const char *_cudaGetErrorEnum(cublasStatus_t error) {
 
 #ifdef _CUFFT_H_
 // cuFFT API errors
-static const char *_cudaGetErrorEnum(cufftResult error) {
+static const char* _cudaGetErrorEnum(cufftResult error) {
   switch (error) {
-    case CUFFT_SUCCESS:
-      return "CUFFT_SUCCESS";
+  case CUFFT_SUCCESS:
+    return "CUFFT_SUCCESS";
 
-    case CUFFT_INVALID_PLAN:
-      return "CUFFT_INVALID_PLAN";
+  case CUFFT_INVALID_PLAN:
+    return "CUFFT_INVALID_PLAN";
 
-    case CUFFT_ALLOC_FAILED:
-      return "CUFFT_ALLOC_FAILED";
+  case CUFFT_ALLOC_FAILED:
+    return "CUFFT_ALLOC_FAILED";
 
-    case CUFFT_INVALID_TYPE:
-      return "CUFFT_INVALID_TYPE";
+  case CUFFT_INVALID_TYPE:
+    return "CUFFT_INVALID_TYPE";
 
-    case CUFFT_INVALID_VALUE:
-      return "CUFFT_INVALID_VALUE";
+  case CUFFT_INVALID_VALUE:
+    return "CUFFT_INVALID_VALUE";
 
-    case CUFFT_INTERNAL_ERROR:
-      return "CUFFT_INTERNAL_ERROR";
+  case CUFFT_INTERNAL_ERROR:
+    return "CUFFT_INTERNAL_ERROR";
 
-    case CUFFT_EXEC_FAILED:
-      return "CUFFT_EXEC_FAILED";
+  case CUFFT_EXEC_FAILED:
+    return "CUFFT_EXEC_FAILED";
 
-    case CUFFT_SETUP_FAILED:
-      return "CUFFT_SETUP_FAILED";
+  case CUFFT_SETUP_FAILED:
+    return "CUFFT_SETUP_FAILED";
 
-    case CUFFT_INVALID_SIZE:
-      return "CUFFT_INVALID_SIZE";
+  case CUFFT_INVALID_SIZE:
+    return "CUFFT_INVALID_SIZE";
 
-    case CUFFT_UNALIGNED_DATA:
-      return "CUFFT_UNALIGNED_DATA";
+  case CUFFT_UNALIGNED_DATA:
+    return "CUFFT_UNALIGNED_DATA";
 
-    case CUFFT_INCOMPLETE_PARAMETER_LIST:
-      return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+  case CUFFT_INCOMPLETE_PARAMETER_LIST:
+    return "CUFFT_INCOMPLETE_PARAMETER_LIST";
 
-    case CUFFT_INVALID_DEVICE:
-      return "CUFFT_INVALID_DEVICE";
+  case CUFFT_INVALID_DEVICE:
+    return "CUFFT_INVALID_DEVICE";
 
-    case CUFFT_PARSE_ERROR:
-      return "CUFFT_PARSE_ERROR";
+  case CUFFT_PARSE_ERROR:
+    return "CUFFT_PARSE_ERROR";
 
-    case CUFFT_NO_WORKSPACE:
-      return "CUFFT_NO_WORKSPACE";
+  case CUFFT_NO_WORKSPACE:
+    return "CUFFT_NO_WORKSPACE";
 
-    case CUFFT_NOT_IMPLEMENTED:
-      return "CUFFT_NOT_IMPLEMENTED";
+  case CUFFT_NOT_IMPLEMENTED:
+    return "CUFFT_NOT_IMPLEMENTED";
 
-    case CUFFT_LICENSE_ERROR:
-      return "CUFFT_LICENSE_ERROR";
+  case CUFFT_LICENSE_ERROR:
+    return "CUFFT_LICENSE_ERROR";
 
-    case CUFFT_NOT_SUPPORTED:
-      return "CUFFT_NOT_SUPPORTED";
+  case CUFFT_NOT_SUPPORTED:
+    return "CUFFT_NOT_SUPPORTED";
   }
 
   return "<unknown>";
@@ -150,34 +150,34 @@ static const char *_cudaGetErrorEnum(cufftResult error) {
 
 #ifdef CUSPARSEAPI
 // cuSPARSE API errors
-static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
+static const char* _cudaGetErrorEnum(cusparseStatus_t error) {
   switch (error) {
-    case CUSPARSE_STATUS_SUCCESS:
-      return "CUSPARSE_STATUS_SUCCESS";
+  case CUSPARSE_STATUS_SUCCESS:
+    return "CUSPARSE_STATUS_SUCCESS";
 
-    case CUSPARSE_STATUS_NOT_INITIALIZED:
-      return "CUSPARSE_STATUS_NOT_INITIALIZED";
+  case CUSPARSE_STATUS_NOT_INITIALIZED:
+    return "CUSPARSE_STATUS_NOT_INITIALIZED";
 
-    case CUSPARSE_STATUS_ALLOC_FAILED:
-      return "CUSPARSE_STATUS_ALLOC_FAILED";
+  case CUSPARSE_STATUS_ALLOC_FAILED:
+    return "CUSPARSE_STATUS_ALLOC_FAILED";
 
-    case CUSPARSE_STATUS_INVALID_VALUE:
-      return "CUSPARSE_STATUS_INVALID_VALUE";
+  case CUSPARSE_STATUS_INVALID_VALUE:
+    return "CUSPARSE_STATUS_INVALID_VALUE";
 
-    case CUSPARSE_STATUS_ARCH_MISMATCH:
-      return "CUSPARSE_STATUS_ARCH_MISMATCH";
+  case CUSPARSE_STATUS_ARCH_MISMATCH:
+    return "CUSPARSE_STATUS_ARCH_MISMATCH";
 
-    case CUSPARSE_STATUS_MAPPING_ERROR:
-      return "CUSPARSE_STATUS_MAPPING_ERROR";
+  case CUSPARSE_STATUS_MAPPING_ERROR:
+    return "CUSPARSE_STATUS_MAPPING_ERROR";
 
-    case CUSPARSE_STATUS_EXECUTION_FAILED:
-      return "CUSPARSE_STATUS_EXECUTION_FAILED";
+  case CUSPARSE_STATUS_EXECUTION_FAILED:
+    return "CUSPARSE_STATUS_EXECUTION_FAILED";
 
-    case CUSPARSE_STATUS_INTERNAL_ERROR:
-      return "CUSPARSE_STATUS_INTERNAL_ERROR";
+  case CUSPARSE_STATUS_INTERNAL_ERROR:
+    return "CUSPARSE_STATUS_INTERNAL_ERROR";
 
-    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
-      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+    return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
   }
 
   return "<unknown>";
@@ -186,32 +186,32 @@ static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
 
 #ifdef CUSOLVER_COMMON_H_
 // cuSOLVER API errors
-static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
+static const char* _cudaGetErrorEnum(cusolverStatus_t error) {
   switch (error) {
-    case CUSOLVER_STATUS_SUCCESS:
-      return "CUSOLVER_STATUS_SUCCESS";
-    case CUSOLVER_STATUS_NOT_INITIALIZED:
-      return "CUSOLVER_STATUS_NOT_INITIALIZED";
-    case CUSOLVER_STATUS_ALLOC_FAILED:
-      return "CUSOLVER_STATUS_ALLOC_FAILED";
-    case CUSOLVER_STATUS_INVALID_VALUE:
-      return "CUSOLVER_STATUS_INVALID_VALUE";
-    case CUSOLVER_STATUS_ARCH_MISMATCH:
-      return "CUSOLVER_STATUS_ARCH_MISMATCH";
-    case CUSOLVER_STATUS_MAPPING_ERROR:
-      return "CUSOLVER_STATUS_MAPPING_ERROR";
-    case CUSOLVER_STATUS_EXECUTION_FAILED:
-      return "CUSOLVER_STATUS_EXECUTION_FAILED";
-    case CUSOLVER_STATUS_INTERNAL_ERROR:
-      return "CUSOLVER_STATUS_INTERNAL_ERROR";
-    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
-      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
-    case CUSOLVER_STATUS_NOT_SUPPORTED:
-      return "CUSOLVER_STATUS_NOT_SUPPORTED ";
-    case CUSOLVER_STATUS_ZERO_PIVOT:
-      return "CUSOLVER_STATUS_ZERO_PIVOT";
-    case CUSOLVER_STATUS_INVALID_LICENSE:
-      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  case CUSOLVER_STATUS_SUCCESS:
+    return "CUSOLVER_STATUS_SUCCESS";
+  case CUSOLVER_STATUS_NOT_INITIALIZED:
+    return "CUSOLVER_STATUS_NOT_INITIALIZED";
+  case CUSOLVER_STATUS_ALLOC_FAILED:
+    return "CUSOLVER_STATUS_ALLOC_FAILED";
+  case CUSOLVER_STATUS_INVALID_VALUE:
+    return "CUSOLVER_STATUS_INVALID_VALUE";
+  case CUSOLVER_STATUS_ARCH_MISMATCH:
+    return "CUSOLVER_STATUS_ARCH_MISMATCH";
+  case CUSOLVER_STATUS_MAPPING_ERROR:
+    return "CUSOLVER_STATUS_MAPPING_ERROR";
+  case CUSOLVER_STATUS_EXECUTION_FAILED:
+    return "CUSOLVER_STATUS_EXECUTION_FAILED";
+  case CUSOLVER_STATUS_INTERNAL_ERROR:
+    return "CUSOLVER_STATUS_INTERNAL_ERROR";
+  case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+    return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  case CUSOLVER_STATUS_NOT_SUPPORTED:
+    return "CUSOLVER_STATUS_NOT_SUPPORTED ";
+  case CUSOLVER_STATUS_ZERO_PIVOT:
+    return "CUSOLVER_STATUS_ZERO_PIVOT";
+  case CUSOLVER_STATUS_INVALID_LICENSE:
+    return "CUSOLVER_STATUS_INVALID_LICENSE";
   }
 
   return "<unknown>";
@@ -220,46 +220,46 @@ static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
 
 #ifdef CURAND_H_
 // cuRAND API errors
-static const char *_cudaGetErrorEnum(curandStatus_t error) {
+static const char* _cudaGetErrorEnum(curandStatus_t error) {
   switch (error) {
-    case CURAND_STATUS_SUCCESS:
-      return "CURAND_STATUS_SUCCESS";
+  case CURAND_STATUS_SUCCESS:
+    return "CURAND_STATUS_SUCCESS";
 
-    case CURAND_STATUS_VERSION_MISMATCH:
-      return "CURAND_STATUS_VERSION_MISMATCH";
+  case CURAND_STATUS_VERSION_MISMATCH:
+    return "CURAND_STATUS_VERSION_MISMATCH";
 
-    case CURAND_STATUS_NOT_INITIALIZED:
-      return "CURAND_STATUS_NOT_INITIALIZED";
+  case CURAND_STATUS_NOT_INITIALIZED:
+    return "CURAND_STATUS_NOT_INITIALIZED";
 
-    case CURAND_STATUS_ALLOCATION_FAILED:
-      return "CURAND_STATUS_ALLOCATION_FAILED";
+  case CURAND_STATUS_ALLOCATION_FAILED:
+    return "CURAND_STATUS_ALLOCATION_FAILED";
 
-    case CURAND_STATUS_TYPE_ERROR:
-      return "CURAND_STATUS_TYPE_ERROR";
+  case CURAND_STATUS_TYPE_ERROR:
+    return "CURAND_STATUS_TYPE_ERROR";
 
-    case CURAND_STATUS_OUT_OF_RANGE:
-      return "CURAND_STATUS_OUT_OF_RANGE";
+  case CURAND_STATUS_OUT_OF_RANGE:
+    return "CURAND_STATUS_OUT_OF_RANGE";
 
-    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
-      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+  case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+    return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
 
-    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
-      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+  case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+    return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
 
-    case CURAND_STATUS_LAUNCH_FAILURE:
-      return "CURAND_STATUS_LAUNCH_FAILURE";
+  case CURAND_STATUS_LAUNCH_FAILURE:
+    return "CURAND_STATUS_LAUNCH_FAILURE";
 
-    case CURAND_STATUS_PREEXISTING_FAILURE:
-      return "CURAND_STATUS_PREEXISTING_FAILURE";
+  case CURAND_STATUS_PREEXISTING_FAILURE:
+    return "CURAND_STATUS_PREEXISTING_FAILURE";
 
-    case CURAND_STATUS_INITIALIZATION_FAILED:
-      return "CURAND_STATUS_INITIALIZATION_FAILED";
+  case CURAND_STATUS_INITIALIZATION_FAILED:
+    return "CURAND_STATUS_INITIALIZATION_FAILED";
 
-    case CURAND_STATUS_ARCH_MISMATCH:
-      return "CURAND_STATUS_ARCH_MISMATCH";
+  case CURAND_STATUS_ARCH_MISMATCH:
+    return "CURAND_STATUS_ARCH_MISMATCH";
 
-    case CURAND_STATUS_INTERNAL_ERROR:
-      return "CURAND_STATUS_INTERNAL_ERROR";
+  case CURAND_STATUS_INTERNAL_ERROR:
+    return "CURAND_STATUS_INTERNAL_ERROR";
   }
 
   return "<unknown>";
@@ -268,34 +268,34 @@ static const char *_cudaGetErrorEnum(curandStatus_t error) {
 
 #ifdef NVJPEGAPI
 // nvJPEG API errors
-static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
+static const char* _cudaGetErrorEnum(nvjpegStatus_t error) {
   switch (error) {
-    case NVJPEG_STATUS_SUCCESS:
-      return "NVJPEG_STATUS_SUCCESS";
+  case NVJPEG_STATUS_SUCCESS:
+    return "NVJPEG_STATUS_SUCCESS";
 
-    case NVJPEG_STATUS_NOT_INITIALIZED:
-      return "NVJPEG_STATUS_NOT_INITIALIZED";
+  case NVJPEG_STATUS_NOT_INITIALIZED:
+    return "NVJPEG_STATUS_NOT_INITIALIZED";
 
-    case NVJPEG_STATUS_INVALID_PARAMETER:
-      return "NVJPEG_STATUS_INVALID_PARAMETER";
+  case NVJPEG_STATUS_INVALID_PARAMETER:
+    return "NVJPEG_STATUS_INVALID_PARAMETER";
 
-    case NVJPEG_STATUS_BAD_JPEG:
-      return "NVJPEG_STATUS_BAD_JPEG";
+  case NVJPEG_STATUS_BAD_JPEG:
+    return "NVJPEG_STATUS_BAD_JPEG";
 
-    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
-      return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+  case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+    return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
 
-    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
-      return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+  case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+    return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
 
-    case NVJPEG_STATUS_EXECUTION_FAILED:
-      return "NVJPEG_STATUS_EXECUTION_FAILED";
+  case NVJPEG_STATUS_EXECUTION_FAILED:
+    return "NVJPEG_STATUS_EXECUTION_FAILED";
 
-    case NVJPEG_STATUS_ARCH_MISMATCH:
-      return "NVJPEG_STATUS_ARCH_MISMATCH";
+  case NVJPEG_STATUS_ARCH_MISMATCH:
+    return "NVJPEG_STATUS_ARCH_MISMATCH";
 
-    case NVJPEG_STATUS_INTERNAL_ERROR:
-      return "NVJPEG_STATUS_INTERNAL_ERROR";
+  case NVJPEG_STATUS_INTERNAL_ERROR:
+    return "NVJPEG_STATUS_INTERNAL_ERROR";
   }
 
   return "<unknown>";
@@ -304,258 +304,258 @@ static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
 
 #ifdef NV_NPPIDEFS_H
 // NPP API errors
-static const char *_cudaGetErrorEnum(NppStatus error) {
+static const char* _cudaGetErrorEnum(NppStatus error) {
   switch (error) {
-    case NPP_NOT_SUPPORTED_MODE_ERROR:
-      return "NPP_NOT_SUPPORTED_MODE_ERROR";
+  case NPP_NOT_SUPPORTED_MODE_ERROR:
+    return "NPP_NOT_SUPPORTED_MODE_ERROR";
 
-    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
-      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+  case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+    return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
 
-    case NPP_RESIZE_NO_OPERATION_ERROR:
-      return "NPP_RESIZE_NO_OPERATION_ERROR";
+  case NPP_RESIZE_NO_OPERATION_ERROR:
+    return "NPP_RESIZE_NO_OPERATION_ERROR";
 
-    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
-      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+  case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+    return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
 
 #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
 
-    case NPP_BAD_ARG_ERROR:
-      return "NPP_BAD_ARGUMENT_ERROR";
+  case NPP_BAD_ARG_ERROR:
+    return "NPP_BAD_ARGUMENT_ERROR";
 
-    case NPP_COEFF_ERROR:
-      return "NPP_COEFFICIENT_ERROR";
+  case NPP_COEFF_ERROR:
+    return "NPP_COEFFICIENT_ERROR";
 
-    case NPP_RECT_ERROR:
-      return "NPP_RECTANGLE_ERROR";
+  case NPP_RECT_ERROR:
+    return "NPP_RECTANGLE_ERROR";
 
-    case NPP_QUAD_ERROR:
-      return "NPP_QUADRANGLE_ERROR";
+  case NPP_QUAD_ERROR:
+    return "NPP_QUADRANGLE_ERROR";
 
-    case NPP_MEM_ALLOC_ERR:
-      return "NPP_MEMORY_ALLOCATION_ERROR";
+  case NPP_MEM_ALLOC_ERR:
+    return "NPP_MEMORY_ALLOCATION_ERROR";
 
-    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
-      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+  case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+    return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
 
-    case NPP_INVALID_INPUT:
-      return "NPP_INVALID_INPUT";
+  case NPP_INVALID_INPUT:
+    return "NPP_INVALID_INPUT";
 
-    case NPP_POINTER_ERROR:
-      return "NPP_POINTER_ERROR";
+  case NPP_POINTER_ERROR:
+    return "NPP_POINTER_ERROR";
 
-    case NPP_WARNING:
-      return "NPP_WARNING";
+  case NPP_WARNING:
+    return "NPP_WARNING";
 
-    case NPP_ODD_ROI_WARNING:
-      return "NPP_ODD_ROI_WARNING";
+  case NPP_ODD_ROI_WARNING:
+    return "NPP_ODD_ROI_WARNING";
 #else
 
-    // These are for CUDA 5.5 or higher
-    case NPP_BAD_ARGUMENT_ERROR:
-      return "NPP_BAD_ARGUMENT_ERROR";
+  // These are for CUDA 5.5 or higher
+  case NPP_BAD_ARGUMENT_ERROR:
+    return "NPP_BAD_ARGUMENT_ERROR";
 
-    case NPP_COEFFICIENT_ERROR:
-      return "NPP_COEFFICIENT_ERROR";
+  case NPP_COEFFICIENT_ERROR:
+    return "NPP_COEFFICIENT_ERROR";
 
-    case NPP_RECTANGLE_ERROR:
-      return "NPP_RECTANGLE_ERROR";
+  case NPP_RECTANGLE_ERROR:
+    return "NPP_RECTANGLE_ERROR";
 
-    case NPP_QUADRANGLE_ERROR:
-      return "NPP_QUADRANGLE_ERROR";
+  case NPP_QUADRANGLE_ERROR:
+    return "NPP_QUADRANGLE_ERROR";
 
-    case NPP_MEMORY_ALLOCATION_ERR:
-      return "NPP_MEMORY_ALLOCATION_ERROR";
+  case NPP_MEMORY_ALLOCATION_ERR:
+    return "NPP_MEMORY_ALLOCATION_ERROR";
 
-    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
-      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+  case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+    return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
 
-    case NPP_INVALID_HOST_POINTER_ERROR:
-      return "NPP_INVALID_HOST_POINTER_ERROR";
+  case NPP_INVALID_HOST_POINTER_ERROR:
+    return "NPP_INVALID_HOST_POINTER_ERROR";
 
-    case NPP_INVALID_DEVICE_POINTER_ERROR:
-      return "NPP_INVALID_DEVICE_POINTER_ERROR";
+  case NPP_INVALID_DEVICE_POINTER_ERROR:
+    return "NPP_INVALID_DEVICE_POINTER_ERROR";
 #endif
 
-    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
-      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+  case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+    return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
 
-    case NPP_TEXTURE_BIND_ERROR:
-      return "NPP_TEXTURE_BIND_ERROR";
+  case NPP_TEXTURE_BIND_ERROR:
+    return "NPP_TEXTURE_BIND_ERROR";
 
-    case NPP_WRONG_INTERSECTION_ROI_ERROR:
-      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+  case NPP_WRONG_INTERSECTION_ROI_ERROR:
+    return "NPP_WRONG_INTERSECTION_ROI_ERROR";
 
-    case NPP_NOT_EVEN_STEP_ERROR:
-      return "NPP_NOT_EVEN_STEP_ERROR";
+  case NPP_NOT_EVEN_STEP_ERROR:
+    return "NPP_NOT_EVEN_STEP_ERROR";
 
-    case NPP_INTERPOLATION_ERROR:
-      return "NPP_INTERPOLATION_ERROR";
+  case NPP_INTERPOLATION_ERROR:
+    return "NPP_INTERPOLATION_ERROR";
 
-    case NPP_RESIZE_FACTOR_ERROR:
-      return "NPP_RESIZE_FACTOR_ERROR";
+  case NPP_RESIZE_FACTOR_ERROR:
+    return "NPP_RESIZE_FACTOR_ERROR";
 
-    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
-      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+  case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+    return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
 
 #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
 
-    case NPP_MEMFREE_ERR:
-      return "NPP_MEMFREE_ERR";
+  case NPP_MEMFREE_ERR:
+    return "NPP_MEMFREE_ERR";
 
-    case NPP_MEMSET_ERR:
-      return "NPP_MEMSET_ERR";
+  case NPP_MEMSET_ERR:
+    return "NPP_MEMSET_ERR";
 
-    case NPP_MEMCPY_ERR:
-      return "NPP_MEMCPY_ERROR";
+  case NPP_MEMCPY_ERR:
+    return "NPP_MEMCPY_ERROR";
 
-    case NPP_MIRROR_FLIP_ERR:
-      return "NPP_MIRROR_FLIP_ERR";
+  case NPP_MIRROR_FLIP_ERR:
+    return "NPP_MIRROR_FLIP_ERR";
 #else
 
-    case NPP_MEMFREE_ERROR:
-      return "NPP_MEMFREE_ERROR";
+  case NPP_MEMFREE_ERROR:
+    return "NPP_MEMFREE_ERROR";
 
-    case NPP_MEMSET_ERROR:
-      return "NPP_MEMSET_ERROR";
+  case NPP_MEMSET_ERROR:
+    return "NPP_MEMSET_ERROR";
 
-    case NPP_MEMCPY_ERROR:
-      return "NPP_MEMCPY_ERROR";
+  case NPP_MEMCPY_ERROR:
+    return "NPP_MEMCPY_ERROR";
 
-    case NPP_MIRROR_FLIP_ERROR:
-      return "NPP_MIRROR_FLIP_ERROR";
+  case NPP_MIRROR_FLIP_ERROR:
+    return "NPP_MIRROR_FLIP_ERROR";
 #endif
 
-    case NPP_ALIGNMENT_ERROR:
-      return "NPP_ALIGNMENT_ERROR";
+  case NPP_ALIGNMENT_ERROR:
+    return "NPP_ALIGNMENT_ERROR";
 
-    case NPP_STEP_ERROR:
-      return "NPP_STEP_ERROR";
+  case NPP_STEP_ERROR:
+    return "NPP_STEP_ERROR";
 
-    case NPP_SIZE_ERROR:
-      return "NPP_SIZE_ERROR";
+  case NPP_SIZE_ERROR:
+    return "NPP_SIZE_ERROR";
 
-    case NPP_NULL_POINTER_ERROR:
-      return "NPP_NULL_POINTER_ERROR";
+  case NPP_NULL_POINTER_ERROR:
+    return "NPP_NULL_POINTER_ERROR";
 
-    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
-      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+  case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+    return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
 
-    case NPP_NOT_IMPLEMENTED_ERROR:
-      return "NPP_NOT_IMPLEMENTED_ERROR";
+  case NPP_NOT_IMPLEMENTED_ERROR:
+    return "NPP_NOT_IMPLEMENTED_ERROR";
 
-    case NPP_ERROR:
-      return "NPP_ERROR";
+  case NPP_ERROR:
+    return "NPP_ERROR";
 
-    case NPP_SUCCESS:
-      return "NPP_SUCCESS";
+  case NPP_SUCCESS:
+    return "NPP_SUCCESS";
 
-    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
-      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+  case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+    return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
 
-    case NPP_MISALIGNED_DST_ROI_WARNING:
-      return "NPP_MISALIGNED_DST_ROI_WARNING";
+  case NPP_MISALIGNED_DST_ROI_WARNING:
+    return "NPP_MISALIGNED_DST_ROI_WARNING";
 
-    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
-      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+  case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+    return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
 
-    case NPP_DOUBLE_SIZE_WARNING:
-      return "NPP_DOUBLE_SIZE_WARNING";
+  case NPP_DOUBLE_SIZE_WARNING:
+    return "NPP_DOUBLE_SIZE_WARNING";
 
-    case NPP_WRONG_INTERSECTION_ROI_WARNING:
-      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+  case NPP_WRONG_INTERSECTION_ROI_WARNING:
+    return "NPP_WRONG_INTERSECTION_ROI_WARNING";
 
 #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
-    /* These are 6.0 or higher */
-    case NPP_LUT_PALETTE_BITSIZE_ERROR:
-      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+  /* These are 6.0 or higher */
+  case NPP_LUT_PALETTE_BITSIZE_ERROR:
+    return "NPP_LUT_PALETTE_BITSIZE_ERROR";
 
-    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
-      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+  case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+    return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
 
-    case NPP_QUALITY_INDEX_ERROR:
-      return "NPP_QUALITY_INDEX_ERROR";
+  case NPP_QUALITY_INDEX_ERROR:
+    return "NPP_QUALITY_INDEX_ERROR";
 
-    case NPP_CHANNEL_ORDER_ERROR:
-      return "NPP_CHANNEL_ORDER_ERROR";
+  case NPP_CHANNEL_ORDER_ERROR:
+    return "NPP_CHANNEL_ORDER_ERROR";
 
-    case NPP_ZERO_MASK_VALUE_ERROR:
-      return "NPP_ZERO_MASK_VALUE_ERROR";
+  case NPP_ZERO_MASK_VALUE_ERROR:
+    return "NPP_ZERO_MASK_VALUE_ERROR";
 
-    case NPP_NUMBER_OF_CHANNELS_ERROR:
-      return "NPP_NUMBER_OF_CHANNELS_ERROR";
+  case NPP_NUMBER_OF_CHANNELS_ERROR:
+    return "NPP_NUMBER_OF_CHANNELS_ERROR";
 
-    case NPP_COI_ERROR:
-      return "NPP_COI_ERROR";
+  case NPP_COI_ERROR:
+    return "NPP_COI_ERROR";
 
-    case NPP_DIVISOR_ERROR:
-      return "NPP_DIVISOR_ERROR";
+  case NPP_DIVISOR_ERROR:
+    return "NPP_DIVISOR_ERROR";
 
-    case NPP_CHANNEL_ERROR:
-      return "NPP_CHANNEL_ERROR";
+  case NPP_CHANNEL_ERROR:
+    return "NPP_CHANNEL_ERROR";
 
-    case NPP_STRIDE_ERROR:
-      return "NPP_STRIDE_ERROR";
+  case NPP_STRIDE_ERROR:
+    return "NPP_STRIDE_ERROR";
 
-    case NPP_ANCHOR_ERROR:
-      return "NPP_ANCHOR_ERROR";
+  case NPP_ANCHOR_ERROR:
+    return "NPP_ANCHOR_ERROR";
 
-    case NPP_MASK_SIZE_ERROR:
-      return "NPP_MASK_SIZE_ERROR";
+  case NPP_MASK_SIZE_ERROR:
+    return "NPP_MASK_SIZE_ERROR";
 
-    case NPP_MOMENT_00_ZERO_ERROR:
-      return "NPP_MOMENT_00_ZERO_ERROR";
+  case NPP_MOMENT_00_ZERO_ERROR:
+    return "NPP_MOMENT_00_ZERO_ERROR";
 
-    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
-      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+  case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+    return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
 
-    case NPP_THRESHOLD_ERROR:
-      return "NPP_THRESHOLD_ERROR";
+  case NPP_THRESHOLD_ERROR:
+    return "NPP_THRESHOLD_ERROR";
 
-    case NPP_CONTEXT_MATCH_ERROR:
-      return "NPP_CONTEXT_MATCH_ERROR";
+  case NPP_CONTEXT_MATCH_ERROR:
+    return "NPP_CONTEXT_MATCH_ERROR";
 
-    case NPP_FFT_FLAG_ERROR:
-      return "NPP_FFT_FLAG_ERROR";
+  case NPP_FFT_FLAG_ERROR:
+    return "NPP_FFT_FLAG_ERROR";
 
-    case NPP_FFT_ORDER_ERROR:
-      return "NPP_FFT_ORDER_ERROR";
+  case NPP_FFT_ORDER_ERROR:
+    return "NPP_FFT_ORDER_ERROR";
 
-    case NPP_SCALE_RANGE_ERROR:
-      return "NPP_SCALE_RANGE_ERROR";
+  case NPP_SCALE_RANGE_ERROR:
+    return "NPP_SCALE_RANGE_ERROR";
 
-    case NPP_DATA_TYPE_ERROR:
-      return "NPP_DATA_TYPE_ERROR";
+  case NPP_DATA_TYPE_ERROR:
+    return "NPP_DATA_TYPE_ERROR";
 
-    case NPP_OUT_OFF_RANGE_ERROR:
-      return "NPP_OUT_OFF_RANGE_ERROR";
+  case NPP_OUT_OFF_RANGE_ERROR:
+    return "NPP_OUT_OFF_RANGE_ERROR";
 
-    case NPP_DIVIDE_BY_ZERO_ERROR:
-      return "NPP_DIVIDE_BY_ZERO_ERROR";
+  case NPP_DIVIDE_BY_ZERO_ERROR:
+    return "NPP_DIVIDE_BY_ZERO_ERROR";
 
-    case NPP_RANGE_ERROR:
-      return "NPP_RANGE_ERROR";
+  case NPP_RANGE_ERROR:
+    return "NPP_RANGE_ERROR";
 
-    case NPP_NO_MEMORY_ERROR:
-      return "NPP_NO_MEMORY_ERROR";
+  case NPP_NO_MEMORY_ERROR:
+    return "NPP_NO_MEMORY_ERROR";
 
-    case NPP_ERROR_RESERVED:
-      return "NPP_ERROR_RESERVED";
+  case NPP_ERROR_RESERVED:
+    return "NPP_ERROR_RESERVED";
 
-    case NPP_NO_OPERATION_WARNING:
-      return "NPP_NO_OPERATION_WARNING";
+  case NPP_NO_OPERATION_WARNING:
+    return "NPP_NO_OPERATION_WARNING";
 
-    case NPP_DIVIDE_BY_ZERO_WARNING:
-      return "NPP_DIVIDE_BY_ZERO_WARNING";
+  case NPP_DIVIDE_BY_ZERO_WARNING:
+    return "NPP_DIVIDE_BY_ZERO_WARNING";
 #endif
 
 #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
-    /* These are 7.0 or higher */
-    case NPP_OVERFLOW_ERROR:
-      return "NPP_OVERFLOW_ERROR";
+  /* These are 7.0 or higher */
+  case NPP_OVERFLOW_ERROR:
+    return "NPP_OVERFLOW_ERROR";
 
-    case NPP_CORRUPTED_DATA_ERROR:
-      return "NPP_CORRUPTED_DATA_ERROR";
+  case NPP_CORRUPTED_DATA_ERROR:
+    return "NPP_CORRUPTED_DATA_ERROR";
 #endif
   }
 
@@ -564,7 +564,7 @@ static const char *_cudaGetErrorEnum(NppStatus error) {
 #endif
 
 template <typename T>
-void check(T result, char const *const func, const char *const file,
+void check(T result, char const* const func, const char* const file,
            int const line) {
   if (result) {
     fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
@@ -581,7 +581,7 @@ void check(T result, char const *const func, const char *const file,
 // This will output the proper error string when calling cudaGetLastError
 #define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
 
-inline void __getLastCudaError(const char *errorMessage, const char *file,
+inline void __getLastCudaError(const char* errorMessage, const char* file,
                                const int line) {
   cudaError_t err = cudaGetLastError();
 
@@ -599,7 +599,7 @@ inline void __getLastCudaError(const char *errorMessage, const char *file,
 // but not exit program incase error detected.
 #define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
 
-inline void __printLastCudaError(const char *errorMessage, const char *file,
+inline void __printLastCudaError(const char* errorMessage, const char* file,
                                  const int line) {
   cudaError_t err = cudaGetLastError();
 
@@ -628,29 +628,16 @@ inline int _ConvertSMVer2Cores(int major, int minor) {
   // Defines for GPU Architecture types (using the SM version to determine
   // the # of cores per SM
   typedef struct {
-    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
+    int SM; // 0xMm (hexadecimal notation), M = SM Major version,
     // and m = SM minor version
     int Cores;
   } sSMtoCores;
 
   sSMtoCores nGpuArchCoresPerSM[] = {
-      {0x30, 192},
-      {0x32, 192},
-      {0x35, 192},
-      {0x37, 192},
-      {0x50, 128},
-      {0x52, 128},
-      {0x53, 128},
-      {0x60,  64},
-      {0x61, 128},
-      {0x62, 128},
-      {0x70,  64},
-      {0x72,  64},
-      {0x75,  64},
-      {0x80,  64},
-      {0x86, 128},
-      {0x87, 128},
-      {-1, -1}};
+      {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128},
+      {0x52, 128}, {0x53, 128}, {0x60, 64},  {0x61, 128}, {0x62, 128},
+      {0x70, 64},  {0x72, 64},  {0x75, 64},  {0x80, 64},  {0x86, 128},
+      {0x87, 128}, {-1, -1}};
 
   int index = 0;
 
@@ -664,10 +651,9 @@ inline int _ConvertSMVer2Cores(int major, int minor) {
 
   // If we don't find the values, we default use the previous one
   // to run properly
-  printf(
-      "MapSMtoCores for SM %d.%d is undefined."
-      "  Default to use %d Cores/SM\n",
-      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  printf("MapSMtoCores for SM %d.%d is undefined."
+         "  Default to use %d Cores/SM\n",
+         major, minor, nGpuArchCoresPerSM[index - 1].Cores);
   return nGpuArchCoresPerSM[index - 1].Cores;
 }
 
@@ -675,29 +661,18 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
   // Defines for GPU Architecture types (using the SM version to determine
   // the GPU Arch name)
   typedef struct {
-    int SM;  // 0xMm (hexidecimal notation), M = SM Major version,
+    int SM; // 0xMm (hexadecimal notation), M = SM Major version,
     // and m = SM minor version
     const char* name;
   } sSMtoArchName;
 
   sSMtoArchName nGpuArchNameSM[] = {
-      {0x30, "Kepler"},
-      {0x32, "Kepler"},
-      {0x35, "Kepler"},
-      {0x37, "Kepler"},
-      {0x50, "Maxwell"},
-      {0x52, "Maxwell"},
-      {0x53, "Maxwell"},
-      {0x60, "Pascal"},
-      {0x61, "Pascal"},
-      {0x62, "Pascal"},
-      {0x70, "Volta"},
-      {0x72, "Xavier"},
-      {0x75, "Turing"},
-      {0x80, "Ampere"},
-      {0x86, "Ampere"},
-      {0x87, "Ampere"},
-      {-1, "Graphics Device"}};
+      {0x30, "Kepler"},  {0x32, "Kepler"},       {0x35, "Kepler"},
+      {0x37, "Kepler"},  {0x50, "Maxwell"},      {0x52, "Maxwell"},
+      {0x53, "Maxwell"}, {0x60, "Pascal"},       {0x61, "Pascal"},
+      {0x62, "Pascal"},  {0x70, "Volta"},        {0x72, "Xavier"},
+      {0x75, "Turing"},  {0x80, "Ampere"},       {0x86, "Ampere"},
+      {0x87, "Ampere"},  {-1, "Graphics Device"}};
 
   int index = 0;
 
@@ -711,13 +686,12 @@ inline const char* _ConvertSMVer2ArchName(int major, int minor) {
 
   // If we don't find the values, we default use the previous one
   // to run properly
-  printf(
-      "MapSMtoArchName for SM %d.%d is undefined."
-      "  Default to use %s\n",
-      major, minor, nGpuArchNameSM[index - 1].name);
+  printf("MapSMtoArchName for SM %d.%d is undefined."
+         "  Default to use %s\n",
+         major, minor, nGpuArchNameSM[index - 1].name);
   return nGpuArchNameSM[index - 1].name;
 }
-  // end of GPU Architecture definitions
+// end of GPU Architecture definitions
 
 #ifdef __CUDA_RUNTIME_H__
 // General GPU Device CUDA Initialization
@@ -726,9 +700,8 @@ inline int gpuDeviceInit(int devID) {
   checkCudaErrors(cudaGetDeviceCount(&device_count));
 
   if (device_count == 0) {
-    fprintf(stderr,
-            "gpuDeviceInit() CUDA error: "
-            "no devices supporting CUDA.\n");
+    fprintf(stderr, "gpuDeviceInit() CUDA error: "
+                    "no devices supporting CUDA.\n");
     exit(EXIT_FAILURE);
   }
 
@@ -749,13 +722,15 @@ inline int gpuDeviceInit(int devID) {
   }
 
   int computeMode = -1, major = 0, minor = 0;
-  checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
-  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
-  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
   if (computeMode == cudaComputeModeProhibited) {
-    fprintf(stderr,
-            "Error: device is running in <Compute Mode "
-            "Prohibited>, no threads can use cudaSetDevice().\n");
+    fprintf(stderr, "Error: device is running in <Compute Mode "
+                    "Prohibited>, no threads can use cudaSetDevice().\n");
     return -1;
   }
 
@@ -765,7 +740,8 @@ inline int gpuDeviceInit(int devID) {
   }
 
   checkCudaErrors(cudaSetDevice(devID));
-  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor));
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID,
+         _ConvertSMVer2ArchName(major, minor));
 
   return devID;
 }
@@ -781,9 +757,8 @@ inline int gpuGetMaxGflopsDeviceId() {
   checkCudaErrors(cudaGetDeviceCount(&device_count));
 
   if (device_count == 0) {
-    fprintf(stderr,
-            "gpuGetMaxGflopsDeviceId() CUDA error:"
-            " no devices supporting CUDA.\n");
+    fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:"
+                    " no devices supporting CUDA.\n");
     exit(EXIT_FAILURE);
   }
 
@@ -792,9 +767,12 @@ inline int gpuGetMaxGflopsDeviceId() {
 
   while (current_device < device_count) {
     int computeMode = -1, major = 0, minor = 0;
-    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
-    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
-    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode,
+                                           current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(
+        &major, cudaDevAttrComputeCapabilityMajor, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(
+        &minor, cudaDevAttrComputeCapabilityMinor, current_device));
 
     // If this GPU is not running on Compute Mode prohibited,
     // then we can add it to the list
@@ -802,25 +780,28 @@ inline int gpuGetMaxGflopsDeviceId() {
       if (major == 9999 && minor == 9999) {
         sm_per_multiproc = 1;
       } else {
-        sm_per_multiproc =
-            _ConvertSMVer2Cores(major,  minor);
+        sm_per_multiproc = _ConvertSMVer2Cores(major, minor);
       }
       int multiProcessorCount = 0, clockRate = 0;
-      checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device));
-      cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device);
+      checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount,
+                                             cudaDevAttrMultiProcessorCount,
+                                             current_device));
+      cudaError_t result = cudaDeviceGetAttribute(
+          &clockRate, cudaDevAttrClockRate, current_device);
       if (result != cudaSuccess) {
         // If cudaDevAttrClockRate attribute is not supported we
         // set clockRate as 1, to consider GPU with most SMs and CUDA Cores.
-        if(result == cudaErrorInvalidValue) {
+        if (result == cudaErrorInvalidValue) {
           clockRate = 1;
-        }
-        else {
-          fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__,
-            static_cast<unsigned int>(result), _cudaGetErrorEnum(result));
+        } else {
+          fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__,
+                  __LINE__, static_cast<unsigned int>(result),
+                  _cudaGetErrorEnum(result));
           exit(EXIT_FAILURE);
         }
       }
-      uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
+      uint64_t compute_perf =
+          (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
 
       if (compute_perf > max_compute_perf) {
         max_compute_perf = compute_perf;
@@ -834,9 +815,8 @@ inline int gpuGetMaxGflopsDeviceId() {
   }
 
   if (devices_prohibited == device_count) {
-    fprintf(stderr,
-            "gpuGetMaxGflopsDeviceId() CUDA error:"
-            " all devices have compute mode prohibited.\n");
+    fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:"
+                    " all devices have compute mode prohibited.\n");
     exit(EXIT_FAILURE);
   }
 
@@ -844,7 +824,7 @@ inline int gpuGetMaxGflopsDeviceId() {
 }
 
 // Initialization code to find the best CUDA Device
-inline int findCudaDevice(int argc, const char **argv) {
+inline int findCudaDevice(int argc, const char** argv) {
   int devID = 0;
 
   // If the command-line has a device number specified, use it
@@ -866,12 +846,13 @@ inline int findCudaDevice(int argc, const char **argv) {
     // Otherwise pick the device with highest Gflops/s
     devID = gpuGetMaxGflopsDeviceId();
     checkCudaErrors(cudaSetDevice(devID));
-    int major = 0, minor = 0; 
-    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
-    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
-    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
-           devID, _ConvertSMVer2ArchName(major, minor), major, minor);
-
+    int major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(
+        &major, cudaDevAttrComputeCapabilityMajor, devID));
+    checkCudaErrors(cudaDeviceGetAttribute(
+        &minor, cudaDevAttrComputeCapabilityMinor, devID));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID,
+           _ConvertSMVer2ArchName(major, minor), major, minor);
   }
 
   return devID;
@@ -892,18 +873,23 @@ inline int findIntegratedGPU() {
   // Find the integrated GPU which is compute capable
   while (current_device < device_count) {
     int computeMode = -1, integrated = -1;
-    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
-    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode,
+                                           current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated,
+                                           current_device));
     // If GPU is integrated and is not running on Compute Mode prohibited,
     // then cuda can map to GLES resource
     if (integrated && (computeMode != cudaComputeModeProhibited)) {
       checkCudaErrors(cudaSetDevice(current_device));
 
-      int major = 0, minor = 0; 
-      checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
-      checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+      int major = 0, minor = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(
+          &major, cudaDevAttrComputeCapabilityMajor, current_device));
+      checkCudaErrors(cudaDeviceGetAttribute(
+          &minor, cudaDevAttrComputeCapabilityMinor, current_device));
       printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
-             current_device, _ConvertSMVer2ArchName(major, minor), major, minor);
+             current_device, _ConvertSMVer2ArchName(major, minor), major,
+             minor);
 
       return current_device;
     } else {
@@ -914,9 +900,8 @@ inline int findIntegratedGPU() {
   }
 
   if (devices_prohibited == device_count) {
-    fprintf(stderr,
-            "CUDA error:"
-            " No GLES-CUDA Interop capable GPU found.\n");
+    fprintf(stderr, "CUDA error:"
+                    " No GLES-CUDA Interop capable GPU found.\n");
     exit(EXIT_FAILURE);
   }
 
@@ -929,25 +914,25 @@ inline bool checkCudaCapabilities(int major_version, int minor_version) {
   int major = 0, minor = 0;
 
   checkCudaErrors(cudaGetDevice(&dev));
-  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
-  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+  checkCudaErrors(
+      cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
 
   if ((major > major_version) ||
-      (major == major_version &&
-       minor >= minor_version)) {
+      (major == major_version && minor >= minor_version)) {
     printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
            _ConvertSMVer2ArchName(major, minor), major, minor);
     return true;
   } else {
-    printf(
-        "  No GPU device was found that can support "
-        "CUDA compute capability %d.%d.\n",
-        major_version, minor_version);
+    printf("  No GPU device was found that can support "
+           "CUDA compute capability %d.%d.\n",
+           major_version, minor_version);
     return false;
   }
 }
 #endif
 
-  // end of CUDA Helper Functions
+// end of CUDA Helper Functions
 
-#endif  // COMMON_HELPER_CUDA_H_
+#endif // COMMON_HELPER_CUDA_H_
diff --git a/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp b/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp
index 59fdbc3..e5a631f 100644
--- a/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp
+++ b/src/qfvm_gpu/cuda_utils/helper_custatevec.hpp
@@ -5,27 +5,32 @@
  */
 
 #define HANDLE_ERROR(x)                                                        \
-{   const auto err = x;                                                        \
-    if (err != CUSTATEVEC_STATUS_SUCCESS ) {                                   \
-        printf("Error: %s in line %d\n",                                       \
-               custatevecGetErrorString(err), __LINE__); return err; }         \
-};
+  {                                                                            \
+    const auto err = x;                                                        \
+    if (err != CUSTATEVEC_STATUS_SUCCESS) {                                    \
+      printf("Error: %s in line %d\n", custatevecGetErrorString(err),          \
+             __LINE__);                                                        \
+      return err;                                                              \
+    }                                                                          \
+  };
 
 #define HANDLE_CUDA_ERROR(x)                                                   \
-{   const auto err = x;                                                        \
-    if (err != cudaSuccess ) {                                                 \
-        printf("Error: %s in line %d\n",                                       \
-               cudaGetErrorString(err), __LINE__); return err; }               \
-};
+  {                                                                            \
+    const auto err = x;                                                        \
+    if (err != cudaSuccess) {                                                  \
+      printf("Error: %s in line %d\n", cudaGetErrorString(err), __LINE__);     \
+      return err;                                                              \
+    }                                                                          \
+  };
 
 bool almost_equal(cuDoubleComplex x, cuDoubleComplex y) {
-    const double eps = 1.0e-5;
-    const cuDoubleComplex diff = cuCsub(x, y);
-    return (cuCabs(diff) < eps);
+  const double eps = 1.0e-5;
+  const cuDoubleComplex diff = cuCsub(x, y);
+  return (cuCabs(diff) < eps);
 }
 
 bool almost_equal(double x, double y) {
-    const double eps = 1.0e-5;
-    const double diff = x - y;
-    return (abs(diff) < eps);
+  const double eps = 1.0e-5;
+  const double diff = x - y;
+  return (abs(diff) < eps);
 }
diff --git a/src/qfvm_gpu/cuda_utils/helper_string.h b/src/qfvm_gpu/cuda_utils/helper_string.h
index 77864b8..15aa1ab 100644
--- a/src/qfvm_gpu/cuda_utils/helper_string.h
+++ b/src/qfvm_gpu/cuda_utils/helper_string.h
@@ -13,9 +13,9 @@
 #ifndef COMMON_HELPER_STRING_H_
 #define COMMON_HELPER_STRING_H_
 
+#include <fstream>
 #include <stdio.h>
 #include <stdlib.h>
-#include <fstream>
 #include <string>
 
 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
@@ -44,7 +44,7 @@
 #ifndef SPRINTF
 #define SPRINTF sprintf_s
 #endif
-#else  // Linux Includes
+#else // Linux Includes
 #include <string.h>
 #include <strings.h>
 
@@ -77,7 +77,7 @@
 #endif
 
 // CUDA Utility Helper Functions
-inline int stringRemoveDelimiter(char delimiter, const char *string) {
+inline int stringRemoveDelimiter(char delimiter, const char* string) {
   int string_start = 0;
 
   while (string[string_start] == delimiter) {
@@ -91,14 +91,16 @@ inline int stringRemoveDelimiter(char delimiter, const char *string) {
   return string_start;
 }
 
-inline int getFileExtension(char *filename, char **extension) {
+inline int getFileExtension(char* filename, char** extension) {
   int string_length = static_cast<int>(strlen(filename));
 
   while (filename[string_length--] != '.') {
-    if (string_length == 0) break;
+    if (string_length == 0)
+      break;
   }
 
-  if (string_length > 0) string_length += 2;
+  if (string_length > 0)
+    string_length += 2;
 
   if (string_length == 0)
     *extension = NULL;
@@ -108,16 +110,16 @@ inline int getFileExtension(char *filename, char **extension) {
   return string_length;
 }
 
-inline bool checkCmdLineFlag(const int argc, const char **argv,
-                             const char *string_ref) {
+inline bool checkCmdLineFlag(const int argc, const char** argv,
+                             const char* string_ref) {
   bool bFound = false;
 
   if (argc >= 1) {
     for (int i = 1; i < argc; i++) {
       int string_start = stringRemoveDelimiter('-', argv[i]);
-      const char *string_argv = &argv[i][string_start];
+      const char* string_argv = &argv[i][string_start];
 
-      const char *equal_pos = strchr(string_argv, '=');
+      const char* equal_pos = strchr(string_argv, '=');
       int argv_length = static_cast<int>(
           equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
 
@@ -136,14 +138,14 @@ inline bool checkCmdLineFlag(const int argc, const char **argv,
 
 // This function wraps the CUDA Driver API into a template function
 template <class T>
-inline bool getCmdLineArgumentValue(const int argc, const char **argv,
-                                    const char *string_ref, T *value) {
+inline bool getCmdLineArgumentValue(const int argc, const char** argv,
+                                    const char* string_ref, T* value) {
   bool bFound = false;
 
   if (argc >= 1) {
     for (int i = 1; i < argc; i++) {
       int string_start = stringRemoveDelimiter('-', argv[i]);
-      const char *string_argv = &argv[i][string_start];
+      const char* string_argv = &argv[i][string_start];
       int length = static_cast<int>(strlen(string_ref));
 
       if (!STRNCASECMP(string_argv, string_ref, length)) {
@@ -161,15 +163,15 @@ inline bool getCmdLineArgumentValue(const int argc, const char **argv,
   return bFound;
 }
 
-inline int getCmdLineArgumentInt(const int argc, const char **argv,
-                                 const char *string_ref) {
+inline int getCmdLineArgumentInt(const int argc, const char** argv,
+                                 const char* string_ref) {
   bool bFound = false;
   int value = -1;
 
   if (argc >= 1) {
     for (int i = 1; i < argc; i++) {
       int string_start = stringRemoveDelimiter('-', argv[i]);
-      const char *string_argv = &argv[i][string_start];
+      const char* string_argv = &argv[i][string_start];
       int length = static_cast<int>(strlen(string_ref));
 
       if (!STRNCASECMP(string_argv, string_ref, length)) {
@@ -193,15 +195,15 @@ inline int getCmdLineArgumentInt(const int argc, const char **argv,
   }
 }
 
-inline float getCmdLineArgumentFloat(const int argc, const char **argv,
-                                     const char *string_ref) {
+inline float getCmdLineArgumentFloat(const int argc, const char** argv,
+                                     const char* string_ref) {
   bool bFound = false;
   float value = -1;
 
   if (argc >= 1) {
     for (int i = 1; i < argc; i++) {
       int string_start = stringRemoveDelimiter('-', argv[i]);
-      const char *string_argv = &argv[i][string_start];
+      const char* string_argv = &argv[i][string_start];
       int length = static_cast<int>(strlen(string_ref));
 
       if (!STRNCASECMP(string_argv, string_ref, length)) {
@@ -225,15 +227,15 @@ inline float getCmdLineArgumentFloat(const int argc, const char **argv,
   }
 }
 
-inline bool getCmdLineArgumentString(const int argc, const char **argv,
-                                     const char *string_ref,
-                                     char **string_retval) {
+inline bool getCmdLineArgumentString(const int argc, const char** argv,
+                                     const char* string_ref,
+                                     char** string_retval) {
   bool bFound = false;
 
   if (argc >= 1) {
     for (int i = 1; i < argc; i++) {
       int string_start = stringRemoveDelimiter('-', argv[i]);
-      char *string_argv = const_cast<char*>(&argv[i][string_start]);
+      char* string_argv = const_cast<char*>(&argv[i][string_start]);
       int length = static_cast<int>(strlen(string_ref));
 
       if (!STRNCASECMP(string_argv, string_ref, length)) {
@@ -259,8 +261,8 @@ inline bool getCmdLineArgumentString(const int argc, const char **argv,
 //! @param filename         name of the file
 //! @param executable_path  optional absolute path of the executable
 //////////////////////////////////////////////////////////////////////////////
-inline char *sdkFindFilePath(const char *filename,
-                             const char *executable_path) {
+inline char* sdkFindFilePath(const char* filename,
+                             const char* executable_path) {
   // <executable_name> defines a variable that is replaced with the name of the
   // executable
 
@@ -268,349 +270,349 @@ inline char *sdkFindFilePath(const char *filename,
   // input data, or JIT source files) The origin for the relative search may be
   // the .exe file, a .bat file launching an .exe, a browser .exe launching the
   // .exe or .bat, etc
-  const char *searchPath[] = {
-      "./",  // same dir
+  const char* searchPath[] = {
+      "./", // same dir
       "./<executable_name>_data_files/",
-      "./common/",                      // "/common/" subdir
-      "./common/data/",                 // "/common/data/" subdir
-      "./data/",                        // "/data/" subdir
-      "./src/",                         // "/src/" subdir
-      "./src/<executable_name>/data/",  // "/src/<executable_name>/data/" subdir
-      "./inc/",                         // "/inc/" subdir
-      "./0_Simple/",                    // "/0_Simple/" subdir
-      "./1_Utilities/",                 // "/1_Utilities/" subdir
-      "./2_Graphics/",                  // "/2_Graphics/" subdir
-      "./3_Imaging/",                   // "/3_Imaging/" subdir
-      "./4_Finance/",                   // "/4_Finance/" subdir
-      "./5_Simulations/",               // "/5_Simulations/" subdir
-      "./6_Advanced/",                  // "/6_Advanced/" subdir
-      "./7_CUDALibraries/",             // "/7_CUDALibraries/" subdir
-      "./8_Android/",                   // "/8_Android/" subdir
-      "./samples/",                     // "/samples/" subdir
-
-      "./0_Simple/<executable_name>/data/",  // "/0_Simple/<executable_name>/data/"
+      "./common/",                     // "/common/" subdir
+      "./common/data/",                // "/common/data/" subdir
+      "./data/",                       // "/data/" subdir
+      "./src/",                        // "/src/" subdir
+      "./src/<executable_name>/data/", // "/src/<executable_name>/data/" subdir
+      "./inc/",                        // "/inc/" subdir
+      "./0_Simple/",                   // "/0_Simple/" subdir
+      "./1_Utilities/",                // "/1_Utilities/" subdir
+      "./2_Graphics/",                 // "/2_Graphics/" subdir
+      "./3_Imaging/",                  // "/3_Imaging/" subdir
+      "./4_Finance/",                  // "/4_Finance/" subdir
+      "./5_Simulations/",              // "/5_Simulations/" subdir
+      "./6_Advanced/",                 // "/6_Advanced/" subdir
+      "./7_CUDALibraries/",            // "/7_CUDALibraries/" subdir
+      "./8_Android/",                  // "/8_Android/" subdir
+      "./samples/",                    // "/samples/" subdir
+
+      "./0_Simple/<executable_name>/data/", // "/0_Simple/<executable_name>/data/"
+                                            // subdir
+      "./1_Utilities/<executable_name>/data/", // "/1_Utilities/<executable_name>/data/"
+                                               // subdir
+      "./2_Graphics/<executable_name>/data/", // "/2_Graphics/<executable_name>/data/"
+                                              // subdir
+      "./3_Imaging/<executable_name>/data/", // "/3_Imaging/<executable_name>/data/"
+                                             // subdir
+      "./4_Finance/<executable_name>/data/", // "/4_Finance/<executable_name>/data/"
                                              // subdir
-      "./1_Utilities/<executable_name>/data/",  // "/1_Utilities/<executable_name>/data/"
+      "./5_Simulations/<executable_name>/data/", // "/5_Simulations/<executable_name>/data/"
+                                                 // subdir
+      "./6_Advanced/<executable_name>/data/", // "/6_Advanced/<executable_name>/data/"
+                                              // subdir
+      "./7_CUDALibraries/<executable_name>/", // "/7_CUDALibraries/<executable_name>/"
+                                              // subdir
+      "./7_CUDALibraries/<executable_name>/data/", // "/7_CUDALibraries/<executable_name>/data/"
+                                                   // subdir
+
+      "../",             // up 1 in tree
+      "../common/",      // up 1 in tree, "/common/" subdir
+      "../common/data/", // up 1 in tree, "/common/data/" subdir
+      "../data/",        // up 1 in tree, "/data/" subdir
+      "../src/",         // up 1 in tree, "/src/" subdir
+      "../inc/",         // up 1 in tree, "/inc/" subdir
+
+      "../0_Simple/<executable_name>/data/", // up 1 in tree,
+                                             // "/0_Simple/<executable_name>/"
+                                             // subdir
+      "../1_Utilities/<executable_name>/data/", // up 1 in tree,
+                                                // "/1_Utilities/<executable_name>/"
                                                 // subdir
-      "./2_Graphics/<executable_name>/data/",  // "/2_Graphics/<executable_name>/data/"
+      "../2_Graphics/<executable_name>/data/", // up 1 in tree,
+                                               // "/2_Graphics/<executable_name>/"
                                                // subdir
-      "./3_Imaging/<executable_name>/data/",  // "/3_Imaging/<executable_name>/data/"
+      "../3_Imaging/<executable_name>/data/", // up 1 in tree,
+                                              // "/3_Imaging/<executable_name>/"
                                               // subdir
-      "./4_Finance/<executable_name>/data/",  // "/4_Finance/<executable_name>/data/"
+      "../4_Finance/<executable_name>/data/", // up 1 in tree,
+                                              // "/4_Finance/<executable_name>/"
                                               // subdir
-      "./5_Simulations/<executable_name>/data/",  // "/5_Simulations/<executable_name>/data/"
+      "../5_Simulations/<executable_name>/data/", // up 1 in tree,
+                                                  // "/5_Simulations/<executable_name>/"
                                                   // subdir
-      "./6_Advanced/<executable_name>/data/",  // "/6_Advanced/<executable_name>/data/"
+      "../6_Advanced/<executable_name>/data/", // up 1 in tree,
+                                               // "/6_Advanced/<executable_name>/"
                                                // subdir
-      "./7_CUDALibraries/<executable_name>/",  // "/7_CUDALibraries/<executable_name>/"
+      "../7_CUDALibraries/<executable_name>/data/", // up 1 in tree,
+                                                    // "/7_CUDALibraries/<executable_name>/"
+                                                    // subdir
+      "../8_Android/<executable_name>/data/", // up 1 in tree,
+                                              // "/8_Android/<executable_name>/"
+                                              // subdir
+      "../samples/<executable_name>/data/",   // up 1 in tree,
+                                              // "/samples/<executable_name>/"
+                                              // subdir
+      "../../",                               // up 2 in tree
+      "../../common/",                        // up 2 in tree, "/common/" subdir
+      "../../common/data/", // up 2 in tree, "/common/data/" subdir
+      "../../data/",        // up 2 in tree, "/data/" subdir
+      "../../src/",         // up 2 in tree, "/src/" subdir
+      "../../inc/",         // up 2 in tree, "/inc/" subdir
+      "../../sandbox/<executable_name>/data/",  // up 2 in tree,
+                                                // "/sandbox/<executable_name>/"
+                                                // subdir
+      "../../0_Simple/<executable_name>/data/", // up 2 in tree,
+                                                // "/0_Simple/<executable_name>/"
+                                                // subdir
+      "../../1_Utilities/<executable_name>/data/", // up 2 in tree,
+                                                   // "/1_Utilities/<executable_name>/"
+                                                   // subdir
+      "../../2_Graphics/<executable_name>/data/", // up 2 in tree,
+                                                  // "/2_Graphics/<executable_name>/"
+                                                  // subdir
+      "../../3_Imaging/<executable_name>/data/", // up 2 in tree,
+                                                 // "/3_Imaging/<executable_name>/"
+                                                 // subdir
+      "../../4_Finance/<executable_name>/data/", // up 2 in tree,
+                                                 // "/4_Finance/<executable_name>/"
+                                                 // subdir
+      "../../5_Simulations/<executable_name>/data/", // up 2 in tree,
+                                                     // "/5_Simulations/<executable_name>/"
+                                                     // subdir
+      "../../6_Advanced/<executable_name>/data/", // up 2 in tree,
+                                                  // "/6_Advanced/<executable_name>/"
+                                                  // subdir
+      "../../7_CUDALibraries/<executable_name>/data/", // up 2 in tree,
+                                                       // "/7_CUDALibraries/<executable_name>/"
+                                                       // subdir
+      "../../8_Android/<executable_name>/data/", // up 2 in tree,
+                                                 // "/8_Android/<executable_name>/"
+                                                 // subdir
+      "../../samples/<executable_name>/data/", // up 2 in tree,
+                                               // "/samples/<executable_name>/"
                                                // subdir
-      "./7_CUDALibraries/<executable_name>/data/",  // "/7_CUDALibraries/<executable_name>/data/"
+      "../../../",                             // up 3 in tree
+      "../../../src/<executable_name>/",       // up 3 in tree,
+                                         // "/src/<executable_name>/" subdir
+      "../../../src/<executable_name>/data/", // up 3 in tree,
+                                              // "/src/<executable_name>/data/"
+                                              // subdir
+      "../../../src/<executable_name>/src/",  // up 3 in tree,
+                                              // "/src/<executable_name>/src/"
+                                              // subdir
+      "../../../src/<executable_name>/inc/",  // up 3 in tree,
+                                              // "/src/<executable_name>/inc/"
+                                              // subdir
+      "../../../sandbox/<executable_name>/",  // up 3 in tree,
+                                              // "/sandbox/<executable_name>/"
+                                              // subdir
+      "../../../sandbox/<executable_name>/data/", // up 3 in tree,
+                                                  // "/sandbox/<executable_name>/data/"
+                                                  // subdir
+      "../../../sandbox/<executable_name>/src/", // up 3 in tree,
+                                                 // "/sandbox/<executable_name>/src/"
+                                                 // subdir
+      "../../../sandbox/<executable_name>/inc/", // up 3 in tree,
+                                                 // "/sandbox/<executable_name>/inc/"
+                                                 // subdir
+      "../../../0_Simple/<executable_name>/data/", // up 3 in tree,
+                                                   // "/0_Simple/<executable_name>/"
+                                                   // subdir
+      "../../../1_Utilities/<executable_name>/data/", // up 3 in tree,
+                                                      // "/1_Utilities/<executable_name>/"
+                                                      // subdir
+      "../../../2_Graphics/<executable_name>/data/", // up 3 in tree,
+                                                     // "/2_Graphics/<executable_name>/"
+                                                     // subdir
+      "../../../3_Imaging/<executable_name>/data/", // up 3 in tree,
+                                                    // "/3_Imaging/<executable_name>/"
                                                     // subdir
-
-      "../",              // up 1 in tree
-      "../common/",       // up 1 in tree, "/common/" subdir
-      "../common/data/",  // up 1 in tree, "/common/data/" subdir
-      "../data/",         // up 1 in tree, "/data/" subdir
-      "../src/",          // up 1 in tree, "/src/" subdir
-      "../inc/",          // up 1 in tree, "/inc/" subdir
-
-      "../0_Simple/<executable_name>/data/",  // up 1 in tree,
+      "../../../4_Finance/<executable_name>/data/", // up 3 in tree,
+                                                    // "/4_Finance/<executable_name>/"
+                                                    // subdir
+      "../../../5_Simulations/<executable_name>/data/", // up 3 in tree,
+                                                        // "/5_Simulations/<executable_name>/"
+                                                        // subdir
+      "../../../6_Advanced/<executable_name>/data/", // up 3 in tree,
+                                                     // "/6_Advanced/<executable_name>/"
+                                                     // subdir
+      "../../../7_CUDALibraries/<executable_name>/data/", // up 3 in tree,
+                                                          // "/7_CUDALibraries/<executable_name>/"
+                                                          // subdir
+      "../../../8_Android/<executable_name>/data/", // up 3 in tree,
+                                                    // "/8_Android/<executable_name>/"
+                                                    // subdir
+      "../../../0_Simple/<executable_name>/", // up 3 in tree,
                                               // "/0_Simple/<executable_name>/"
                                               // subdir
-      "../1_Utilities/<executable_name>/data/",  // up 1 in tree,
+      "../../../1_Utilities/<executable_name>/", // up 3 in tree,
                                                  // "/1_Utilities/<executable_name>/"
                                                  // subdir
-      "../2_Graphics/<executable_name>/data/",  // up 1 in tree,
+      "../../../2_Graphics/<executable_name>/", // up 3 in tree,
                                                 // "/2_Graphics/<executable_name>/"
                                                 // subdir
-      "../3_Imaging/<executable_name>/data/",  // up 1 in tree,
+      "../../../3_Imaging/<executable_name>/", // up 3 in tree,
                                                // "/3_Imaging/<executable_name>/"
                                                // subdir
-      "../4_Finance/<executable_name>/data/",  // up 1 in tree,
+      "../../../4_Finance/<executable_name>/", // up 3 in tree,
                                                // "/4_Finance/<executable_name>/"
                                                // subdir
-      "../5_Simulations/<executable_name>/data/",  // up 1 in tree,
+      "../../../5_Simulations/<executable_name>/", // up 3 in tree,
                                                    // "/5_Simulations/<executable_name>/"
                                                    // subdir
-      "../6_Advanced/<executable_name>/data/",  // up 1 in tree,
+      "../../../6_Advanced/<executable_name>/", // up 3 in tree,
                                                 // "/6_Advanced/<executable_name>/"
                                                 // subdir
-      "../7_CUDALibraries/<executable_name>/data/",  // up 1 in tree,
+      "../../../7_CUDALibraries/<executable_name>/", // up 3 in tree,
                                                      // "/7_CUDALibraries/<executable_name>/"
                                                      // subdir
-      "../8_Android/<executable_name>/data/",  // up 1 in tree,
+      "../../../8_Android/<executable_name>/", // up 3 in tree,
                                                // "/8_Android/<executable_name>/"
                                                // subdir
-      "../samples/<executable_name>/data/",  // up 1 in tree,
-                                             // "/samples/<executable_name>/"
-                                             // subdir
-      "../../",                              // up 2 in tree
-      "../../common/",                       // up 2 in tree, "/common/" subdir
-      "../../common/data/",  // up 2 in tree, "/common/data/" subdir
-      "../../data/",         // up 2 in tree, "/data/" subdir
-      "../../src/",          // up 2 in tree, "/src/" subdir
-      "../../inc/",          // up 2 in tree, "/inc/" subdir
-      "../../sandbox/<executable_name>/data/",  // up 2 in tree,
+      "../../../samples/<executable_name>/data/", // up 3 in tree,
+                                                  // "/samples/<executable_name>/"
+                                                  // subdir
+      "../../../common/",      // up 3 in tree, "../../../common/" subdir
+      "../../../common/data/", // up 3 in tree, "../../../common/data/" subdir
+      "../../../data/",        // up 3 in tree, "../../../data/" subdir
+      "../../../../",          // up 4 in tree
+      "../../../../src/<executable_name>/", // up 4 in tree,
+                                            // "/src/<executable_name>/" subdir
+      "../../../../src/<executable_name>/data/", // up 4 in tree,
+                                                 // "/src/<executable_name>/data/"
+                                                 // subdir
+      "../../../../src/<executable_name>/src/", // up 4 in tree,
+                                                // "/src/<executable_name>/src/"
+                                                // subdir
+      "../../../../src/<executable_name>/inc/", // up 4 in tree,
+                                                // "/src/<executable_name>/inc/"
+                                                // subdir
+      "../../../../sandbox/<executable_name>/", // up 4 in tree,
                                                 // "/sandbox/<executable_name>/"
                                                 // subdir
-      "../../0_Simple/<executable_name>/data/",  // up 2 in tree,
+      "../../../../sandbox/<executable_name>/data/", // up 4 in tree,
+                                                     // "/sandbox/<executable_name>/data/"
+                                                     // subdir
+      "../../../../sandbox/<executable_name>/src/", // up 4 in tree,
+                                                    // "/sandbox/<executable_name>/src/"
+                                                    // subdir
+      "../../../../sandbox/<executable_name>/inc/", // up 4 in tree,
+                                                    // "/sandbox/<executable_name>/inc/"
+                                                    // subdir
+      "../../../../0_Simple/<executable_name>/data/", // up 4 in tree,
+                                                      // "/0_Simple/<executable_name>/"
+                                                      // subdir
+      "../../../../1_Utilities/<executable_name>/data/", // up 4 in tree,
+                                                         // "/1_Utilities/<executable_name>/"
+                                                         // subdir
+      "../../../../2_Graphics/<executable_name>/data/", // up 4 in tree,
+                                                        // "/2_Graphics/<executable_name>/"
+                                                        // subdir
+      "../../../../3_Imaging/<executable_name>/data/", // up 4 in tree,
+                                                       // "/3_Imaging/<executable_name>/"
+                                                       // subdir
+      "../../../../4_Finance/<executable_name>/data/", // up 4 in tree,
+                                                       // "/4_Finance/<executable_name>/"
+                                                       // subdir
+      "../../../../5_Simulations/<executable_name>/data/", // up 4 in tree,
+                                                           // "/5_Simulations/<executable_name>/"
+                                                           // subdir
+      "../../../../6_Advanced/<executable_name>/data/", // up 4 in tree,
+                                                        // "/6_Advanced/<executable_name>/"
+                                                        // subdir
+      "../../../../7_CUDALibraries/<executable_name>/data/", // up 4 in tree,
+                                                             // "/7_CUDALibraries/<executable_name>/"
+                                                             // subdir
+      "../../../../8_Android/<executable_name>/data/", // up 4 in tree,
+                                                       // "/8_Android/<executable_name>/"
+                                                       // subdir
+      "../../../../0_Simple/<executable_name>/", // up 4 in tree,
                                                  // "/0_Simple/<executable_name>/"
                                                  // subdir
-      "../../1_Utilities/<executable_name>/data/",  // up 2 in tree,
+      "../../../../1_Utilities/<executable_name>/", // up 4 in tree,
                                                     // "/1_Utilities/<executable_name>/"
                                                     // subdir
-      "../../2_Graphics/<executable_name>/data/",  // up 2 in tree,
+      "../../../../2_Graphics/<executable_name>/", // up 4 in tree,
                                                    // "/2_Graphics/<executable_name>/"
                                                    // subdir
-      "../../3_Imaging/<executable_name>/data/",  // up 2 in tree,
+      "../../../../3_Imaging/<executable_name>/", // up 4 in tree,
                                                   // "/3_Imaging/<executable_name>/"
                                                   // subdir
-      "../../4_Finance/<executable_name>/data/",  // up 2 in tree,
+      "../../../../4_Finance/<executable_name>/", // up 4 in tree,
                                                   // "/4_Finance/<executable_name>/"
                                                   // subdir
-      "../../5_Simulations/<executable_name>/data/",  // up 2 in tree,
+      "../../../../5_Simulations/<executable_name>/", // up 4 in tree,
                                                       // "/5_Simulations/<executable_name>/"
                                                       // subdir
-      "../../6_Advanced/<executable_name>/data/",  // up 2 in tree,
+      "../../../../6_Advanced/<executable_name>/", // up 4 in tree,
                                                    // "/6_Advanced/<executable_name>/"
                                                    // subdir
-      "../../7_CUDALibraries/<executable_name>/data/",  // up 2 in tree,
+      "../../../../7_CUDALibraries/<executable_name>/", // up 4 in tree,
                                                         // "/7_CUDALibraries/<executable_name>/"
                                                         // subdir
-      "../../8_Android/<executable_name>/data/",  // up 2 in tree,
+      "../../../../8_Android/<executable_name>/", // up 4 in tree,
                                                   // "/8_Android/<executable_name>/"
                                                   // subdir
-      "../../samples/<executable_name>/data/",  // up 2 in tree,
-                                                // "/samples/<executable_name>/"
-                                                // subdir
-      "../../../",                              // up 3 in tree
-      "../../../src/<executable_name>/",        // up 3 in tree,
-                                          // "/src/<executable_name>/" subdir
-      "../../../src/<executable_name>/data/",  // up 3 in tree,
-                                               // "/src/<executable_name>/data/"
-                                               // subdir
-      "../../../src/<executable_name>/src/",   // up 3 in tree,
-                                               // "/src/<executable_name>/src/"
-                                               // subdir
-      "../../../src/<executable_name>/inc/",   // up 3 in tree,
-                                               // "/src/<executable_name>/inc/"
-                                               // subdir
-      "../../../sandbox/<executable_name>/",   // up 3 in tree,
-                                               // "/sandbox/<executable_name>/"
-                                               // subdir
-      "../../../sandbox/<executable_name>/data/",  // up 3 in tree,
-                                                   // "/sandbox/<executable_name>/data/"
-                                                   // subdir
-      "../../../sandbox/<executable_name>/src/",  // up 3 in tree,
-                                                  // "/sandbox/<executable_name>/src/"
-                                                  // subdir
-      "../../../sandbox/<executable_name>/inc/",  // up 3 in tree,
-                                                  // "/sandbox/<executable_name>/inc/"
-                                                  // subdir
-      "../../../0_Simple/<executable_name>/data/",  // up 3 in tree,
-                                                    // "/0_Simple/<executable_name>/"
-                                                    // subdir
-      "../../../1_Utilities/<executable_name>/data/",  // up 3 in tree,
-                                                       // "/1_Utilities/<executable_name>/"
-                                                       // subdir
-      "../../../2_Graphics/<executable_name>/data/",  // up 3 in tree,
-                                                      // "/2_Graphics/<executable_name>/"
-                                                      // subdir
-      "../../../3_Imaging/<executable_name>/data/",  // up 3 in tree,
-                                                     // "/3_Imaging/<executable_name>/"
+      "../../../../samples/<executable_name>/data/", // up 4 in tree,
+                                                     // "/samples/<executable_name>/"
                                                      // subdir
-      "../../../4_Finance/<executable_name>/data/",  // up 3 in tree,
-                                                     // "/4_Finance/<executable_name>/"
-                                                     // subdir
-      "../../../5_Simulations/<executable_name>/data/",  // up 3 in tree,
-                                                         // "/5_Simulations/<executable_name>/"
-                                                         // subdir
-      "../../../6_Advanced/<executable_name>/data/",  // up 3 in tree,
-                                                      // "/6_Advanced/<executable_name>/"
-                                                      // subdir
-      "../../../7_CUDALibraries/<executable_name>/data/",  // up 3 in tree,
-                                                           // "/7_CUDALibraries/<executable_name>/"
-                                                           // subdir
-      "../../../8_Android/<executable_name>/data/",  // up 3 in tree,
-                                                     // "/8_Android/<executable_name>/"
-                                                     // subdir
-      "../../../0_Simple/<executable_name>/",  // up 3 in tree,
-                                               // "/0_Simple/<executable_name>/"
-                                               // subdir
-      "../../../1_Utilities/<executable_name>/",  // up 3 in tree,
-                                                  // "/1_Utilities/<executable_name>/"
-                                                  // subdir
-      "../../../2_Graphics/<executable_name>/",  // up 3 in tree,
-                                                 // "/2_Graphics/<executable_name>/"
-                                                 // subdir
-      "../../../3_Imaging/<executable_name>/",  // up 3 in tree,
-                                                // "/3_Imaging/<executable_name>/"
-                                                // subdir
-      "../../../4_Finance/<executable_name>/",  // up 3 in tree,
-                                                // "/4_Finance/<executable_name>/"
-                                                // subdir
-      "../../../5_Simulations/<executable_name>/",  // up 3 in tree,
-                                                    // "/5_Simulations/<executable_name>/"
+      "../../../../common/",      // up 4 in tree, "../../../../common/" subdir
+      "../../../../common/data/", // up 4 in tree, "../../../../common/data/"
+                                  // subdir
+      "../../../../data/",        // up 4 in tree, "../../../../data/" subdir
+      "../../../../../",          // up 5 in tree
+      "../../../../../src/<executable_name>/",      // up 5 in tree,
+                                                    // "/src/<executable_name>/"
                                                     // subdir
-      "../../../6_Advanced/<executable_name>/",  // up 3 in tree,
-                                                 // "/6_Advanced/<executable_name>/"
-                                                 // subdir
-      "../../../7_CUDALibraries/<executable_name>/",  // up 3 in tree,
-                                                      // "/7_CUDALibraries/<executable_name>/"
-                                                      // subdir
-      "../../../8_Android/<executable_name>/",  // up 3 in tree,
-                                                // "/8_Android/<executable_name>/"
-                                                // subdir
-      "../../../samples/<executable_name>/data/",  // up 3 in tree,
-                                                   // "/samples/<executable_name>/"
-                                                   // subdir
-      "../../../common/",       // up 3 in tree, "../../../common/" subdir
-      "../../../common/data/",  // up 3 in tree, "../../../common/data/" subdir
-      "../../../data/",         // up 3 in tree, "../../../data/" subdir
-      "../../../../",           // up 4 in tree
-      "../../../../src/<executable_name>/",  // up 4 in tree,
-                                             // "/src/<executable_name>/" subdir
-      "../../../../src/<executable_name>/data/",  // up 4 in tree,
-                                                  // "/src/<executable_name>/data/"
-                                                  // subdir
-      "../../../../src/<executable_name>/src/",  // up 4 in tree,
-                                                 // "/src/<executable_name>/src/"
-                                                 // subdir
-      "../../../../src/<executable_name>/inc/",  // up 4 in tree,
-                                                 // "/src/<executable_name>/inc/"
-                                                 // subdir
-      "../../../../sandbox/<executable_name>/",  // up 4 in tree,
-                                                 // "/sandbox/<executable_name>/"
-                                                 // subdir
-      "../../../../sandbox/<executable_name>/data/",  // up 4 in tree,
-                                                      // "/sandbox/<executable_name>/data/"
-                                                      // subdir
-      "../../../../sandbox/<executable_name>/src/",  // up 4 in tree,
-                                                     // "/sandbox/<executable_name>/src/"
-                                                     // subdir
-      "../../../../sandbox/<executable_name>/inc/",  // up 4 in tree,
-                                                     // "/sandbox/<executable_name>/inc/"
-                                                     // subdir
-      "../../../../0_Simple/<executable_name>/data/",  // up 4 in tree,
-                                                       // "/0_Simple/<executable_name>/"
-                                                       // subdir
-      "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree,
-                                                          // "/1_Utilities/<executable_name>/"
-                                                          // subdir
-      "../../../../2_Graphics/<executable_name>/data/",  // up 4 in tree,
-                                                         // "/2_Graphics/<executable_name>/"
-                                                         // subdir
-      "../../../../3_Imaging/<executable_name>/data/",  // up 4 in tree,
-                                                        // "/3_Imaging/<executable_name>/"
-                                                        // subdir
-      "../../../../4_Finance/<executable_name>/data/",  // up 4 in tree,
-                                                        // "/4_Finance/<executable_name>/"
-                                                        // subdir
-      "../../../../5_Simulations/<executable_name>/data/",  // up 4 in tree,
-                                                            // "/5_Simulations/<executable_name>/"
-                                                            // subdir
-      "../../../../6_Advanced/<executable_name>/data/",  // up 4 in tree,
-                                                         // "/6_Advanced/<executable_name>/"
-                                                         // subdir
-      "../../../../7_CUDALibraries/<executable_name>/data/",  // up 4 in tree,
-                                                              // "/7_CUDALibraries/<executable_name>/"
-                                                              // subdir
-      "../../../../8_Android/<executable_name>/data/",  // up 4 in tree,
-                                                        // "/8_Android/<executable_name>/"
-                                                        // subdir
-      "../../../../0_Simple/<executable_name>/",  // up 4 in tree,
-                                                  // "/0_Simple/<executable_name>/"
-                                                  // subdir
-      "../../../../1_Utilities/<executable_name>/",  // up 4 in tree,
-                                                     // "/1_Utilities/<executable_name>/"
-                                                     // subdir
-      "../../../../2_Graphics/<executable_name>/",  // up 4 in tree,
-                                                    // "/2_Graphics/<executable_name>/"
+      "../../../../../src/<executable_name>/data/", // up 5 in tree,
+                                                    // "/src/<executable_name>/data/"
                                                     // subdir
-      "../../../../3_Imaging/<executable_name>/",  // up 4 in tree,
-                                                   // "/3_Imaging/<executable_name>/"
+      "../../../../../src/<executable_name>/src/", // up 5 in tree,
+                                                   // "/src/<executable_name>/src/"
                                                    // subdir
-      "../../../../4_Finance/<executable_name>/",  // up 4 in tree,
-                                                   // "/4_Finance/<executable_name>/"
+      "../../../../../src/<executable_name>/inc/", // up 5 in tree,
+                                                   // "/src/<executable_name>/inc/"
                                                    // subdir
-      "../../../../5_Simulations/<executable_name>/",  // up 4 in tree,
-                                                       // "/5_Simulations/<executable_name>/"
-                                                       // subdir
-      "../../../../6_Advanced/<executable_name>/",  // up 4 in tree,
-                                                    // "/6_Advanced/<executable_name>/"
-                                                    // subdir
-      "../../../../7_CUDALibraries/<executable_name>/",  // up 4 in tree,
-                                                         // "/7_CUDALibraries/<executable_name>/"
-                                                         // subdir
-      "../../../../8_Android/<executable_name>/",  // up 4 in tree,
-                                                   // "/8_Android/<executable_name>/"
+      "../../../../../sandbox/<executable_name>/", // up 5 in tree,
+                                                   // "/sandbox/<executable_name>/"
                                                    // subdir
-      "../../../../samples/<executable_name>/data/",  // up 4 in tree,
-                                                      // "/samples/<executable_name>/"
-                                                      // subdir
-      "../../../../common/",       // up 4 in tree, "../../../common/" subdir
-      "../../../../common/data/",  // up 4 in tree, "../../../common/data/"
-                                   // subdir
-      "../../../../data/",         // up 4 in tree, "../../../data/" subdir
-      "../../../../../",           // up 5 in tree
-      "../../../../../src/<executable_name>/",  // up 5 in tree,
-                                                // "/src/<executable_name>/"
-                                                // subdir
-      "../../../../../src/<executable_name>/data/",  // up 5 in tree,
-                                                     // "/src/<executable_name>/data/"
-                                                     // subdir
-      "../../../../../src/<executable_name>/src/",  // up 5 in tree,
-                                                    // "/src/<executable_name>/src/"
-                                                    // subdir
-      "../../../../../src/<executable_name>/inc/",  // up 5 in tree,
-                                                    // "/src/<executable_name>/inc/"
-                                                    // subdir
-      "../../../../../sandbox/<executable_name>/",  // up 5 in tree,
-                                                    // "/sandbox/<executable_name>/"
-                                                    // subdir
-      "../../../../../sandbox/<executable_name>/data/",  // up 5 in tree,
-                                                         // "/sandbox/<executable_name>/data/"
-                                                         // subdir
-      "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree,
-                                                        // "/sandbox/<executable_name>/src/"
+      "../../../../../sandbox/<executable_name>/data/", // up 5 in tree,
+                                                        // "/sandbox/<executable_name>/data/"
                                                         // subdir
-      "../../../../../sandbox/<executable_name>/inc/",  // up 5 in tree,
-                                                        // "/sandbox/<executable_name>/inc/"
-                                                        // subdir
-      "../../../../../0_Simple/<executable_name>/data/",  // up 5 in tree,
-                                                          // "/0_Simple/<executable_name>/"
-                                                          // subdir
-      "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree,
-                                                             // "/1_Utilities/<executable_name>/"
-                                                             // subdir
-      "../../../../../2_Graphics/<executable_name>/data/",  // up 5 in tree,
-                                                            // "/2_Graphics/<executable_name>/"
+      "../../../../../sandbox/<executable_name>/src/", // up 5 in tree,
+                                                       // "/sandbox/<executable_name>/src/"
+                                                       // subdir
+      "../../../../../sandbox/<executable_name>/inc/", // up 5 in tree,
+                                                       // "/sandbox/<executable_name>/inc/"
+                                                       // subdir
+      "../../../../../0_Simple/<executable_name>/data/", // up 5 in tree,
+                                                         // "/0_Simple/<executable_name>/"
+                                                         // subdir
+      "../../../../../1_Utilities/<executable_name>/data/", // up 5 in tree,
+                                                            // "/1_Utilities/<executable_name>/"
                                                             // subdir
-      "../../../../../3_Imaging/<executable_name>/data/",  // up 5 in tree,
-                                                           // "/3_Imaging/<executable_name>/"
+      "../../../../../2_Graphics/<executable_name>/data/", // up 5 in tree,
+                                                           // "/2_Graphics/<executable_name>/"
                                                            // subdir
-      "../../../../../4_Finance/<executable_name>/data/",  // up 5 in tree,
-                                                           // "/4_Finance/<executable_name>/"
-                                                           // subdir
-      "../../../../../5_Simulations/<executable_name>/data/",  // up 5 in tree,
-                                                               // "/5_Simulations/<executable_name>/"
-                                                               // subdir
-      "../../../../../6_Advanced/<executable_name>/data/",  // up 5 in tree,
-                                                            // "/6_Advanced/<executable_name>/"
-                                                            // subdir
-      "../../../../../7_CUDALibraries/<executable_name>/data/",  // up 5 in
-                                                                 // tree,
-                                                                 // "/7_CUDALibraries/<executable_name>/"
-                                                                 // subdir
-      "../../../../../8_Android/<executable_name>/data/",  // up 5 in tree,
-                                                           // "/8_Android/<executable_name>/"
+      "../../../../../3_Imaging/<executable_name>/data/", // up 5 in tree,
+                                                          // "/3_Imaging/<executable_name>/"
+                                                          // subdir
+      "../../../../../4_Finance/<executable_name>/data/", // up 5 in tree,
+                                                          // "/4_Finance/<executable_name>/"
+                                                          // subdir
+      "../../../../../5_Simulations/<executable_name>/data/", // up 5 in tree,
+                                                              // "/5_Simulations/<executable_name>/"
+                                                              // subdir
+      "../../../../../6_Advanced/<executable_name>/data/", // up 5 in tree,
+                                                           // "/6_Advanced/<executable_name>/"
                                                            // subdir
-      "../../../../../samples/<executable_name>/data/",  // up 5 in tree,
-                                                         // "/samples/<executable_name>/"
-                                                         // subdir
-      "../../../../../common/",       // up 5 in tree, "../../../common/" subdir
-      "../../../../../common/data/",  // up 5 in tree, "../../../common/data/"
-                                      // subdir
+      "../../../../../7_CUDALibraries/<executable_name>/data/", // up 5 in
+                                                                // tree,
+                                                                // "/7_CUDALibraries/<executable_name>/"
+                                                                // subdir
+      "../../../../../8_Android/<executable_name>/data/", // up 5 in tree,
+                                                          // "/8_Android/<executable_name>/"
+                                                          // subdir
+      "../../../../../samples/<executable_name>/data/", // up 5 in tree,
+                                                        // "/samples/<executable_name>/"
+                                                        // subdir
+      "../../../../../common/",      // up 5 in tree, "../../../common/" subdir
+      "../../../../../common/data/", // up 5 in tree, "../../../common/data/"
+                                     // subdir
   };
 
   // Extract the executable name
@@ -637,7 +639,7 @@ inline char *sdkFindFilePath(const char *filename,
   }
 
   // Loop over all search paths and return the first hit
-  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char*); ++i) {
     std::string path(searchPath[i]);
     size_t executable_name_pos = path.find("<executable_name>");
 
@@ -659,14 +661,14 @@ inline char *sdkFindFilePath(const char *filename,
 
     // Test if the file exists
     path.append(filename);
-    FILE *fp;
+    FILE* fp;
     FOPEN(fp, path.c_str(), "rb");
 
     if (fp != NULL) {
       fclose(fp);
       // File found
       // returning an allocated array here for backwards compatibility reasons
-      char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
+      char* file_path = reinterpret_cast<char*>(malloc(path.length() + 1));
       STRCPY(file_path, path.length() + 1, path.c_str());
       return file_path;
     }
@@ -680,4 +682,4 @@ inline char *sdkFindFilePath(const char *filename,
   return 0;
 }
 
-#endif  // COMMON_HELPER_STRING_H_
+#endif // COMMON_HELPER_STRING_H_
diff --git a/src/qfvm_gpu/cuda_utils/ticktock.h b/src/qfvm_gpu/cuda_utils/ticktock.h
index 1adb8a9..e5d6950 100644
--- a/src/qfvm_gpu/cuda_utils/ticktock.h
+++ b/src/qfvm_gpu/cuda_utils/ticktock.h
@@ -1,9 +1,13 @@
 #pragma once
 
-//#include <chrono>
-//#define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
-//#define TOCK(x) std::cout << #x ": " << std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now() - bench_##x).count() << "s" << std::endl;
+// #include <chrono>
+// #define TICK(x) auto bench_##x = std::chrono::steady_clock::now();
+// #define TOCK(x) std::cout << #x ": " <<
+// std::chrono::duration_cast<std::chrono::duration<double>>(std::chrono::steady_clock::now()
+// - bench_##x).count() << "s" << std::endl;
 
 #include <tbb/tick_count.h>
 #define TICK(x) auto bench_##x = tbb::tick_count::now();
-#define TOCK(x) std::cout << #x ": " << (tbb::tick_count::now() - bench_##x).seconds() << "s" << std::endl;
+#define TOCK(x)                                                                \
+  std::cout << #x ": " << (tbb::tick_count::now() - bench_##x).seconds()       \
+            << "s" << std::endl;
diff --git a/src/qfvm_gpu/custate_simu.cuh b/src/qfvm_gpu/custate_simu.cuh
index 6456c7e..9a79054 100644
--- a/src/qfvm_gpu/custate_simu.cuh
+++ b/src/qfvm_gpu/custate_simu.cuh
@@ -1,35 +1,36 @@
 #pragma once
+#include "apply_gate_custate.cuh"
 #include "cuda_statevector.cuh"
 #include <circuit.hpp>
-#include <types.hpp>
 #include <statevector.hpp>
-#include "apply_gate_custate.cuh"
+#include <types.hpp>
 
-void simulate_custate(Circuit & circuit, CudaStateVector & psi_d){
-    size_t size = psi_d.size();
-    int n = psi_d.num();
-    for (auto gate : circuit.gates()){
-        apply_gate_custate(psi_d.data(), gate, n);
-    }
+void simulate_custate(Circuit& circuit, CudaStateVector& psi_d) {
+  size_t size = psi_d.size();
+  int n = psi_d.num();
+  for (auto gate : circuit.gates()) {
+    apply_gate_custate(psi_d.data(), gate, n);
+  }
 }
 
-void simulate_custate(Circuit & circuit, StateVector<data_t> & state){
-    //initialize psi
-    state.set_num(circuit.qubit_num());
-    size_t size = state.size();
-    CudaStateVector psi_d(state);
-    
-    simulate_custate(circuit, psi_d);
-    cudaDeviceSynchronize();
+void simulate_custate(Circuit& circuit, StateVector<data_t>& state) {
+  // initialize psi
+  state.set_num(circuit.qubit_num());
+  size_t size = state.size();
+  CudaStateVector psi_d(state);
 
-    //copy back
-    complex<double>* psi = reinterpret_cast<complex<double>*>(psi_d.data());
-    checkCudaErrors(cudaMemcpy(state.data(), psi, size*sizeof(complex<double>), cudaMemcpyDeviceToHost));
-    psi=nullptr;
-}   
+  simulate_custate(circuit, psi_d);
+  cudaDeviceSynchronize();
 
-StateVector<double> simulate_custate(Circuit & circuit){
-    StateVector<double> state(circuit.qubit_num());
-    simulate_custate(circuit, state);
-    return std::move(state);
-}
\ No newline at end of file
+  // copy back
+  complex<double>* psi = reinterpret_cast<complex<double>*>(psi_d.data());
+  checkCudaErrors(cudaMemcpy(state.data(), psi, size * sizeof(complex<double>),
+                             cudaMemcpyDeviceToHost));
+  psi = nullptr;
+}
+
+StateVector<double> simulate_custate(Circuit& circuit) {
+  StateVector<double> state(circuit.qubit_num());
+  simulate_custate(circuit, state);
+  return std::move(state);
+}
diff --git a/src/quafu/visualisation/styles.json b/src/quafu/visualisation/styles.json
index af8e8c8..8e9af8f 100644
--- a/src/quafu/visualisation/styles.json
+++ b/src/quafu/visualisation/styles.json
@@ -1,292 +1,292 @@
 {
-    "globalStyles": {
-        "h_wire": {
-            "zorder": 0,
-            "color": "#FF0000",
-            "alpha": 0.8,
-            "lw": 2
-        },
-        "v_wire": {
-            "zorder": 0,
-            "color": "#3B82F6",
-            "alpha": 0.8,
-            "lw": 4
-        },
-        "text": {
-            "zorder": 1,
-            "color": "#0C161F",
-            "alpha": 1.0,
-            "size": 18,
-            "weight": "normal",
-            "family": "monospace"
-        },
-        "stroke": {
-            "lw": 2
-        },
-        "box": {
-            "ec": "#0C161F"
-        },
-        "axes": {
-            "phys_scale": 0.7874015748031495,
-            "unit": 0.5,
-            "title_height": 0.5,
-            "x_margin_width": 0.6
-        }
-    },
-    "elementStyles": {
-        "barrier": {
-            "fc": "lightgray",
-            "hatch": "///",
-            "zorder": 4,
-            "proc_func": "barrier"
-        },
-        "ccx": {
-            "_name": "x",
-            "proc_func": "csu2"
-        },
-        "cp": {
-            "_name": "p",
-            "proc_func": "csu2"
-        },
-        "cs": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "",
-            "proc_func": "csu2"
-        },
-        "cswap": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "",
-            "proc_func": "cu"
-        },
-        "ct": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "",
-            "proc_func": "csu2"
-        },
-        "cu": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "",
-            "proc_func": "cu"
-        },
-        "cx": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "",
-            "_name": "x",
-            "proc_func": "csu2"
-        },
-        "cy": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "",
-            "proc_func": "csu2"
-        },
-        "cz": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "",
-            "proc_func": "csu2"
-        },
-        "delay": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "Delay",
-            "proc_func": "pulse"
-        },
-        "flattop": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "Flattop",
-            "proc_func": "pulse"
-        },
-        "gaussian": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "Gaussian",
-            "proc_func": "pulse"
-        },
-        "h": {
-            "fc": "#EE7057",
-            "ec": "",
-            "label": "H",
-            "proc_func": "su2"
-        },
-        "id": {
-            "fc": "#EE7057",
-            "ec": "",
-            "label": "I",
-            "proc_func": "su2"
-        },
-        "iswap": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "",
-            "proc_func": ""
-        },
-        "mcx": {
-            "_name": "x",
-            "proc_func": ""
-        },
-        "mcy": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "",
-            "proc_func": ""
-        },
-        "mcz": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "",
-            "proc_func": ""
-        },
-        "measure": {
-            "fc": "#FFB240",
-            "ec": "",
-            "label": "Measure",
-            "proc_func": ""
-        },
-        "p": {
-            "fc": "#EE7057",
-            "ec": "",
-            "label": "P",
-            "proc_func": ""
-        },
-        "rect": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "Rect",
-            "proc_func": ""
-        },
-        "rx": {
-            "fc": "#6366F1",
-            "ec": "",
-            "label": "RX",
-            "proc_func": ""
-        },
-        "rxx": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "RXX",
-            "_name": "rx",
-            "proc_func": "msu2"
-        },
-        "ry": {
-            "fc": "#6366F1",
-            "ec": "",
-            "label": "RY",
-            "proc_func": ""
-        },
-        "ryy": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "RYY",
-            "proc_func": ""
-        },
-        "rz": {
-            "fc": "#6366F1",
-            "ec": "",
-            "label": "RZ",
-            "proc_func": ""
-        },
-        "rzz": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "RZZ",
-            "proc_func": ""
-        },
-        "s": {
-            "fc": "#EE7057",
-            "ec": "",
-            "label": "S",
-            "proc_func": ""
-        },
-        "sdg": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "$S^\\dagger$",
-            "proc_func": ""
-        },
-        "sw": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "$\\sqrt{W}$",
-            "proc_func": ""
-        },
-        "swap": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "Swap",
-            "proc_func": ""
-        },
-        "sx": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "$\\sqrt{X}$",
-            "proc_func": ""
-        },
-        "sxdg": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "$\\sqrt{X}^\\dagger$",
-            "proc_func": ""
-        },
-        "sy": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "$\\sqrt{Y}$",
-            "proc_func": ""
-        },
-        "sydg": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "$\\sqrt{Y}^\\dagger$",
-            "proc_func": "su2"
-        },
-        "t": {
-            "fc": "#EE7057",
-            "ec": "",
-            "label": "T",
-            "proc_func": "su2"
-        },
-        "tdg": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "$T^\\dagger$",
-            "proc_func": ""
-        },
-        "w": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "W",
-            "proc_func": ""
-        },
-        "x": {
-            "fc": "#EE7057",
-            "ec": "",
-            "label": "X",
-            "proc_func": ""
-        },
-        "xy": {
-            "fc": "#8C9197",
-            "ec": "",
-            "label": "XY",
-            "proc_func": ""
-        },
-        "y": {
-            "fc": "#EE7057",
-            "ec": "",
-            "label": "Y",
-            "proc_func": ""
-        },
-        "z": {
-            "fc": "#EE7057",
-            "ec": "",
-            "label": "Z",
-            "proc_func": ""
-        }
+  "globalStyles": {
+    "h_wire": {
+      "zorder": 0,
+      "color": "#FF0000",
+      "alpha": 0.8,
+      "lw": 2
+    },
+    "v_wire": {
+      "zorder": 0,
+      "color": "#3B82F6",
+      "alpha": 0.8,
+      "lw": 4
+    },
+    "text": {
+      "zorder": 1,
+      "color": "#0C161F",
+      "alpha": 1.0,
+      "size": 18,
+      "weight": "normal",
+      "family": "monospace"
+    },
+    "stroke": {
+      "lw": 2
+    },
+    "box": {
+      "ec": "#0C161F"
+    },
+    "axes": {
+      "phys_scale": 0.7874015748031495,
+      "unit": 0.5,
+      "title_height": 0.5,
+      "x_margin_width": 0.6
+    }
+  },
+  "elementStyles": {
+    "barrier": {
+      "fc": "lightgray",
+      "hatch": "///",
+      "zorder": 4,
+      "proc_func": "barrier"
+    },
+    "ccx": {
+      "_name": "x",
+      "proc_func": "csu2"
+    },
+    "cp": {
+      "_name": "p",
+      "proc_func": "csu2"
+    },
+    "cs": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "",
+      "proc_func": "csu2"
+    },
+    "cswap": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "",
+      "proc_func": "cu"
+    },
+    "ct": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "",
+      "proc_func": "csu2"
+    },
+    "cu": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "",
+      "proc_func": "cu"
+    },
+    "cx": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "",
+      "_name": "x",
+      "proc_func": "csu2"
+    },
+    "cy": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "",
+      "proc_func": "csu2"
+    },
+    "cz": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "",
+      "proc_func": "csu2"
+    },
+    "delay": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "Delay",
+      "proc_func": "pulse"
+    },
+    "flattop": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "Flattop",
+      "proc_func": "pulse"
+    },
+    "gaussian": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "Gaussian",
+      "proc_func": "pulse"
+    },
+    "h": {
+      "fc": "#EE7057",
+      "ec": "",
+      "label": "H",
+      "proc_func": "su2"
+    },
+    "id": {
+      "fc": "#EE7057",
+      "ec": "",
+      "label": "I",
+      "proc_func": "su2"
+    },
+    "iswap": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "",
+      "proc_func": ""
+    },
+    "mcx": {
+      "_name": "x",
+      "proc_func": ""
+    },
+    "mcy": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "",
+      "proc_func": ""
+    },
+    "mcz": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "",
+      "proc_func": ""
+    },
+    "measure": {
+      "fc": "#FFB240",
+      "ec": "",
+      "label": "Measure",
+      "proc_func": ""
+    },
+    "p": {
+      "fc": "#EE7057",
+      "ec": "",
+      "label": "P",
+      "proc_func": ""
+    },
+    "rect": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "Rect",
+      "proc_func": ""
+    },
+    "rx": {
+      "fc": "#6366F1",
+      "ec": "",
+      "label": "RX",
+      "proc_func": ""
+    },
+    "rxx": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "RXX",
+      "_name": "rx",
+      "proc_func": "msu2"
+    },
+    "ry": {
+      "fc": "#6366F1",
+      "ec": "",
+      "label": "RY",
+      "proc_func": ""
+    },
+    "ryy": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "RYY",
+      "proc_func": ""
+    },
+    "rz": {
+      "fc": "#6366F1",
+      "ec": "",
+      "label": "RZ",
+      "proc_func": ""
+    },
+    "rzz": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "RZZ",
+      "proc_func": ""
+    },
+    "s": {
+      "fc": "#EE7057",
+      "ec": "",
+      "label": "S",
+      "proc_func": ""
+    },
+    "sdg": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "$S^\\dagger$",
+      "proc_func": ""
+    },
+    "sw": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "$\\sqrt{W}$",
+      "proc_func": ""
+    },
+    "swap": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "Swap",
+      "proc_func": ""
+    },
+    "sx": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "$\\sqrt{X}$",
+      "proc_func": ""
+    },
+    "sxdg": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "$\\sqrt{X}^\\dagger$",
+      "proc_func": ""
+    },
+    "sy": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "$\\sqrt{Y}$",
+      "proc_func": ""
+    },
+    "sydg": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "$\\sqrt{Y}^\\dagger$",
+      "proc_func": "su2"
+    },
+    "t": {
+      "fc": "#EE7057",
+      "ec": "",
+      "label": "T",
+      "proc_func": "su2"
+    },
+    "tdg": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "$T^\\dagger$",
+      "proc_func": ""
+    },
+    "w": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "W",
+      "proc_func": ""
+    },
+    "x": {
+      "fc": "#EE7057",
+      "ec": "",
+      "label": "X",
+      "proc_func": ""
+    },
+    "xy": {
+      "fc": "#8C9197",
+      "ec": "",
+      "label": "XY",
+      "proc_func": ""
+    },
+    "y": {
+      "fc": "#EE7057",
+      "ec": "",
+      "label": "Y",
+      "proc_func": ""
+    },
+    "z": {
+      "fc": "#EE7057",
+      "ec": "",
+      "label": "Z",
+      "proc_func": ""
     }
+  }
 }
diff --git a/tests/quafu/algorithms/amplitude_test.py b/tests/quafu/algorithms/amplitude_test.py
index fd8fb25..e194e53 100644
--- a/tests/quafu/algorithms/amplitude_test.py
+++ b/tests/quafu/algorithms/amplitude_test.py
@@ -11,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from quafu.circuits import QuantumCircuit
+import numpy as np
 import quafu.elements.element_gates as qeg
 from quafu.algorithms import AmplitudeEmbedding
-import numpy as np
+from quafu.circuits import QuantumCircuit
+
 
 class TestAmplitudeEmbedding:
     """Example of amplitude embedding"""
@@ -22,13 +23,19 @@ class TestAmplitudeEmbedding:
     def test_build(self):
         num_qubits = 2
         qc = QuantumCircuit(num_qubits)
-        state = np.array([6,-12.5,11.15,7])
-        qc.add_gates(AmplitudeEmbedding(state=state, num_qubits=num_qubits, normalize=True))
+        state = np.array([6, -12.5, 11.15, 7])
+        qc.add_gates(
+            AmplitudeEmbedding(state=state, num_qubits=num_qubits, normalize=True)
+        )
         qc.draw_circuit(width=num_qubits)
 
     def test_build_pad(self):
         num_qubits = 2
         qc = QuantumCircuit(num_qubits)
-        state = np.array([6,-12.5,11.15])
-        qc.add_gates(AmplitudeEmbedding(state=state, num_qubits=num_qubits, pad_with=7, normalize=True))
-        qc.draw_circuit(width=num_qubits)
\ No newline at end of file
+        state = np.array([6, -12.5, 11.15])
+        qc.add_gates(
+            AmplitudeEmbedding(
+                state=state, num_qubits=num_qubits, pad_with=7, normalize=True
+            )
+        )
+        qc.draw_circuit(width=num_qubits)
diff --git a/tests/quafu/algorithms/angle_test.py b/tests/quafu/algorithms/angle_test.py
index adee188..ef00da4 100644
--- a/tests/quafu/algorithms/angle_test.py
+++ b/tests/quafu/algorithms/angle_test.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from quafu.circuits import QuantumCircuit
+import numpy as np
 import quafu.elements.element_gates as qeg
 from quafu.algorithms import AngleEmbedding
-import numpy as np
+from quafu.circuits import QuantumCircuit
 
 
 class TestAngleEmbedding:
diff --git a/tests/quafu/algorithms/basic_entangle_test.py b/tests/quafu/algorithms/basic_entangle_test.py
index d1ffc3e..9453265 100644
--- a/tests/quafu/algorithms/basic_entangle_test.py
+++ b/tests/quafu/algorithms/basic_entangle_test.py
@@ -12,10 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from quafu.circuits import QuantumCircuit
+import numpy as np
 import quafu.elements.element_gates as qeg
 from quafu.algorithms import BasicEntangleLayers
-import numpy as np
+from quafu.circuits import QuantumCircuit
 
 
 class TestBasicEntangleLayers:
diff --git a/tests/quafu/algorithms/estimator_test.py b/tests/quafu/algorithms/estimator_test.py
index 83bafbb..0386fba 100644
--- a/tests/quafu/algorithms/estimator_test.py
+++ b/tests/quafu/algorithms/estimator_test.py
@@ -12,18 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
 import sys
+from unittest.mock import patch
+
 import numpy as np
 import pytest
-import math
-from unittest.mock import patch
-from quafu import ExecResult
 from quafu.algorithms.estimator import Estimator
 from quafu.algorithms.hamiltonian import Hamiltonian
-
 from quafu.circuits.quantum_circuit import QuantumCircuit
 from quafu.tasks.tasks import Task
 
+from quafu import ExecResult
+
 MOCK_RES_DICT = {
     "measure": "None",
     "openqasm": 'OPENQASM 2.0;\ninclude "qelib1.inc";\nqreg q[136];\ncreg c[5];\nry(-1.5707963267948966) q[32];\nrz(-3.141592653589793) q[33];\nrz(-3.141592653589793) q[44];\nry(-1.5707963267948966) q[45];\nrz(-3.141592653589793) q[54];\nbarrier q[54],q[32],q[44],q[45],q[33];\nmeasure q[54] -> c[0];\nmeasure q[32] -> c[1];\nmeasure q[44] -> c[2];\nmeasure q[45] -> c[3];\nmeasure q[33] -> c[4];\n',
diff --git a/tests/quafu/algorithms/gradient_test.py b/tests/quafu/algorithms/gradient_test.py
index 685f113..40c31d9 100644
--- a/tests/quafu/algorithms/gradient_test.py
+++ b/tests/quafu/algorithms/gradient_test.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import pytest
 import sys
+
+import pytest
 from quafu.algorithms.estimator import Estimator
-from quafu.algorithms.hamiltonian import Hamiltonian
 from quafu.algorithms.gradients import ParamShift
+from quafu.algorithms.hamiltonian import Hamiltonian
 from quafu.circuits.quantum_circuit import QuantumCircuit
 
 
diff --git a/tests/quafu/algorithms/hamiltonian_test.py b/tests/quafu/algorithms/hamiltonian_test.py
index f841a4e..09faa3e 100644
--- a/tests/quafu/algorithms/hamiltonian_test.py
+++ b/tests/quafu/algorithms/hamiltonian_test.py
@@ -15,7 +15,6 @@
 import numpy as np
 from quafu.algorithms.hamiltonian import Hamiltonian
 
-
 M_0 = np.array(
     [
         [
diff --git a/tests/quafu/algorithms/integration_test.py b/tests/quafu/algorithms/integration_test.py
index 186aaf6..b50eb93 100644
--- a/tests/quafu/algorithms/integration_test.py
+++ b/tests/quafu/algorithms/integration_test.py
@@ -12,16 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import heapq
 import sys
 from typing import List
+
+import matplotlib.pyplot as plt
 import numpy as np
 import pytest
-from quafu.algorithms import Hamiltonian, QAOAAnsatz, Estimator
-from quafu import simulate
-from scipy.optimize import minimize
-import heapq
-import matplotlib.pyplot as plt
+from quafu.algorithms import Estimator, Hamiltonian, QAOAAnsatz
 from quafu.algorithms.ansatz import AlterLayeredAnsatz
+from scipy.optimize import minimize
+
+from quafu import simulate
 
 
 class TestQAOA:
@@ -61,7 +63,7 @@ def test_run(self):
         """
         num_layers = 2
         print("The test for ansatz.")
-        
+
         # test the zero qubit evolution
         hamiltonian__ = Hamiltonian.from_pauli_list(
             [("IIIII", 1), ("IIIII", 1), ("IIIII", 1), ("IIIII", 1)]
diff --git a/tests/quafu/algorithms/qnn_test.py b/tests/quafu/algorithms/qnn_test.py
index ad06ba0..7210585 100644
--- a/tests/quafu/algorithms/qnn_test.py
+++ b/tests/quafu/algorithms/qnn_test.py
@@ -12,13 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
-
 import torch.nn
-from quafu.algorithms.gradients import jacobian, compute_vjp
-from quafu.algorithms.interface.torch import execute
+from quafu.algorithms.ansatz import QuantumNeuralNetwork
+from quafu.algorithms.gradients import compute_vjp, jacobian
+from quafu.algorithms.interface.torch import TorchTransformer
 from quafu.algorithms.templates.basic_entangle import BasicEntangleLayers
 from quafu.circuits.quantum_circuit import QuantumCircuit
-from quafu.algorithms import QuantumNeuralNetwork
 
 
 class ModelStandardCircuit(torch.nn.Module):
@@ -29,7 +28,7 @@ def __init__(self, circ: QuantumCircuit):
 
     def forward(self, features):
         out = self.linear(features)
-        out = execute(self.circ, out, method="external")
+        out = TorchTransformer.execute(self.circ, out, method="external")
         return out
 
 
@@ -39,7 +38,19 @@ def __init__(self, circ: QuantumNeuralNetwork):
         self.circ = circ
 
     def forward(self, features):
-        out = execute(self.circ, features)
+        out = TorchTransformer.execute(self.circ, features)
+        return out
+
+
+class ModelQuantumNeuralNetworkNative(torch.nn.Module):
+    """Test execution of qnn()"""
+
+    def __init__(self, qnn: QuantumNeuralNetwork):
+        super().__init__()
+        self.qnn = qnn
+
+    def forward(self, features):
+        out = self.qnn(features)
         return out
 
 
@@ -50,6 +61,19 @@ class TestLayers:
     circ.ry(1, 0.5)
     circ.ry(0, 0.1)
 
+    def _model_grad(self, model, batch_size):
+        """Test one forward pass and gradient calculation of a model"""
+
+        # TODO(zhaoyilun): Make output dimension configurable
+        features = torch.randn(
+            batch_size, 3, requires_grad=True, dtype=torch.double
+        )  # shape: (batch_size, 3), i.e. num_params=3
+        outputs = model(features)
+        targets = torch.randn(batch_size, 2, dtype=torch.double)
+        criterion = torch.nn.MSELoss()
+        loss = criterion(outputs, targets)
+        loss.backward()
+
     def test_compute_vjp(self):
         params_input = np.random.randn(4, 3)
         jac = jacobian(self.circ, params_input)
@@ -78,12 +102,11 @@ def test_torch_layer_qnn(self):
         entangle_layer = BasicEntangleLayers(weights, 2)
         qnn = QuantumNeuralNetwork(2, [entangle_layer])
         batch_size = 1
+
+        # Legacy invocation style
         model = ModelQuantumNeuralNetwork(qnn)
-        features = torch.randn(
-            batch_size, 3, requires_grad=True, dtype=torch.double
-        )  # batch_size=4, num_params=3
-        outputs = model(features)
-        targets = torch.randn(batch_size, 2, dtype=torch.double)
-        criterion = torch.nn.MSELoss()
-        loss = criterion(outputs, targets)
-        loss.backward()
+        self._model_grad(model, batch_size)
+
+        # New invocation style
+        model = ModelQuantumNeuralNetworkNative(qnn)
+        self._model_grad(model, batch_size)
diff --git a/tests/quafu/circuits/quantum_circuit_test.py b/tests/quafu/circuits/quantum_circuit_test.py
index ceb2e1e..5b7037f 100644
--- a/tests/quafu/circuits/quantum_circuit_test.py
+++ b/tests/quafu/circuits/quantum_circuit_test.py
@@ -1,6 +1,7 @@
+import math
+
 from quafu.circuits import QuantumCircuit
 from quafu.elements.element_gates import RXGate
-import math
 
 
 class TestQuantumCircuit:
diff --git a/tests/quafu/qasm/classicop_test.py b/tests/quafu/qasm/classicop_test.py
index ea84c41..6d56368 100644
--- a/tests/quafu/qasm/classicop_test.py
+++ b/tests/quafu/qasm/classicop_test.py
@@ -12,17 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 import unittest
-from quafu import simulate
+
 from quafu.qfasm.qfasm_convertor import qasm_to_quafu
 
+from quafu import simulate
+
 
 class BaseTest:
-    def assertDictAlmostEqual(self, dict1, dict2, delta=None, places=None, default_value=-1):
+    def assertDictAlmostEqual(
+        self, dict1, dict2, delta=None, places=None, default_value=-1
+    ):
         """
         Assert two dictionaries with numeric values are almost equal.
-        
+
         Args:
             dict1 (dict): a dictionary.
             dict2 (dict): a dictionary.
@@ -31,6 +34,7 @@ def assertDictAlmostEqual(self, dict1, dict2, delta=None, places=None, default_v
             places (int): number of decimal places for comparison.
             default_value (number): default value for missing keys.
         """
+
         def valid_comparison(value):
             """compare value to delta, within places accuracy"""
             if places is not None:
@@ -58,24 +62,24 @@ class TestClassicOp(BaseTest):
     assertDictEqual = unittest.TestCase.assertDictEqual
     assertListEqual = unittest.TestCase.assertListEqual
     assertTrue = unittest.TestCase.assertTrue
-    
+
     def test_single_reset(self):
         qasm = "qreg q[2];creg c[2];x q; reset q[0]; measure q[0]->c[0];"
         circ = qasm_to_quafu(openqasm=qasm)
         assert circ.instructions[-2].name == "reset"
         assert circ.instructions[-2].pos == [0]
-    
+
     def test_multi_reset(self):
         qasm = "qreg q[2];creg c[2];x q; reset q; measure q[0]->c[0];"
         circ = qasm_to_quafu(openqasm=qasm)
         assert circ.instructions[-2].name == "reset"
-        assert circ.instructions[-2].pos == [0,1]
-    
+        assert circ.instructions[-2].pos == [0, 1]
+
     def test_cif_single(self):
         qasm = """
         qreg q[2];
         creg c[2];
-        x q[0];  
+        x q[0];
         measure q[0]->c[0];
         if(c[0] == 1)
             x q[1];
@@ -93,15 +97,15 @@ def test_cif_single(self):
         self.assertAlmostEqual(probs[1], 0)
         self.assertAlmostEqual(probs[2], 0)
         self.assertAlmostEqual(probs[3], 1)
-        self.assertDictAlmostEqual(count, {'11':10})
-        
+        self.assertDictAlmostEqual(count, {"11": 10})
+
     def test_cif_multi(self):
         qasm = """
         qreg q[2];
         qreg m[2];
         creg c[2];
         creg n[2];
-        x q;  
+        x q;
         measure q->c;
         if(c == 3)
             x m;
@@ -109,9 +113,9 @@ def test_cif_multi(self):
         """
         circ = qasm_to_quafu(openqasm=qasm)
         assert circ.instructions[-2].name == "cif"
-        assert circ.instructions[-2].cbits == [0,1]     
+        assert circ.instructions[-2].cbits == [0, 1]
         assert circ.instructions[-2].condition == 3
-        assert len(circ.instructions[-2].instructions) == 2     # x m[0]; x m[1];
+        assert len(circ.instructions[-2].instructions) == 2  # x m[0]; x m[1];
         result = simulate(qc=circ, shots=10)
         probs = result.probabilities
         count = result.count
@@ -119,4 +123,4 @@ def test_cif_multi(self):
         self.assertAlmostEqual(probs[1], 0)
         self.assertAlmostEqual(probs[2], 0)
         self.assertAlmostEqual(probs[15], 1)
-        self.assertDictAlmostEqual(count, {'1111':10})
\ No newline at end of file
+        self.assertDictAlmostEqual(count, {"1111": 10})
diff --git a/tests/quafu/qasm/parser_test.py b/tests/quafu/qasm/parser_test.py
index 2a5382e..1dcfde4 100644
--- a/tests/quafu/qasm/parser_test.py
+++ b/tests/quafu/qasm/parser_test.py
@@ -20,7 +20,6 @@
 import tempfile
 
 import pytest
-
 from quafu.circuits import QuantumCircuit
 from quafu.qfasm.exceptions import LexerError, ParserError
 from quafu.qfasm.qfasm_convertor import qasm_to_quafu
@@ -80,7 +79,7 @@ def compare_cir(self, qc1: QuantumCircuit, qc2: QuantumCircuit):
                 assert gate2.paras is None
             if hasattr(gate1, "paras") and gate1.paras != None:
                 assert gate2.paras is not None
-                assert math.isclose(gate1.paras ,gate2.paras)
+                assert math.isclose(gate1.paras, gate2.paras)
 
     # ----------------------------------------
     #   test for lexer
@@ -223,7 +222,7 @@ def test_exp_binary_symbol(self, symbol, op):
         gate test(a,b,c) q {{
             U(0,0,a{symbol}(b{symbol}c)) q;
         }}
-        qreg q[1]; 
+        qreg q[1];
         test({num2}, {num1}, {num2})q[0];
         """
         cir = qasm_to_quafu(openqasm=qasm)
@@ -293,7 +292,7 @@ def test_exp_func_symbol(self, func, mathop):
         gate test(a,b,c) q{{
             U({func}(a),{func}(b),{func}(c)) q;
         }}
-        qreg q[1]; 
+        qreg q[1];
         test({num1},{num2},{num3}) q[0];
         """
         cir = qasm_to_quafu(openqasm=qasm)
@@ -311,7 +310,7 @@ def test_exp_precedence(self):
         expected = num1 + 1.5 * (-num3) ** 2 - num2 / 0.5
         qasm = f"qreg q[1]; U( 0, 0, {expr}) q[0];"
         cir = qasm_to_quafu(openqasm=qasm)
-        assert math.isclose(cir.gates[0].paras , expected)
+        assert math.isclose(cir.gates[0].paras, expected)
 
     def test_exp_sub_left(self):
         qasm = f"qreg q[1]; U( 0 , 0 , 2.0-1.0-1.0 ) q[0];"
@@ -403,7 +402,7 @@ def test_multi_format(self):
         test(   // to split a gate declaration
         theta
         )       // But it's
-        q       // still a 
+        q       // still a
         { h     // void
 q;              // gate!
 u2(
@@ -690,12 +689,12 @@ def test_barrier_mul(self):
         """
         cir = qasm_to_quafu(qasm)
         qc = QuantumCircuit(4)
-        qc.barrier([0,1])
-        qc.barrier([1,2])
-        qc.barrier([0,2,3])
-        qc.barrier([0,1,2])
+        qc.barrier([0, 1])
+        qc.barrier([1, 2])
+        qc.barrier([0, 2, 3])
+        qc.barrier([0, 1, 2])
         self.compare_cir(cir, qc)
-            
+
     def test_double_call_gate(self):
         qasm = """
             gate test(x, y) a {
@@ -1077,7 +1076,7 @@ def test_gate_not_defined(self):
             qasm_to_quafu(qasm)
 
     def test_gate_cannot_use_before_define(self):
-        qasm = f"""qreg q[2]; 
+        qasm = f"""qreg q[2];
         test q[0],q[1];
         gate test () q,r{{
             cx q,r;
diff --git a/tests/quafu/simulator/base.py b/tests/quafu/simulator/base.py
index f681924..6dc5c64 100644
--- a/tests/quafu/simulator/base.py
+++ b/tests/quafu/simulator/base.py
@@ -12,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+
 class BaseTest:
-    def assertDictAlmostEqual(self, dict1, dict2, delta=None, places=None, default_value=-1):
+    def assertDictAlmostEqual(
+        self, dict1, dict2, delta=None, places=None, default_value=-1
+    ):
         """
         Assert two dictionaries with numeric values are almost equal.
-        
+
         Args:
             dict1 (dict): a dictionary.
             dict2 (dict): a dictionary.
@@ -25,6 +28,7 @@ def assertDictAlmostEqual(self, dict1, dict2, delta=None, places=None, default_v
             places (int): number of decimal places for comparison.
             default_value (number): default value for missing keys.
         """
+
         def valid_comparison(value):
             """compare value to delta, within places accuracy"""
             if places is not None:
diff --git a/tests/quafu/simulator/basis_test.py b/tests/quafu/simulator/basis_test.py
index 3d2b45b..6521845 100644
--- a/tests/quafu/simulator/basis_test.py
+++ b/tests/quafu/simulator/basis_test.py
@@ -13,12 +13,14 @@
 # limitations under the License.
 
 import sys
-import pytest
-from quafu import QuantumCircuit
-from quafu import simulate
-from base import BaseTest
 import unittest
+
 import numpy as np
+import pytest
+from base import BaseTest
+
+from quafu import QuantumCircuit, simulate
+
 
 class BellCircuits:
     """Container for reference circuits used by the tests."""
@@ -29,16 +31,16 @@ def bell_measure_atlast():
         qc = QuantumCircuit(2, 2)
         qc.h(0)
         qc.cx(0, 1)
-        qc.measure([0,1])
+        qc.measure([0, 1])
         return qc
-    
+
     @staticmethod
     def bell_measure_normal():
         """Return a Bell circuit."""
         qc = QuantumCircuit(3, 2)
         qc.h(0)
         qc.cx(0, 1)
-        qc.measure([0,1])
+        qc.measure([0, 1])
         qc.h(2)
         return qc
 
@@ -51,6 +53,7 @@ def bell_no_measure():
 
         return qc
 
+
 class BasicCircuits:
     """Container for reference circuits used by the tests."""
 
@@ -59,7 +62,7 @@ def singleQgate_measure_atlast():
         qc = QuantumCircuit(2, 2)
         qc.x(0)
         qc.x(1)
-        qc.measure([0,1])
+        qc.measure([0, 1])
         return qc
 
     @staticmethod
@@ -77,20 +80,20 @@ def singleQgate_measure_normal():
         qc.x(1)
         qc.measure([1], [1])
         return qc
-    
+
     @staticmethod
     def multiQgate_measure_atlast():
         qc = QuantumCircuit(2, 2)
         qc.x(0)
-        qc.cx(0,1)
-        qc.measure([0,1])
+        qc.cx(0, 1)
+        qc.measure([0, 1])
         return qc
 
     @staticmethod
     def multiQgate_no_measure():
         qc = QuantumCircuit(2)
         qc.x(0)
-        qc.cx(0,1)
+        qc.cx(0, 1)
         return qc
 
     @staticmethod
@@ -98,41 +101,40 @@ def multiQgate_measure_normal():
         qc = QuantumCircuit(2)
         qc.x(0)
         qc.measure([0], [0])
-        qc.cx(0,1)
+        qc.cx(0, 1)
         qc.measure([1], [1])
         return qc
-    
+
     @staticmethod
     def any_cbit_measure():
-        qc = QuantumCircuit(4,4)
+        qc = QuantumCircuit(4, 4)
         qc.x(0)
         qc.x(1)
-        qc.measure([1,2], [1,0])
-        qc.measure([3,0], [2,3])
+        qc.measure([1, 2], [1, 0])
+        qc.measure([3, 0], [2, 3])
         return qc
 
     @staticmethod
     def after_measure():
-        qc = QuantumCircuit(2,22)
+        qc = QuantumCircuit(2, 22)
         qc.h(0)
-        qc.cx(0,1)
+        qc.cx(0, 1)
         qc.measure([0], [0])
         qc.measure([1], [1])
-        qc.reset([0,1])
+        qc.reset([0, 1])
         return qc
-    
-
 
 
 class TestSimulatorBasis(BaseTest):
     """Test C++ simulator"""
+
     circuit = None
     assertEqual = unittest.TestCase.assertEqual
     assertAlmostEqual = unittest.TestCase.assertAlmostEqual
     assertDictEqual = unittest.TestCase.assertDictEqual
     assertListEqual = unittest.TestCase.assertListEqual
     assertTrue = unittest.TestCase.assertTrue
-    
+
     @pytest.mark.skipif(
         sys.platform == "darwin", reason="Avoid error on MacOS arm arch."
     )
@@ -141,12 +143,12 @@ def test_simulate(self):
         result = simulate(qc=self.circuit)
         probs = result.probabilities
         count = result.count
-        self.assertAlmostEqual(probs[0], 1/2)
+        self.assertAlmostEqual(probs[0], 1 / 2)
         self.assertAlmostEqual(probs[1], 0)
         self.assertAlmostEqual(probs[2], 0)
-        self.assertAlmostEqual(probs[3], 1/2)
+        self.assertAlmostEqual(probs[3], 1 / 2)
         self.assertDictAlmostEqual(count, {})
-    
+
     @pytest.mark.skipif(
         sys.platform == "darwin", reason="Avoid error on MacOS arm arch."
     )
@@ -155,11 +157,11 @@ def test_measure_atlast_collapse(self):
         self.circuit = BellCircuits.bell_measure_atlast()
         result = simulate(qc=self.circuit)
         probs = result.probabilities
-        self.assertAlmostEqual(probs[0], 1/2)
+        self.assertAlmostEqual(probs[0], 1 / 2)
         self.assertAlmostEqual(probs[1], 0)
         self.assertAlmostEqual(probs[2], 0)
-        self.assertAlmostEqual(probs[3], 1/2)
-    
+        self.assertAlmostEqual(probs[3], 1 / 2)
+
     @pytest.mark.skipif(
         sys.platform == "darwin", reason="Avoid error on MacOS arm arch."
     )
@@ -170,10 +172,12 @@ def test_measure_normal_collapse(self):
         probs = result.probabilities
         diff_00 = np.linalg.norm(np.array([1, 0, 0, 0]) - probs) ** 2
         diff_11 = np.linalg.norm(np.array([0, 0, 0, 1]) - probs) ** 2
-        success = np.allclose([diff_00, diff_11], [0, 2]) or np.allclose([diff_00, diff_11], [2, 0])
+        success = np.allclose([diff_00, diff_11], [0, 2]) or np.allclose(
+            [diff_00, diff_11], [2, 0]
+        )
         # state is 1/sqrt(2)|00> + 1/sqrt(2)|11>, up to a global phase
         self.assertTrue(success)
-    
+
     def test_singleQgate_measure_atlast(self):
         self.circuit = BasicCircuits.singleQgate_measure_atlast()
         result = simulate(qc=self.circuit, shots=1)
@@ -183,8 +187,8 @@ def test_singleQgate_measure_atlast(self):
         self.assertAlmostEqual(probs[1], 0)
         self.assertAlmostEqual(probs[2], 0)
         self.assertAlmostEqual(probs[3], 1)
-        self.assertDictAlmostEqual(counts, {'11':1})
-    
+        self.assertDictAlmostEqual(counts, {"11": 1})
+
     def test_singleQgate_no_measure(self):
         self.circuit = BasicCircuits.singleQgate_no_measure()
         result = simulate(qc=self.circuit, shots=1)
@@ -195,7 +199,7 @@ def test_singleQgate_no_measure(self):
         self.assertAlmostEqual(probs[2], 0)
         self.assertAlmostEqual(probs[3], 1)
         self.assertDictAlmostEqual(counts, {})
-        
+
     def test_singleQgate_measure_normal(self):
         self.circuit = BasicCircuits.singleQgate_measure_normal()
         result = simulate(qc=self.circuit, shots=10)
@@ -205,8 +209,8 @@ def test_singleQgate_measure_normal(self):
         self.assertAlmostEqual(probs[1], 0)
         self.assertAlmostEqual(probs[2], 0)
         self.assertAlmostEqual(probs[3], 1)
-        self.assertDictAlmostEqual(counts, {'11':10})
-        
+        self.assertDictAlmostEqual(counts, {"11": 10})
+
     def test_multiQgate_measure_atlast(self):
         self.circuit = BasicCircuits.multiQgate_measure_atlast()
         result = simulate(qc=self.circuit, shots=10)
@@ -216,8 +220,8 @@ def test_multiQgate_measure_atlast(self):
         self.assertAlmostEqual(probs[1], 0)
         self.assertAlmostEqual(probs[2], 0)
         self.assertAlmostEqual(probs[3], 1)
-        self.assertDictAlmostEqual(counts, {'11':10})
-    
+        self.assertDictAlmostEqual(counts, {"11": 10})
+
     def test_multiQgate_no_measure(self):
         self.circuit = BasicCircuits.multiQgate_no_measure()
         result = simulate(qc=self.circuit, shots=1)
@@ -228,7 +232,7 @@ def test_multiQgate_no_measure(self):
         self.assertAlmostEqual(probs[2], 0)
         self.assertAlmostEqual(probs[3], 1)
         self.assertDictAlmostEqual(counts, {})
-        
+
     def test_multiQgate_measure_normal(self):
         self.circuit = BasicCircuits.multiQgate_measure_normal()
         result = simulate(qc=self.circuit, shots=10)
@@ -238,22 +242,103 @@ def test_multiQgate_measure_normal(self):
         self.assertAlmostEqual(probs[1], 0)
         self.assertAlmostEqual(probs[2], 0)
         self.assertAlmostEqual(probs[3], 1)
-        self.assertDictAlmostEqual(counts, {'11':10})
-    
+        self.assertDictAlmostEqual(counts, {"11": 10})
+
     def test_anycbit_measure(self):
         self.circuit = BasicCircuits.any_cbit_measure()
         result = simulate(qc=self.circuit, shots=10)
         probs = result.probabilities
         counts = result.count
         print(probs)
-        self.assertAlmostEqual(probs[5], 1)    #0101
-        self.assertDictAlmostEqual(counts, {'0101':10})
-    
+        self.assertAlmostEqual(probs[5], 1)  # 0101
+        self.assertDictAlmostEqual(counts, {"0101": 10})
+
     def test_after_measure(self):
         self.circuit = BasicCircuits.after_measure()
         result = simulate(qc=self.circuit, shots=10)
         probs = result.probabilities
         diff_00 = np.linalg.norm(np.array([1, 0, 0, 0]) - probs) ** 2
         diff_11 = np.linalg.norm(np.array([0, 0, 0, 1]) - probs) ** 2
-        success = np.allclose([diff_00, diff_11], [0, 2]) or np.allclose([diff_00, diff_11], [2, 0])
-        self.assertTrue(success)
\ No newline at end of file
+        success = np.allclose([diff_00, diff_11], [0, 2]) or np.allclose(
+            [diff_00, diff_11], [2, 0]
+        )
+        self.assertTrue(success)
+
+
+class TestCliffordSimulatorBasis(BaseTest):
+    """Test C++ Clifford simulator"""
+
+    circuit = None
+    assertEqual = unittest.TestCase.assertEqual
+    assertAlmostEqual = unittest.TestCase.assertAlmostEqual
+    assertDictEqual = unittest.TestCase.assertDictEqual
+    assertListEqual = unittest.TestCase.assertListEqual
+    assertTrue = unittest.TestCase.assertTrue
+
+    @pytest.mark.skipif(
+        sys.platform == "darwin", reason="Avoid error on MacOS arm arch."
+    )
+    def test_simulate(self):
+        print("test_simulate")
+        self.circuit = BellCircuits.bell_no_measure()
+        result = simulate(
+            qc=self.circuit, simulator="qfvm_clifford", output="count_dict"
+        )
+        count = result.count
+        self.assertDictAlmostEqual(count, {})
+
+    def test_singleQgate_measure_atlast(self):
+        self.circuit = BasicCircuits.singleQgate_measure_atlast()
+        result = simulate(
+            qc=self.circuit, shots=1, simulator="qfvm_clifford", output="count_dict"
+        )
+        counts = result.count
+        self.assertDictAlmostEqual(counts, {"11": 1})
+
+    def test_singleQgate_no_measure(self):
+        self.circuit = BasicCircuits.singleQgate_no_measure()
+        result = simulate(
+            qc=self.circuit, shots=1, simulator="qfvm_clifford", output="count_dict"
+        )
+        counts = result.count
+        self.assertDictAlmostEqual(counts, {})
+
+    def test_singleQgate_measure_normal(self):
+        self.circuit = BasicCircuits.singleQgate_measure_normal()
+        result = simulate(
+            qc=self.circuit, shots=10, simulator="qfvm_clifford", output="count_dict"
+        )
+        counts = result.count
+        self.assertDictAlmostEqual(counts, {"11": 10})
+
+    def test_multiQgate_measure_atlast(self):
+        self.circuit = BasicCircuits.multiQgate_measure_atlast()
+        result = simulate(
+            qc=self.circuit, shots=10, simulator="qfvm_clifford", output="count_dict"
+        )
+        counts = result.count
+        self.assertDictAlmostEqual(counts, {"11": 10})
+
+    def test_multiQgate_no_measure(self):
+        self.circuit = BasicCircuits.multiQgate_no_measure()
+        result = simulate(
+            qc=self.circuit, shots=1, simulator="qfvm_clifford", output="count_dict"
+        )
+        counts = result.count
+        self.assertDictAlmostEqual(counts, {})
+
+    def test_multiQgate_measure_normal(self):
+        self.circuit = BasicCircuits.multiQgate_measure_normal()
+        result = simulate(
+            qc=self.circuit, shots=10, simulator="qfvm_clifford", output="count_dict"
+        )
+        counts = result.count
+        self.assertDictAlmostEqual(counts, {"11": 10})
+
+    def test_anycbit_measure(self):
+        self.circuit = BasicCircuits.any_cbit_measure()
+        result = simulate(
+            qc=self.circuit, shots=10, simulator="qfvm_clifford", output="count_dict"
+        )
+        counts = result.count
+        self.assertDictAlmostEqual(counts, {"0101": 10})
diff --git a/tests/quafu/simulator/classic_test.py b/tests/quafu/simulator/classic_test.py
index 2a58fde..f40bf18 100644
--- a/tests/quafu/simulator/classic_test.py
+++ b/tests/quafu/simulator/classic_test.py
@@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from quafu import QuantumCircuit
-from quafu import simulate
-from base import BaseTest
 import unittest
 
+from base import BaseTest
+
+from quafu import QuantumCircuit, simulate
+
 
 class ClassicalCircuits:
     """Container for reference circuits used by the tests."""
@@ -91,6 +92,7 @@ def multi_reset():
 
 class TestSimulatorClassic(BaseTest):
     """Test C++ simulator"""
+
     circuit = None
     assertEqual = unittest.TestCase.assertEqual
     assertAlmostEqual = unittest.TestCase.assertAlmostEqual
@@ -107,7 +109,7 @@ def test_cif_true(self):
         self.assertAlmostEqual(probs[1], 0)
         self.assertAlmostEqual(probs[2], 0)
         self.assertAlmostEqual(probs[3], 1)
-        self.assertDictAlmostEqual(count, {'11': 10})
+        self.assertDictAlmostEqual(count, {"11": 10})
 
     def test_cif_false(self):
         self.circuit = ClassicalCircuits.cif_false()
@@ -118,7 +120,7 @@ def test_cif_false(self):
         self.assertAlmostEqual(probs[1], 0)
         self.assertAlmostEqual(probs[2], 0)
         self.assertAlmostEqual(probs[3], 0)
-        self.assertDictAlmostEqual(count, {'00': 10})
+        self.assertDictAlmostEqual(count, {"00": 10})
 
     def test_cif_list_true(self):
         self.circuit = ClassicalCircuits.cif_list_true()
@@ -133,7 +135,7 @@ def test_cif_list_true(self):
         self.assertAlmostEqual(probs[5], 0)
         self.assertAlmostEqual(probs[6], 0)
         self.assertAlmostEqual(probs[7], 1)
-        self.assertDictAlmostEqual(count, {'111': 10})
+        self.assertDictAlmostEqual(count, {"111": 10})
 
     def test_cif_list_false(self):
         self.circuit = ClassicalCircuits.cif_list_false()
@@ -148,7 +150,7 @@ def test_cif_list_false(self):
         self.assertAlmostEqual(probs[5], 0)
         self.assertAlmostEqual(probs[6], 1)
         self.assertAlmostEqual(probs[7], 0)
-        self.assertDictAlmostEqual(count, {'110': 10})
+        self.assertDictAlmostEqual(count, {"110": 10})
 
     def test_single_reset(self):
         self.circuit = ClassicalCircuits.single_reset()
@@ -157,7 +159,7 @@ def test_single_reset(self):
         count = result.count
         self.assertAlmostEqual(probs[0], 1)
         self.assertAlmostEqual(probs[1], 0)
-        self.assertDictAlmostEqual(count, {'10': 10})
+        self.assertDictAlmostEqual(count, {"10": 10})
 
     def test_multi_reset(self):
         self.circuit = ClassicalCircuits.multi_reset()
@@ -168,4 +170,4 @@ def test_multi_reset(self):
         self.assertAlmostEqual(probs[1], 0)
         self.assertAlmostEqual(probs[2], 0)
         self.assertAlmostEqual(probs[3], 0)
-        self.assertDictAlmostEqual(count, {'1100': 10})
+        self.assertDictAlmostEqual(count, {"1100": 10})