diff --git a/quafu/algorithms/ansatz.py b/quafu/algorithms/ansatz.py
index a402024..80e5505 100644
--- a/quafu/algorithms/ansatz.py
+++ b/quafu/algorithms/ansatz.py
@@ -17,6 +17,7 @@
 
 import numpy as np
 from quafu.circuits.quantum_circuit import QuantumCircuit
+from quafu.elements import Parameter
 from quafu.synthesis.evolution import ProductFormula
 
 from .hamiltonian import Hamiltonian
@@ -52,8 +53,10 @@ def __init__(self, hamiltonian: Hamiltonian, num_qubits: int, num_layers: int =
         self._evol = ProductFormula()
 
         # Initialize parameters
-        self._beta = np.zeros(num_layers)
-        self._gamma = np.zeros(num_layers)
+        self._beta = np.array([Parameter(f"beta_{i}", 0.0) for i in range(num_layers)])
+        self._gamma = np.array(
+            [Parameter(f"gamma_{i}", 0.0) for i in range(num_layers)]
+        )
 
         # Build circuit structure
         super().__init__(num_qubits)
@@ -122,7 +125,10 @@ def __init__(self, num_qubits: int, layer: int):
             layer: Number of layers.
         """
         self._layer = layer
-        self._theta = np.zeros((layer + 1, num_qubits))
+        self._theta = np.array(
+            [Parameter(f"theta_{i}", 0.0) for i in range((layer + 1) * num_qubits)]
+        )
+        self._theta = np.reshape(self._theta, (layer + 1, num_qubits))
         super().__init__(num_qubits)
 
     def _build(self):
@@ -153,25 +159,34 @@ def __init__(
         self._transformer = InterfaceProvider.get(interface)
         self._layers = layers
 
-        # FIXME(zhaoyilun): don't use this default value
-        self._weights = np.empty((1, 1))
+        self._weights = None
 
         self._backend = backend
         super().__init__(num_qubits)
 
-    def __call__(self, features):
+    def __call__(self, inputs):
         """Compute outputs of QNN given input features"""
         from .estimator import Estimator
 
         estimator = Estimator(self, backend=self._backend)
-        return self._transformer.execute(self, features, estimator=estimator)
+        return self._transformer.execute(self, inputs, estimator=estimator)
 
     def _build(self):
         """Essentially initialize weights using transformer"""
         self.add_gates(self._layers)
 
-        self._weights = self._transformer.init_weights((1, self.num_parameters))
+        self._weights = self._transformer.init_weights((1, self.num_tunable_parameters))
 
     @property
     def weights(self):
         return self._weights
+
+    @property
+    def num_tunable_parameters(self):
+        num_tunable_params = 0
+        for g in self.gates:
+            paras = g.paras
+            for p in paras:
+                if hasattr(p, "tunable") and p.tunable:
+                    num_tunable_params += 1
+        return num_tunable_params
diff --git a/quafu/algorithms/estimator.py b/quafu/algorithms/estimator.py
index b35e4fe..93ae8da 100644
--- a/quafu/algorithms/estimator.py
+++ b/quafu/algorithms/estimator.py
@@ -13,15 +13,16 @@
 # limitations under the License.
 """Pre-build wrapper to calculate expectation value"""
 from typing import List, Optional
+
 from ..circuits.quantum_circuit import QuantumCircuit
+from ..simulators import simulate
 from ..tasks.tasks import Task
 from .hamiltonian import Hamiltonian
-from ..simulators import simulate
 
 
 def execute_circuit(circ: QuantumCircuit, observables: Hamiltonian):
     """Execute circuit on quafu simulator"""
-    sim_res = simulate(circ, hamiltonian= observables)
+    sim_res = simulate(circ, hamiltonian=observables)
     expectations = sim_res["pauli_expects"]
     return sum(expectations)
 
@@ -44,6 +45,7 @@ def __init__(
             task_options: options to config a task instance
         """
         self._circ = circ
+        self._circ.get_parameter_grads()  # parameter shift currently requires calling this for initialization
         self._backend = backend
         self._task = None
         if backend != "sim":
@@ -85,7 +87,7 @@ def run(self, observables: Hamiltonian, params: List[float]):
             Expectation value
         """
         if params is not None:
-            self._circ.update_params(params)
+            self._circ._update_params(params)
 
         if self._backend == "sim":
             return self._run_simulation(observables)
diff --git a/quafu/algorithms/gradients/__init__.py b/quafu/algorithms/gradients/__init__.py
index de792ee..1c3b61b 100644
--- a/quafu/algorithms/gradients/__init__.py
+++ b/quafu/algorithms/gradients/__init__.py
@@ -12,5 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .gradiant import grad_adjoint, grad_finit_diff, grad_para_shift
 from .param_shift import ParamShift
 from .vjp import compute_vjp, jacobian, run_circ
diff --git a/quafu/algorithms/gradient.py b/quafu/algorithms/gradients/gradiant.py
similarity index 67%
rename from quafu/algorithms/gradient.py
rename to quafu/algorithms/gradients/gradiant.py
index 6b5b266..c472a4d 100644
--- a/quafu/algorithms/gradient.py
+++ b/quafu/algorithms/gradients/gradiant.py
@@ -1,45 +1,51 @@
-from ..circuits.quantum_circuit import QuantumCircuit
-from ..simulators.simulator import SVSimulator
-from ..elements import Parameter, ParameterExpression
 import numpy as np
-from ..exceptions import CircuitError
-from ..elements.matrices import XMatrix, YMatrix, ZMatrix
-from ..elements import QuantumGate, ControlledGate
+from quafu.circuits.quantum_circuit import QuantumCircuit
+from quafu.elements import ControlledGate, Parameter, ParameterExpression, QuantumGate
+from quafu.elements.matrices import XMatrix, YMatrix, ZMatrix
+from quafu.exceptions import CircuitError
+from quafu.simulators.simulator import SVSimulator
+
 
 def assemble_grads(para_grads, gate_grads):
     grads = []
     for var in para_grads:
         grad_p = para_grads[var]
-        fullgrad = 0.
+        fullgrad = 0.0
         for pos_g in grad_p:
-            pos, gp = pos_g 
+            pos, gp = pos_g
             gg = gate_grads[pos[0]][pos[1]]
             fullgrad += gg * gp
         grads.append(fullgrad)
-    
+
     return grads
 
-def grad_para_shift(qc:QuantumCircuit, hamiltonian, backend=SVSimulator()):
+
+def grad_para_shift(qc: QuantumCircuit, hamiltonian, backend=SVSimulator()):
     """
     Parameter shift gradients. Each gate must have one parameter
     """
     para_grads = qc._calc_parameter_grads()
-    gate_grads= [[] for _ in qc.gates]
+    gate_grads = [[] for _ in qc.gates]
 
     for i, op in enumerate(qc.gates):
         if len(op.paras) > 0:
-            if isinstance(op.paras[0], Parameter) or isinstance(op.paras[0],ParameterExpression):
+            if isinstance(op.paras[0], Parameter) or isinstance(
+                op.paras[0], ParameterExpression
+            ):
                 if op.name not in ["RX", "RY", "RZ"]:
-                    raise CircuitError("It seems the circuit can not apply parameter-shift rule to calculate gradient.You may need compile the circuit first")
-                op.paras[0] = op.paras[0] + np.pi/2
+                    raise CircuitError(
+                        "It seems the circuit can not apply parameter-shift rule to calculate gradient. You may need compile the circuit first"
+                    )
+                op.paras[0] = op.paras[0] + np.pi / 2
                 res1 = sum(backend.run(qc, hamiltonian=hamiltonian)["pauli_expects"])
                 op.paras[0] = op.paras[0] - np.pi
                 res2 = sum(backend.run(qc, hamiltonian=hamiltonian)["pauli_expects"])
                 op.paras[0]._undo(2)
-                gate_grads[i].append((res1 - res2) / 2.)
-        
+                gate_grads[i].append((res1 - res2) / 2.0)
+
     return assemble_grads(para_grads, gate_grads)
 
+
 def grad_finit_diff(qc, hamiltonian, backend=SVSimulator()):
     variables = qc.variables
     grads = []
@@ -50,7 +56,7 @@ def grad_finit_diff(qc, hamiltonian, backend=SVSimulator()):
         res2 = sum(backend.run(qc, hamiltonian=hamiltonian)["pauli_expects"])
         v.value += 1e-10
         grads.append((res1 - res2) / (2 * 1e-10))
-    
+
     return grads
 
 
@@ -60,37 +66,37 @@ def grad_gate(op):
     """
     if isinstance(op, ControlledGate):
         if op._targ_name == "RX":
-            circ = QuantumCircuit(max(op.pos)+1)
+            circ = QuantumCircuit(max(op.pos) + 1)
             deriv_mat = -0.5j * XMatrix @ op._get_targ_matrix()
             circ << QuantumGate("dRX", op.targs, [], deriv_mat)
             cdim = 1 << (len(op.ctrls))
             proj_mat = np.zeros((cdim, cdim))
-            proj_mat[cdim-1, cdim-1] = 1.
+            proj_mat[cdim - 1, cdim - 1] = 1.0
             circ << QuantumGate("projCtrl", op.ctrls, [], proj_mat)
             return circ.wrap()
-        
+
         elif op._targ_name == "RY":
-            circ = QuantumCircuit(max(op.pos)+1)
+            circ = QuantumCircuit(max(op.pos) + 1)
             deriv_mat = -0.5j * YMatrix @ op._get_targ_matrix()
             circ << QuantumGate("dRY", op.targs, [], deriv_mat)
             cdim = 1 << (len(op.ctrls))
             proj_mat = np.zeros((cdim, cdim))
-            proj_mat[cdim-1, cdim-1] = 1.
+            proj_mat[cdim - 1, cdim - 1] = 1.0
             circ << QuantumGate("projCtrl", op.ctrls, [], proj_mat)
             return circ.wrap()
-        
+
         elif op._targ_name == "RZ":
-            circ = QuantumCircuit(max(op.pos)+1)
+            circ = QuantumCircuit(max(op.pos) + 1)
             deriv_mat = -0.5j * ZMatrix @ op._get_targ_matrix()
             circ << QuantumGate("dRZ", op.targs, [], deriv_mat)
             cdim = 1 << (len(op.ctrls))
             proj_mat = np.zeros((cdim, cdim))
-            proj_mat[cdim-1, cdim-1] = 1.
+            proj_mat[cdim - 1, cdim - 1] = 1.0
             circ << QuantumGate("projCtrl", op.ctrls, [], proj_mat)
             return circ.wrap()
         else:
             raise NotImplementedError
-        
+
     else:
         if op.name == "RX":
             deriv_mat = -0.5j * XMatrix @ op.matrix
@@ -103,30 +109,37 @@ def grad_gate(op):
             return QuantumGate("dRZ", op.pos, [], deriv_mat)
         else:
             raise NotImplementedError
-    
+
+
 def grad_adjoint(qc, hamiltonian, psi_in=np.array([], dtype=complex)):
     """
     Reverse mode gradient: arXiv:2009.02823
     """
     para_grads = qc._calc_parameter_grads()
     backend = SVSimulator()
-    lam = backend.run(qc, psi = psi_in)["statevector"]
+    lam = backend.run(qc, psi=psi_in)["statevector"]
     phi = np.copy(lam)
     lam = backend._apply_hamil(hamiltonian, lam)
     begin = 0
     end = len(qc.gates)
-    gate_grads= [[] for _ in range(end)]
+    gate_grads = [[] for _ in range(end)]
     for i, op in enumerate(qc.gates):
-        if len(op.paras) > 0 and (isinstance(op.paras[0], Parameter) or isinstance(op.paras[0],ParameterExpression)):
+        if len(op.paras) > 0 and (
+            isinstance(op.paras[0], Parameter)
+            or isinstance(op.paras[0], ParameterExpression)
+        ):
             begin = i
             break
-    
+
     for i in range(begin, end)[::-1]:
         op = qc.gates[i]
         phi = backend._apply_op(op.dagger(), phi)
-        if len(op.paras) > 0 and (isinstance(op.paras[0], Parameter) or isinstance(op.paras[0],ParameterExpression)):
-                mu = np.copy(phi)
-                mu = backend._apply_op(grad_gate(op), mu)
-                gate_grads[i].append(np.real(2. * np.inner(lam.conj(), mu)))
+        if len(op.paras) > 0 and (
+            isinstance(op.paras[0], Parameter)
+            or isinstance(op.paras[0], ParameterExpression)
+        ):
+            mu = np.copy(phi)
+            mu = backend._apply_op(grad_gate(op), mu)
+            gate_grads[i].append(np.real(2.0 * np.inner(lam.conj(), mu)))
         lam = backend._apply_op(op.dagger(), lam)
-    return assemble_grads(para_grads, gate_grads)
\ No newline at end of file
+    return assemble_grads(para_grads, gate_grads)
diff --git a/quafu/algorithms/gradients/param_shift.py b/quafu/algorithms/gradients/param_shift.py
index c725188..32fd56f 100644
--- a/quafu/algorithms/gradients/param_shift.py
+++ b/quafu/algorithms/gradients/param_shift.py
@@ -19,6 +19,7 @@
 
 from ..estimator import Estimator
 from ..hamiltonian import Hamiltonian
+from .gradiant import grad_para_shift
 
 
 class ParamShift:
@@ -34,7 +35,7 @@ def __call__(self, obs: Hamiltonian, params: List[float]):
             estimator (Estimator): estimator to calculate expectation values
             params (List[float]): params to optimize
         """
-        return self.grad(obs, params)
+        return self.new_grad(obs, params)
 
     def _gen_param_shift_vals(self, params):
         """Given a param list with n values, replicate to 2*n param list"""
@@ -45,6 +46,7 @@ def _gen_param_shift_vals(self, params):
         minus_params = params - offsets * np.pi / 2
         return plus_params.tolist() + minus_params.tolist()
 
+    # TODO: delete after 0.4.1
     def grad(self, obs: Hamiltonian, params: List[float]):
         """grad.
 
@@ -61,3 +63,12 @@ def grad(self, obs: Hamiltonian, params: List[float]):
         num_shift_params = len(res)
         grads = (res[: num_shift_params // 2] - res[num_shift_params // 2 :]) / 2
         return grads
+
+    def new_grad(self, obs: Hamiltonian, params: List[float]):
+        """Calculate the gradients of the given circuit based on the parameter shift rule
+        Args:
+            obs (Hamiltonian): observables for measurement.
+            params (List[float]): parameters to apply to the circuit.
+        """
+        self._est._circ._update_params(params)
+        return grad_para_shift(self._est._circ, obs)
diff --git a/quafu/algorithms/interface/torch.py b/quafu/algorithms/interface/torch.py
index e914636..bde81fe 100644
--- a/quafu/algorithms/interface/torch.py
+++ b/quafu/algorithms/interface/torch.py
@@ -17,7 +17,9 @@
 
 import numpy as np
 import torch
+from quafu.algorithms.ansatz import QuantumNeuralNetwork
 from quafu.algorithms.estimator import Estimator
+from torch import nn
 
 from quafu import QuantumCircuit
 
@@ -56,14 +58,7 @@ def execute(
             "estimator": estimator,
         }
 
-        if method == "external":
-            return ExecuteCircuits.apply(parameters, kwargs)
-        if method == "internal":
-            from ..ansatz import QuantumNeuralNetwork
-
-            assert isinstance(circ, QuantumNeuralNetwork)
-            return ExecuteCircuits.apply(circ.weights, kwargs)
-        raise NotImplementedError(f"Unsupported execution method: {method}")
+        return ExecuteCircuits.apply(parameters, kwargs)
 
 
 class ExecuteCircuits(torch.autograd.Function):
@@ -91,3 +86,42 @@ def backward(ctx, grad_out):
         vjp = compute_vjp(jac, grad_out.numpy())
         vjp = torch.from_numpy(vjp)
         return vjp, None
+
+
+class ModuleWrapper(nn.Module):
+    """
+    A wrapper class to transform quafu circuit to a torch module
+    """
+
+    def __init__(self, qnn: QuantumNeuralNetwork):
+        """
+        Initialization of quafu torch module
+
+        Args:
+            qnn (QuantumNeuralNetwork): the quantum neural network to wrap as a torch module
+        """
+        super().__init__()
+        self._qnn = qnn
+        if qnn.weights is not None:
+            self.weights = nn.parameter.Parameter(qnn.weights)
+        else:
+            self.weights = None
+
+    def forward(self, inputs: torch.Tensor):
+        """
+        Args:
+            inputs (torch.Tensor): raw input data or output from previous
+                classical/quantum layers.
+        """
+        # if weights are not None, they are concatenated with inputs to form
+        # the complete parameter vector that is fed to the quantum circuit
+        bsz, _ = inputs.shape  # FIXME: currently we assume 2-D inputs
+
+        # use the last dimension since it is currently initialized as (1, D)
+        if self.weights is not None:
+            weight_dim = self.weights.size(-1)
+            weights_expanded = self.weights.expand(bsz, weight_dim)
+            inputs_to_circ = torch.cat((inputs, weights_expanded), dim=1)
+        else:
+            inputs_to_circ = inputs
+        return self._qnn(inputs_to_circ)
diff --git a/quafu/algorithms/templates/angle.py b/quafu/algorithms/templates/angle.py
index c421739..432d457 100644
--- a/quafu/algorithms/templates/angle.py
+++ b/quafu/algorithms/templates/angle.py
@@ -14,7 +14,7 @@
 """Angel Embedding in Quantum Data embedding"""
 import numpy as np
 import quafu.elements.element_gates as qeg
-from quafu.elements import QuantumGate
+from quafu.elements import Parameter, QuantumGate
 
 ROT = {"X": qeg.RXGate, "Y": qeg.RYGate, "Z": qeg.RZGate}
 
@@ -45,7 +45,9 @@ def _build(self):
         gate_list = []
         for j in range(self.batch_size):
             for i in range(self.num_qubits):
-                gate = self.op(i, self.features[j, i])
+                gate = self.op(
+                    i, Parameter(f"phi_{i}", self.features[j, i], tunable=False)
+                )
                 gate_list.append(gate)
         return gate_list
 
diff --git a/quafu/circuits/quantum_circuit.py b/quafu/circuits/quantum_circuit.py
index 752a17d..70ea4c9 100644
--- a/quafu/circuits/quantum_circuit.py
+++ b/quafu/circuits/quantum_circuit.py
@@ -74,6 +74,7 @@ def __init__(self, qnum: int, cnum: Optional[int] = None, name="", *args, **kwar
     @property
     def parameterized_gates(self):
         """Return the list of gates which the parameters are tunable"""
+        # FIXME: the list is cached, so parameterized gates added after the first access are missed
         if not self._parameterized_gates:
             self._parameterized_gates = [g for g in self.gates if len(g.paras) != 0]
         return self._parameterized_gates
@@ -272,11 +273,15 @@ def _update_params(self, values, order=[]):
             order: For transplied circuit that change the order of variables,
             need pass the order to match untranspiled circuit's variable.
         """
-
+        if len(values) != len(self.variables):
+            raise CircuitError(
+                "The size of input values must be the same to the parameters"
+            )
         for i in range(len(values)):
             val = values[order[i]] if order else values[i]
             self._variables[i].value = val
 
+    # TODO: delete after 0.4.1
     def update_params(self, paras_list: List[Any]):
         """Update parameters of parameterized gates
         Args:
diff --git a/quafu/elements/parameters.py b/quafu/elements/parameters.py
index 31feaac..dfe9cf0 100644
--- a/quafu/elements/parameters.py
+++ b/quafu/elements/parameters.py
@@ -233,12 +233,13 @@ def log(self):
 
 
 class Parameter(ParameterExpression):
-    def __init__(self, name, value: float = 0.0):
+    def __init__(self, name, value: float = 0.0, tunable: bool = True):
         self.name = name
         self.value = float(value)
         self.operands = []
         self.funcs = []
         self.latex = self.name
+        self.tunable = tunable
 
     @property
     def pivot(self):
diff --git a/tests/quafu/algorithms/merge_circuits_test.py b/tests/quafu/algorithms/construct_qlayers_test.py
similarity index 92%
rename from tests/quafu/algorithms/merge_circuits_test.py
rename to tests/quafu/algorithms/construct_qlayers_test.py
index 5af7f48..3f827be 100644
--- a/tests/quafu/algorithms/merge_circuits_test.py
+++ b/tests/quafu/algorithms/construct_qlayers_test.py
@@ -18,10 +18,10 @@
 from quafu.circuits.quantum_circuit import QuantumCircuit
 
 
-class TestMergeCircuits:
-    """Example of merging circuits"""
+class TestConstructQLayers:
+    """Test stacking multiple different quantum layers"""
 
-    def test_merge_circuits(self):
+    def test_construct_qlayers(self):
         state = np.array([7, 2, 3, 4])
         encoding_layer = AmplitudeEmbedding(state=state, num_qubits=2, normalize=True)
 
diff --git a/tests/quafu/algorithms/gradient_test.py b/tests/quafu/algorithms/gradient_test.py
index 429f25e..976461e 100644
--- a/tests/quafu/algorithms/gradient_test.py
+++ b/tests/quafu/algorithms/gradient_test.py
@@ -14,29 +14,47 @@
 
 import sys
 
+import numpy as np
 import pytest
 from quafu.algorithms.estimator import Estimator
-from quafu.algorithms.gradients import ParamShift
+from quafu.algorithms.gradients import ParamShift, grad_para_shift
 from quafu.algorithms.hamiltonian import Hamiltonian
 from quafu.circuits.quantum_circuit import QuantumCircuit
+from quafu.elements import Parameter
 
 
+# TODO: remove this test after releasing 0.4.1 as it is not necessary
 class TestParamShift:
     @pytest.mark.skipif(
         sys.platform == "darwin", reason="Avoid error on MacOS arm arch."
     )
     def test_call(self):
+        """
+        This test simply ensures that the legacy implementation of parameter shift produces
+        the same results as the new implementation
+        """
+        theta_0 = Parameter("theta_0", 0.2)
+        theta_1 = Parameter("theta_1", 0.6)
         ham = Hamiltonian.from_pauli_list([("Z0 Z1", 1), ("X1", 1)])
-        circ = QuantumCircuit(2)
-        # circ.h(0)
-        # circ.h(1)
-        circ.rx(0, 0.5)
-        circ.cnot(0, 1)
-        circ.ry(1, 0.5)
+
+        circ_0 = QuantumCircuit(2)
+        circ_0.rx(0, theta_0)
+        circ_0.cnot(0, 1)
+        circ_0.ry(1, theta_1)
 
         params = [0.2, 0.6]
-        estimator = Estimator(circ)
+        estimator = Estimator(circ_0)
         grad = ParamShift(estimator)
 
-        grads = grad(ham, params)
-        print(grads)
+        grads_0 = grad(ham, params)
+        print(grads_0)
+
+        circ_1 = QuantumCircuit(2)
+        circ_1.rx(0, theta_0)
+        circ_1.cnot(0, 1)
+        circ_1.ry(1, theta_1)
+        circ_1.get_parameter_grads()
+        grads_1 = grad_para_shift(circ_1, ham)
+        print(grads_1)
+
+        assert np.allclose(grads_0, grads_1, atol=1e-6)
diff --git a/tests/quafu/algorithms/qnn_test.py b/tests/quafu/algorithms/qnn_test.py
index 0a84f53..76e2805 100644
--- a/tests/quafu/algorithms/qnn_test.py
+++ b/tests/quafu/algorithms/qnn_test.py
@@ -16,9 +16,11 @@
 import torch
 from quafu.algorithms.ansatz import QuantumNeuralNetwork
 from quafu.algorithms.gradients import compute_vjp, jacobian
-from quafu.algorithms.interface.torch import TorchTransformer
+from quafu.algorithms.interface.torch import ModuleWrapper, TorchTransformer
+from quafu.algorithms.templates.angle import AngleEmbedding
 from quafu.algorithms.templates.basic_entangle import BasicEntangleLayers
 from quafu.circuits.quantum_circuit import QuantumCircuit
+from quafu.elements import Parameter
 from torch import nn
 from torch.utils.data import DataLoader, TensorDataset
 
@@ -80,7 +82,7 @@ def __init__(self, circ: QuantumCircuit):
 
     def forward(self, features):
         out = self.linear(features)
-        out = TorchTransformer.execute(self.circ, out, method="external")
+        out = TorchTransformer.execute(self.circ, out)
         return out
 
 
@@ -112,17 +114,18 @@ def forward(self, features):
 
 class TestLayers:
     circ = QuantumCircuit(2)
+    theta = [Parameter(f"theta_{i}", 0.1) for i in range(3)]
     circ.x(0)
-    circ.rx(0, 0.1)
-    circ.ry(1, 0.5)
-    circ.ry(0, 0.1)
+    circ.rx(0, theta[0])
+    circ.ry(1, theta[1])
+    circ.ry(0, theta[2])
 
     def _model_grad(self, model, batch_size):
         """Test one forward pass and gradient calculation of a model"""
 
         # TODO(zhaoyilun): Make out dimension configurable
         features = torch.randn(
-            batch_size, 3, requires_grad=True, dtype=torch.double
+            batch_size, 2, requires_grad=True, dtype=torch.double
         )  # batch_size=4, num_params=3
         outputs = model(features)
         targets = torch.randn(batch_size, 2, dtype=torch.double)
@@ -155,8 +158,9 @@ def test_torch_layer_standard_circuit(self):
     def test_torch_layer_qnn(self):
         """Use QuantumNeuralNetwork ansatz"""
         weights = np.random.randn(2, 2)
-        entangle_layer = BasicEntangleLayers(weights, 2)
-        qnn = QuantumNeuralNetwork(2, entangle_layer)
+        # entangle_layer = BasicEntangleLayers(weights, 2)
+        encoder_layer = AngleEmbedding(np.random.random((2,)), 2)
+        qnn = QuantumNeuralNetwork(2, encoder_layer)
         batch_size = 1
 
         # Legacy invokation style
@@ -180,7 +184,85 @@ def test_torch_layer_qnn_real_machine(self):
         model = ModelQuantumNeuralNetworkNative(qnn)
         self._model_grad(model, batch_size)
 
-    def test_classification_on_random_dataset(self, num_epochs, batch_size):
+    def test_module_wrapper(self):
+        weights = np.random.randn(2, 2)
+        entangle_layer = BasicEntangleLayers(weights, 2)
+        qnn = QuantumNeuralNetwork(2, entangle_layer)
+        qnn.measure([0, 1], [0, 1])
+
+        qlayer = ModuleWrapper(qnn)
+        params = qlayer.parameters()
+
+        assert np.allclose(
+            qlayer.weights.detach().numpy(), params.__next__().detach().numpy()
+        )
+
+    def test_classify_random_dataset_quantum(self, num_epochs, batch_size):
+        """Test a pure quantum nn training using a synthetic dataset
+
+        Args:
+            num_epochs: number of epochs for training
+            batch_size: batch size for training
+
+        """
+        # Define the hyperparameters
+        num_inputs = 2
+        num_classes = 2
+        learning_rate = 0.01
+
+        # Generate the dataset
+        dataset = _generate_random_dataset(num_inputs, 100)
+
+        # Create QNN
+        num_qubits = num_classes
+        weights = np.random.randn(num_qubits, 2)
+        encoder_layer = AngleEmbedding(np.random.random((2,)), num_qubits=2)
+        entangle_layer = BasicEntangleLayers(weights, 2)
+        qnn = QuantumNeuralNetwork(num_qubits, encoder_layer + entangle_layer)
+
+        # Wrap the QNN as a torch module
+        model = ModuleWrapper(qnn)
+        # model = mlp
+
+        # Define the loss function and optimizer
+        criterion = nn.CrossEntropyLoss()
+        optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
+
+        # Create data loader
+        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+
+        # Train the model
+        for epoch in range(num_epochs):
+            for inputs, labels in data_loader:
+                # Forward pass
+                outputs = model(inputs)
+
+                # Compute the loss
+                loss = criterion(outputs, labels)
+
+                # Backward pass
+                optimizer.zero_grad()
+                loss.backward()
+
+                # Update the parameters
+                optimizer.step()
+
+            # Print the loss
+            print(f"Epoch {epoch + 1}/{num_epochs}: Loss = {loss.item()}")
+
+        # Evaluate the model on the dataset
+        correct = 0
+        total = 0
+        with torch.no_grad():
+            for inputs, labels in data_loader:
+                outputs = model(inputs)
+                _, predicted = torch.max(outputs.data, 1)
+                total += labels.size(0)
+                correct += (predicted == labels.argmax(dim=1)).sum().item()
+
+        print(f"Accuracy: {100 * correct / total:.2f}%")
+
+    def test_classify_random_dataset_hybrid(self, num_epochs, batch_size):
         """Test e2e hybrid quantum-classical nn training using a synthetic dataset
 
         Args:
diff --git a/tests/quafu/algorithms/varational_test.py b/tests/quafu/algorithms/varational_test.py
index 7bcbae8..0376b57 100644
--- a/tests/quafu/algorithms/varational_test.py
+++ b/tests/quafu/algorithms/varational_test.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 import scipy.sparse as sp
-from quafu.algorithms.gradient import grad_adjoint, grad_finit_diff, grad_para_shift
+from quafu.algorithms.gradients import grad_adjoint, grad_finit_diff, grad_para_shift
 from quafu.algorithms.hamiltonian import Hamiltonian, PauliMats, PauliOp
 from quafu.elements import Parameter
 from quafu.elements.element_gates import CRYGate, CXGate, HGate, RXGate, RYGate, RZGate