From f430d73085a77dc5a23b89a7d25d9e6aa56d784e Mon Sep 17 00:00:00 2001
From: Denghui Lu <denghuilu@pku.edu.cn>
Date: Fri, 12 Jan 2024 15:14:24 +0800
Subject: [PATCH 1/3] Feature: Add CUDA support for stress_mgga function
 (#3474)

* init CPU implementation

* init gpu implementation

* fix cpu double error

* fix single precision error

* init gtest

* init test compilation

* finish work

* fix Makefile error

* fix UT error

* fix UT error
---
 source/CMakeLists.txt                         |   2 +
 source/Makefile.Objects                       |   2 +
 .../module_container/ATen/core/tensor.h       |  34 ++++++
 .../module_container/ATen/core/tensor_types.h |  14 +++
 .../ATen/kernels/test/blas_test.cpp           |   8 +-
 .../ATen/kernels/test/lapack_test.cpp         |  12 +-
 .../ATen/kernels/test/linalg_test.cpp         |   4 +-
 .../ATen/kernels/test/memory_test.cpp         |   4 +-
 .../ATen/ops/test/einsum_op_test.cpp          |  13 ++-
 .../ATen/ops/test/linalg_op_test.cpp          |   4 +-
 .../module_container/base/utils/gtest.h       |  57 ++++++++++
 .../module_container/test/test_utils.h        |  57 ----------
 source/module_basis/module_pw/fft.cpp         |  42 ++-----
 .../module_surchem/test/CMakeLists.txt        |   4 +-
 .../module_xc/CMakeLists.txt                  |   1 +
 .../kernels/cuda/xc_functional_op.cu          |  79 +++++++++++++
 .../module_xc/kernels/test/CMakeLists.txt     |   5 +
 .../kernels/test/xc_functional_op_test.cpp    |  81 +++++++++++++
 .../module_xc/kernels/xc_functional_op.cpp    |  49 ++++++++
 .../module_xc/kernels/xc_functional_op.h      |  32 ++++++
 .../module_xc/test/CMakeLists.txt             |  35 ++++--
 .../module_xc/test/test_xc3.cpp               |  17 ++-
 .../module_xc/test/test_xc5.cpp               |  10 --
 .../module_xc/test/xc3_mock.h                 | 107 ++++++++++++++++--
 .../module_xc/xc_functional.h                 |  16 ++-
 .../module_xc/xc_functional_gradcorr.cpp      |  69 ++++++-----
 .../hamilt_pwdft/kernels/cuda/stress_op.cu    |  44 ++++++-
 .../hamilt_pwdft/kernels/stress_op.cpp        |  27 +++++
 .../hamilt_pwdft/kernels/stress_op.h          |  13 +++
 .../hamilt_pwdft/kernels/test/CMakeLists.txt  |   5 +-
 .../kernels/test/stress_op_mgga_test.cpp      |  45 ++++++++
 .../hamilt_pwdft/stress_func.h                |   2 +-
 .../hamilt_pwdft/stress_func_mgga.cpp         |  72 ++++--------
 .../hamilt_pwdft/stress_pw.cpp                |   2 +-
 34 files changed, 740 insertions(+), 228 deletions(-)
 create mode 100644 source/module_base/module_container/base/utils/gtest.h
 delete mode 100644 source/module_base/module_container/test/test_utils.h
 create mode 100644 source/module_hamilt_general/module_xc/kernels/cuda/xc_functional_op.cu
 create mode 100644 source/module_hamilt_general/module_xc/kernels/test/CMakeLists.txt
 create mode 100644 source/module_hamilt_general/module_xc/kernels/test/xc_functional_op_test.cpp
 create mode 100644 source/module_hamilt_general/module_xc/kernels/xc_functional_op.cpp
 create mode 100644 source/module_hamilt_general/module_xc/kernels/xc_functional_op.h
 create mode 100644 source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_mgga_test.cpp

diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 3e866b8323..de64277d1c 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -39,6 +39,7 @@ list(APPEND device_srcs
   module_hamilt_pw/hamilt_pwdft/kernels/wf_op.cpp
   module_hamilt_pw/hamilt_pwdft/kernels/vnl_op.cpp
   module_base/kernels/math_op.cpp
+  module_hamilt_general/module_xc/kernels/xc_functional_op.cpp
 )
 
 if(USE_CUDA)
@@ -57,6 +58,7 @@ if(USE_CUDA)
     module_hamilt_pw/hamilt_pwdft/kernels/cuda/wf_op.cu
     module_hamilt_pw/hamilt_pwdft/kernels/cuda/vnl_op.cu
     module_base/kernels/cuda/math_op.cu
+    module_hamilt_general/module_xc/kernels/cuda/xc_functional_op.cu
   )
 endif()
 
diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index f0e4c1331c..a0f0065a10 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -41,6 +41,7 @@ VPATH=./src_global:\
 ./module_hamilt_general/module_ewald:\
 ./module_hamilt_general/module_surchem:\
 ./module_hamilt_general/module_xc:\
+./module_hamilt_general/module_xc/kernels:\
 ./module_hamilt_pw/hamilt_pwdft:\
 ./module_hamilt_pw/hamilt_ofdft:\
 ./module_hamilt_pw/hamilt_stodft:\
@@ -392,6 +393,7 @@ OBJS_SYMMETRY=symm_other.o\
     symmetry.o\
 
 OBJS_XC=xc_functional.o\
+    xc_functional_op.o\
     xc_functional_vxc.o\
     xc_functional_gradcorr.o\
     xc_functional_wrapper_xc.o\
diff --git a/source/module_base/module_container/ATen/core/tensor.h b/source/module_base/module_container/ATen/core/tensor.h
index 826c7e19c0..3faf820dbd 100644
--- a/source/module_base/module_container/ATen/core/tensor.h
+++ b/source/module_base/module_container/ATen/core/tensor.h
@@ -244,6 +244,40 @@ class Tensor {
         return output;
     }
 
+    /**
+     * @brief Copies data from a given device to the current tensor object.
+     * 
+     * This function is designed to copy a given number of elements from a device-specific memory location
+     * to the memory associated with this object. It ensures that the size of the data being copied does not exceed
+     * the size of the destination tensor.
+     *
+     * @tparam DEVICE The device type from which the data will be copied.
+     * @tparam T The data type of the elements being copied.
+     * 
+     * @param data Pointer to the data array in the device memory that needs to be copied.
+     * @param num_elements The number of elements to copy.
+     * 
+     * @pre The number of elements to copy (`num_elements`) must be less than or equal to the number of elements 
+     *      in the destination tensor (`this->shape_.num_elements()`). If this condition is not met, the function
+     *      will trigger an error through `REQUIRES_OK`.
+     * 
+     * @note The function uses a template specialization `TEMPLATE_CZ_2` to handle the copying of memory
+     *       based on the data type `T` and the device type `DEVICE`. It utilizes the `kernels::cast_memory`
+     *       method to perform the actual memory copy operation.
+     */
+    template <typename DEVICE, typename T>
+    void copy_from_device(const T* data, int64_t num_elements = -1) {
+        if (num_elements == -1) {
+            num_elements = this->NumElements();
+        }
+        REQUIRES_OK(this->shape_.NumElements() >= num_elements,
+                    "The number of elements of the input data must match the number of elements of the tensor.")
+
+        TEMPLATE_CZ_2(this->data_type_, this->device_,
+                   kernels::cast_memory<T_, T, DEVICE_, DEVICE>()(
+                           this->data<T_>(), data, num_elements))
+    }
+
     /**
      * @brief Method to transform data from a given tensor object to the output tensor with a given data type
      *
diff --git a/source/module_base/module_container/ATen/core/tensor_types.h b/source/module_base/module_container/ATen/core/tensor_types.h
index 0722c4e53b..254ad22d3e 100644
--- a/source/module_base/module_container/ATen/core/tensor_types.h
+++ b/source/module_base/module_container/ATen/core/tensor_types.h
@@ -124,6 +124,20 @@ struct PsiToContainer<psi::DEVICE_GPU> {
     using type = container::DEVICE_GPU; /**< The return type specialization for std::complex<double>. */
 };
 
+template <typename T> 
+struct ContainerToPsi {
+    using type = T; /**< The return type based on the input type. */
+};
+
+template <>
+struct ContainerToPsi<container::DEVICE_CPU> {
+    using type = psi::DEVICE_CPU; /**< The return type specialization for std::complex<float>. */
+};
+
+template <>
+struct ContainerToPsi<container::DEVICE_GPU> {
+    using type = psi::DEVICE_GPU; /**< The return type specialization for std::complex<double>. */
+};
 
 
 /**
diff --git a/source/module_base/module_container/ATen/kernels/test/blas_test.cpp b/source/module_base/module_container/ATen/kernels/test/blas_test.cpp
index f5ff285473..a01ef8beb9 100644
--- a/source/module_base/module_container/ATen/kernels/test/blas_test.cpp
+++ b/source/module_base/module_container/ATen/kernels/test/blas_test.cpp
@@ -2,7 +2,7 @@
 
 #include <ATen/core/tensor.h>
 #include <ATen/kernels/blas.h>
-#include <test/test_utils.h>
+#include <base/utils/gtest.h>
 
 namespace container {
 namespace kernels {
@@ -11,14 +11,14 @@ template <typename T>
 class BlasTest : public testing::Test {
 public:
     BlasTest() {
-        test_utils::init_blas_handle();
+        base::utils::init_blas_handle();
     }
     ~BlasTest() override {
-        test_utils::delete_blas_handle();
+        base::utils::delete_blas_handle();
     }
 };
 
-TYPED_TEST_SUITE(BlasTest, test_utils::Types);
+TYPED_TEST_SUITE(BlasTest, base::utils::Types);
 
 TYPED_TEST(BlasTest, Dot) {
     using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
diff --git a/source/module_base/module_container/ATen/kernels/test/lapack_test.cpp b/source/module_base/module_container/ATen/kernels/test/lapack_test.cpp
index 7e340d510a..f9998c018c 100644
--- a/source/module_base/module_container/ATen/kernels/test/lapack_test.cpp
+++ b/source/module_base/module_container/ATen/kernels/test/lapack_test.cpp
@@ -2,7 +2,7 @@
 
 #include <ATen/core/tensor.h>
 #include <ATen/kernels/lapack.h>
-#include <test/test_utils.h>
+#include <base/utils/gtest.h>
 
 namespace container {
 namespace kernels {
@@ -11,16 +11,16 @@ template <typename T>
 class LapackTest : public testing::Test {
 public:
     LapackTest() {
-        test_utils::init_blas_handle();
-        test_utils::init_cusolver_handle();
+        base::utils::init_blas_handle();
+        base::utils::init_cusolver_handle();
     }
     ~LapackTest() override {
-        test_utils::delete_blas_handle();
-        test_utils::delete_cusolver_handle();
+        base::utils::delete_blas_handle();
+        base::utils::delete_cusolver_handle();
     }
 };
 
-TYPED_TEST_SUITE(LapackTest, test_utils::Types);
+TYPED_TEST_SUITE(LapackTest, base::utils::Types);
 
 TYPED_TEST(LapackTest, Trtri) {
     using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
diff --git a/source/module_base/module_container/ATen/kernels/test/linalg_test.cpp b/source/module_base/module_container/ATen/kernels/test/linalg_test.cpp
index 31f6bd21eb..8b0afe634f 100644
--- a/source/module_base/module_container/ATen/kernels/test/linalg_test.cpp
+++ b/source/module_base/module_container/ATen/kernels/test/linalg_test.cpp
@@ -2,7 +2,7 @@
 
 #include <ATen/core/tensor.h>
 #include <ATen/kernels/linalg.h>
-#include <test/test_utils.h>
+#include <base/utils/gtest.h>
 
 namespace container {
 namespace kernels {
@@ -15,7 +15,7 @@ class LinalgTest : public testing::Test {
     ~LinalgTest() override = default;
 };
 
-TYPED_TEST_SUITE(LinalgTest, test_utils::Types);
+TYPED_TEST_SUITE(LinalgTest, base::utils::Types);
 
 TYPED_TEST(LinalgTest, Add) {
     using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
diff --git a/source/module_base/module_container/ATen/kernels/test/memory_test.cpp b/source/module_base/module_container/ATen/kernels/test/memory_test.cpp
index b5bdc7eccf..e8943ae610 100644
--- a/source/module_base/module_container/ATen/kernels/test/memory_test.cpp
+++ b/source/module_base/module_container/ATen/kernels/test/memory_test.cpp
@@ -3,7 +3,7 @@
 #include <ATen/core/tensor.h>
 #include <ATen/core/tensor_map.h>
 #include <ATen/kernels/memory.h>
-#include <test/test_utils.h>
+#include <base/utils/gtest.h>
 
 namespace container {
 namespace kernels {
@@ -15,7 +15,7 @@ class MemoryTest : public testing::Test {
     ~MemoryTest() override = default;
 };
 
-TYPED_TEST_SUITE(MemoryTest, test_utils::Types);
+TYPED_TEST_SUITE(MemoryTest, base::utils::Types);
 
 TYPED_TEST(MemoryTest, ResizeAndSynchronizeMemory) {
     using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
diff --git a/source/module_base/module_container/ATen/ops/test/einsum_op_test.cpp b/source/module_base/module_container/ATen/ops/test/einsum_op_test.cpp
index ce4381e789..efec0072f0 100644
--- a/source/module_base/module_container/ATen/ops/test/einsum_op_test.cpp
+++ b/source/module_base/module_container/ATen/ops/test/einsum_op_test.cpp
@@ -2,7 +2,8 @@
 
 #include <ATen/core/tensor.h>
 #include <ATen/ops/einsum_op.h>
-#include <test/test_utils.h>
+#include <base/utils/gtest.h>
+
 
 namespace container {
 namespace op {
@@ -11,16 +12,16 @@ template <typename T>
 class EinsumOpTest : public testing::Test {
 public:
     EinsumOpTest() {
-        test_utils::init_blas_handle();
-        test_utils::init_cusolver_handle();
+        base::utils::init_blas_handle();
+        base::utils::init_cusolver_handle();
     }
     ~EinsumOpTest() override {
-        test_utils::delete_blas_handle();
-        test_utils::delete_cusolver_handle();
+        base::utils::delete_blas_handle();
+        base::utils::delete_cusolver_handle();
     }
 };
 
-TYPED_TEST_SUITE(EinsumOpTest, test_utils::Types);
+TYPED_TEST_SUITE(EinsumOpTest, base::utils::Types);
 
 TYPED_TEST(EinsumOpTest, Transform) {
     using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
diff --git a/source/module_base/module_container/ATen/ops/test/linalg_op_test.cpp b/source/module_base/module_container/ATen/ops/test/linalg_op_test.cpp
index 8f0d34491c..610daec816 100644
--- a/source/module_base/module_container/ATen/ops/test/linalg_op_test.cpp
+++ b/source/module_base/module_container/ATen/ops/test/linalg_op_test.cpp
@@ -2,7 +2,7 @@
 
 #include <ATen/core/tensor.h>
 #include <ATen/ops/linalg_op.h>
-#include <test/test_utils.h>
+#include <base/utils/gtest.h>
 
 namespace container {
 namespace op {
@@ -14,7 +14,7 @@ class LinalgOpTest : public testing::Test {
     ~LinalgOpTest() override = default;
 };
 
-TYPED_TEST_SUITE(LinalgOpTest, test_utils::Types);
+TYPED_TEST_SUITE(LinalgOpTest, base::utils::Types);
 
 TYPED_TEST(LinalgOpTest, Add) {
     using Type   = typename std::tuple_element<0, decltype(TypeParam())>::type;
diff --git a/source/module_base/module_container/base/utils/gtest.h b/source/module_base/module_container/base/utils/gtest.h
new file mode 100644
index 0000000000..4ec7ecb118
--- /dev/null
+++ b/source/module_base/module_container/base/utils/gtest.h
@@ -0,0 +1,57 @@
+#ifndef BASE_UTILS_GTEST_H_
+#define BASE_UTILS_GTEST_H_
+#include <gtest/gtest.h>
+#include <ATen/kernels/blas.h>
+#include <ATen/kernels/lapack.h>
+
+namespace base {
+namespace utils {
+
+#if __CUDA || __ROCM
+using ComplexTypes = ::testing::Types<
+        std::tuple<std::complex<float>, ct::DEVICE_CPU>, std::tuple<std::complex<float>, ct::DEVICE_GPU>,
+        std::tuple<std::complex<double>, ct::DEVICE_CPU>,  std::tuple<std::complex<double>, ct::DEVICE_GPU>>;
+using Types = ::testing::Types<
+        std::tuple<float, ct::DEVICE_CPU>, std::tuple<float, ct::DEVICE_GPU>,
+        std::tuple<double, ct::DEVICE_CPU>, std::tuple<double, ct::DEVICE_GPU>,
+        std::tuple<std::complex<float>, ct::DEVICE_CPU>, std::tuple<std::complex<float>, ct::DEVICE_GPU>,
+        std::tuple<std::complex<double>, ct::DEVICE_CPU>,  std::tuple<std::complex<double>, ct::DEVICE_GPU>>;
+#else 
+using ComplexTypes = ::testing::Types<
+        std::tuple<std::complex<float>, ct::DEVICE_CPU>,
+        std::tuple<std::complex<double>, ct::DEVICE_CPU>>;
+using Types = ::testing::Types<
+        std::tuple<float, ct::DEVICE_CPU>, 
+        std::tuple<double, ct::DEVICE_CPU>,
+        std::tuple<std::complex<float>, ct::DEVICE_CPU>,
+        std::tuple<std::complex<double>, ct::DEVICE_CPU>>;
+#endif 
+
+static inline void init_blas_handle() {
+    #if __CUDA || __ROCM
+        ct::kernels::createGpuBlasHandle();
+    #endif
+}
+
+static inline void delete_blas_handle() {
+    #if __CUDA || __ROCM
+        ct::kernels::destroyGpuBlasHandle();
+    #endif
+}
+
+static inline void init_cusolver_handle() {
+    #if __CUDA || __ROCM
+        ct::kernels::createGpuSolverHandle();
+    #endif
+}
+
+static inline void delete_cusolver_handle() {
+    #if __CUDA || __ROCM
+        ct::kernels::destroyGpuSolverHandle();
+    #endif
+}
+
+} // namespace utils
+} // namespace base
+
+#endif // BASE_UTILS_GTEST_H_
\ No newline at end of file
diff --git a/source/module_base/module_container/test/test_utils.h b/source/module_base/module_container/test/test_utils.h
deleted file mode 100644
index d50120c34a..0000000000
--- a/source/module_base/module_container/test/test_utils.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef ATEN_KERNELS_TEST_OP_TEST_UTILS_H_
-#define ATEN_KERNELS_TEST_OP_TEST_UTILS_H_
-#include <gtest/gtest.h>
-#include "module_base/module_container/ATen/kernels/blas.h"
-#include "module_base/module_container/ATen/kernels/lapack.h"
-
-namespace container {
-namespace test_utils {
-
-# if __CUDA || __ROCM
-using ComplexTypes = ::testing::Types<
-        std::tuple<std::complex<float>, DEVICE_CPU>, std::tuple<std::complex<float>, DEVICE_GPU>,
-        std::tuple<std::complex<double>, DEVICE_CPU>,  std::tuple<std::complex<double>, DEVICE_GPU>>;
-using Types = ::testing::Types<
-        std::tuple<float, DEVICE_CPU>, std::tuple<float, DEVICE_GPU>,
-        std::tuple<double, DEVICE_CPU>, std::tuple<double, DEVICE_GPU>,
-        std::tuple<std::complex<float>, DEVICE_CPU>, std::tuple<std::complex<float>, DEVICE_GPU>,
-        std::tuple<std::complex<double>, DEVICE_CPU>,  std::tuple<std::complex<double>, DEVICE_GPU>>;
-#else 
-using ComplexTypes = ::testing::Types<
-        std::tuple<std::complex<float>, DEVICE_CPU>,
-        std::tuple<std::complex<double>, DEVICE_CPU>>;
-using Types = ::testing::Types<
-        std::tuple<float, DEVICE_CPU>, 
-        std::tuple<double, DEVICE_CPU>,
-        std::tuple<std::complex<float>, DEVICE_CPU>,
-        std::tuple<std::complex<double>, DEVICE_CPU>>;
-#endif 
-
-static inline void init_blas_handle() {
-    #if __CUDA || __ROCM
-        kernels::createGpuBlasHandle();
-    #endif
-}
-
-static inline void delete_blas_handle() {
-    #if __CUDA || __ROCM
-        kernels::destroyGpuBlasHandle();
-    #endif
-}
-
-static inline void init_cusolver_handle() {
-    #if __CUDA || __ROCM
-        kernels::createGpuSolverHandle();
-    #endif
-}
-
-static inline void delete_cusolver_handle() {
-    #if __CUDA || __ROCM
-        kernels::destroyGpuSolverHandle();
-    #endif
-}
-
-} // namespace test_utils
-} // namespace container
-
-#endif // ATEN_KERNELS_TEST_OP_TEST_UTILS_H_
\ No newline at end of file
diff --git a/source/module_basis/module_pw/fft.cpp b/source/module_basis/module_pw/fft.cpp
index ce15d4a9fb..c6ea0166f7 100644
--- a/source/module_basis/module_pw/fft.cpp
+++ b/source/module_basis/module_pw/fft.cpp
@@ -22,17 +22,13 @@ void FFT::clear()
 	d_rspace = nullptr;
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == "gpu") {
-        if (this->precision == "single") {
-            if (c_auxr_3d != nullptr) {
-                delmem_cd_op()(gpu_ctx, c_auxr_3d);
-                c_auxr_3d = nullptr;
-            }
+        if (c_auxr_3d != nullptr) {
+            delmem_cd_op()(gpu_ctx, c_auxr_3d);
+            c_auxr_3d = nullptr;
         }
-        else {
-            if (z_auxr_3d != nullptr) {
-                delmem_zd_op()(gpu_ctx, z_auxr_3d);
-                z_auxr_3d = nullptr;
-            }
+        if (z_auxr_3d != nullptr) {
+            delmem_zd_op()(gpu_ctx, z_auxr_3d);
+            z_auxr_3d = nullptr;
         }
     }
 #endif // defined(__CUDA) || defined(__ROCM)
@@ -86,12 +82,8 @@ void FFT:: initfft(int nx_in, int ny_in, int nz_in, int lixy_in, int rixy_in, in
         //     fftw_malloc(sizeof(fftw_complex) * (this->nx * this->ny * this->nz)));
 #if defined(__CUDA) || defined(__ROCM)
         if (this->device == "gpu") {
-            if (this->precision == "single") {
-                resmem_cd_op()(gpu_ctx, this->c_auxr_3d, this->nx * this->ny * this->nz);
-            }
-            else {
-                resmem_zd_op()(gpu_ctx, this->z_auxr_3d, this->nx * this->ny * this->nz);
-            }
+            resmem_cd_op()(gpu_ctx, this->c_auxr_3d, this->nx * this->ny * this->nz);
+            resmem_zd_op()(gpu_ctx, this->z_auxr_3d, this->nx * this->ny * this->nz);
         }
 #endif // defined(__CUDA) || defined(__ROCM)
 #if defined(__ENABLE_FLOAT_FFTW)
@@ -242,20 +234,13 @@ void FFT :: initplan(const unsigned int& flag)
 
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == "gpu") {
-        if (this->precision == "single") {
         #if defined(__CUDA)
             cufftPlan3d(&c_handle, this->nx, this->ny, this->nz, CUFFT_C2C);
-        #elif defined(__ROCM)
-            hipfftPlan3d(&c_handle, this->nx, this->ny, this->nz, HIPFFT_C2C);
-        #endif
-        }
-        else {
-        #if defined(__CUDA)
             cufftPlan3d(&z_handle, this->nx, this->ny, this->nz, CUFFT_Z2Z);
         #elif defined(__ROCM)
+            hipfftPlan3d(&c_handle, this->nx, this->ny, this->nz, HIPFFT_C2C);
             hipfftPlan3d(&z_handle, this->nx, this->ny, this->nz, HIPFFT_Z2Z);
         #endif
-        }
     }
 #endif
 
@@ -363,20 +348,13 @@ void FFT:: cleanFFT()
     // fftw_destroy_plan(this->plan3dbackward);
 #if defined(__CUDA) || defined(__ROCM)
     if (this->device == "gpu") {
-        if (this->precision == "single") {
         #if defined(__CUDA)
             if(c_handle) { cufftDestroy(c_handle);  c_handle = {};}
-        #elif defined(__ROCM)
-            if(c_handle) { hipfftDestroy(c_handle); c_handle = {};}
-        #endif
-        }
-        else {
-        #if defined(__CUDA)
             if(z_handle) { cufftDestroy(z_handle);  z_handle = {};}
         #elif defined(__ROCM)
+            if(c_handle) { hipfftDestroy(c_handle); c_handle = {};}
             if(z_handle) { hipfftDestroy(z_handle); z_handle = {};}
         #endif
-        }
     }
 #endif
 }
diff --git a/source/module_hamilt_general/module_surchem/test/CMakeLists.txt b/source/module_hamilt_general/module_surchem/test/CMakeLists.txt
index 91ead3f567..1cc52484e7 100644
--- a/source/module_hamilt_general/module_surchem/test/CMakeLists.txt
+++ b/source/module_hamilt_general/module_surchem/test/CMakeLists.txt
@@ -28,7 +28,7 @@ AddTest(
 
 AddTest(
   TARGET surchem_cal_vcav
-  LIBS ${math_libs} planewave device base
+  LIBS ${math_libs} planewave device base container
   SOURCES cal_vcav_test.cpp  ../cal_vcav.cpp ../surchem.cpp ../../../module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp 
   ../../module_xc/xc_functional_gradcorr.cpp ../../module_xc/xc_functional.cpp
   ../../module_xc/xc_functional_wrapper_xc.cpp ../../module_xc/xc_functional_wrapper_gcxc.cpp
@@ -39,7 +39,7 @@ AddTest(
 
 AddTest(
   TARGET surchem_cal_vel
-  LIBS ${math_libs} planewave device base
+  LIBS ${math_libs} planewave device base container
   SOURCES cal_vel_test.cpp  ../cal_vel.cpp ../surchem.cpp ../cal_epsilon.cpp ../minimize_cg.cpp ../../../module_hamilt_pw/hamilt_pwdft/parallel_grid.cpp 
   ../../module_xc/xc_functional_gradcorr.cpp ../../module_xc/xc_functional.cpp
   ../../module_xc/xc_functional_wrapper_xc.cpp ../../module_xc/xc_functional_wrapper_gcxc.cpp
diff --git a/source/module_hamilt_general/module_xc/CMakeLists.txt b/source/module_hamilt_general/module_xc/CMakeLists.txt
index 5c72f608fd..087a2d22ff 100644
--- a/source/module_hamilt_general/module_xc/CMakeLists.txt
+++ b/source/module_hamilt_general/module_xc/CMakeLists.txt
@@ -21,5 +21,6 @@ endif()
 if(BUILD_TESTING)
 if(ENABLE_LIBXC)
   add_subdirectory(test)
+  add_subdirectory(kernels/test)
 endif()
 endif()
diff --git a/source/module_hamilt_general/module_xc/kernels/cuda/xc_functional_op.cu b/source/module_hamilt_general/module_xc/kernels/cuda/xc_functional_op.cu
new file mode 100644
index 0000000000..43c137153d
--- /dev/null
+++ b/source/module_hamilt_general/module_xc/kernels/cuda/xc_functional_op.cu
@@ -0,0 +1,79 @@
+#include <module_hamilt_general/module_xc/kernels/xc_functional_op.h>
+#include <module_psi/kernels/device.h>
+#include <thrust/complex.h>
+
+#define THREADS_PER_BLOCK 256
+
+namespace hamilt {
+
+template <typename T>
+__global__ void xc_functional_grad_wfc(
+    const int ik,
+    const int pol,
+    const int npw,
+    const int npwx,
+	const T tpiba,
+    const T* gcar,
+    const T* kvec_c,
+    const thrust::complex<T>* rhog,
+    thrust::complex<T>* porter)
+{
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= npw) { return; }
+	// the formula is : rho(r)^prime = \int iG * rho(G)e^{iGr} dG
+	// double kplusg = wfc_basis->getgpluskcar(ik,ig)[ipol] * tpiba;
+    T kplusg = (gcar[(ik * npwx + idx) * 3 + pol] +
+                   kvec_c[ik * 3 + pol]) * tpiba;
+	// calculate the charge density gradient in reciprocal space.
+	porter[idx] = thrust::complex<T>(0.0, kplusg) * rhog[idx];
+}
+
+template <typename T>
+__global__ void xc_functional_grad_wfc(
+    const int ipol,
+    const int nrxx,
+    const T* porter,
+    T* grad)
+{
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= nrxx) { return; }
+    grad[ipol * nrxx + idx] = porter[idx];
+}
+
+template <typename T, typename Device>
+void xc_functional_grad_wfc_op<T, Device>::operator()(
+    const int& ik,
+    const int& pol,
+    const int& npw,
+    const int& npwx,
+	const Real& tpiba,
+    const Real * gcar,
+    const Real * kvec_c,
+    const T * rhog,
+    T* porter)
+{
+    auto porter_ = reinterpret_cast<thrust::complex<Real>*>(porter);
+    auto rhog_ = reinterpret_cast<const thrust::complex<Real>*>(rhog);
+    const int block = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    xc_functional_grad_wfc<Real><<<block, THREADS_PER_BLOCK>>>(
+        ik, pol, npw, npwx, tpiba, gcar, kvec_c, rhog_, porter_);
+}
+
+template <typename T, typename Device>
+void xc_functional_grad_wfc_op<T, Device>::operator()(
+    const int& ipol,
+    const int& nrxx,
+    const T * porter,
+    T* grad)
+{
+    auto grad_ = reinterpret_cast<thrust::complex<Real>*>(grad);
+    auto porter_ = reinterpret_cast<const thrust::complex<Real>*>(porter);
+    const int block = (nrxx + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    xc_functional_grad_wfc<<<block, THREADS_PER_BLOCK>>>(
+        ipol, nrxx, porter_, grad_);
+}
+
+template struct xc_functional_grad_wfc_op<std::complex<float> , psi::DEVICE_GPU>;
+template struct xc_functional_grad_wfc_op<std::complex<double>, psi::DEVICE_GPU>;
+
+} // namespace hamilt   
\ No newline at end of file
diff --git a/source/module_hamilt_general/module_xc/kernels/test/CMakeLists.txt b/source/module_hamilt_general/module_xc/kernels/test/CMakeLists.txt
new file mode 100644
index 0000000000..abaf529ae9
--- /dev/null
+++ b/source/module_hamilt_general/module_xc/kernels/test/CMakeLists.txt
@@ -0,0 +1,5 @@
+AddTest(
+    TARGET XC_Functional_UTs
+    LIBS ${math_libs} device base container
+    SOURCES xc_functional_op_test.cpp
+)
diff --git a/source/module_hamilt_general/module_xc/kernels/test/xc_functional_op_test.cpp b/source/module_hamilt_general/module_xc/kernels/test/xc_functional_op_test.cpp
new file mode 100644
index 0000000000..2861e0601a
--- /dev/null
+++ b/source/module_hamilt_general/module_xc/kernels/test/xc_functional_op_test.cpp
@@ -0,0 +1,81 @@
+#include <module_hamilt_general/module_xc/kernels/xc_functional_op.h>
+
+#include <base/utils/gtest.h>
+#include <ATen/core/tensor.h>
+
+namespace hamilt {
+
+template<typename T>
+class XC_FunctionalOpTest : public testing::Test {
+public:
+    XC_FunctionalOpTest() = default;
+
+    ~XC_FunctionalOpTest() override = default;
+};
+
+TYPED_TEST_SUITE(XC_FunctionalOpTest, base::utils::ComplexTypes);
+
+TYPED_TEST(XC_FunctionalOpTest, xc_functional_grad_wfc_op) {
+    using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
+    using Device = typename std::tuple_element<1, decltype(TypeParam())>::type;
+    using Real = typename GetTypeReal<Type>::type;
+    
+    const int ik = 0;
+    const int ipol = 0;
+    const int npw = 4;
+    const int npwx = 4;
+    const int nrxx = 4;
+    const Real tpiba = 3.3249187157734292;
+
+    ct::Tensor gcar = std::move(ct::Tensor(
+               {static_cast<Real>(0), static_cast<Real>(0), static_cast<Real>(-0.46354790605367302),
+                static_cast<Real>(0), static_cast<Real>(0), static_cast<Real>(-0.30903193736911533),
+                static_cast<Real>(0), static_cast<Real>(0), static_cast<Real>(-0.15451596868455766),
+                static_cast<Real>(0), static_cast<Real>(0), static_cast<Real>(0)}).to_device<Device>());
+
+    ct::Tensor kvec_c = std::move(ct::Tensor({static_cast<Real>(0)}).to_device<Device>());
+
+    ct::Tensor rhog = std::move(ct::Tensor(
+               {static_cast<Type>(-3.0017116103913875e-05, 7.1056289872527673e-08),
+                static_cast<Type>(-7.5546991753311607e-05, -2.3428181621642512e-07),
+                static_cast<Type>(-0.00092404657246742273, 1.604946087155834e-06),
+                static_cast<Type>(0.98692408978015989, 1.8132214954316574e-09)}).to_device<Device>());
+
+    ct::Tensor expected_porter = std::move(ct::Tensor(
+               {static_cast<Type>(0),
+                static_cast<Type>(0),
+                static_cast<Type>(0),
+                static_cast<Type>(0)}).to_device<Device>());
+    
+    auto porter = expected_porter;
+
+    ct::Tensor porter_after = std::move(ct::Tensor(
+               {static_cast<Type>(1.909712182465086e-07, 3.6064630015014698e-16),
+                static_cast<Type>(-1.6872327080003407e-07, -3.4645680421774294e-16),
+                static_cast<Type>(-2.3926040075122384e-07, -5.37330596683816e-16),
+                static_cast<Type>(1.5465312289181674e-08, -8.7169854667834556e-17)}).to_device<Device>());
+
+    ct::Tensor expected_grad = porter;
+    auto grad = expected_grad;
+    grad.zero();
+
+    auto xc_functional_grad_wfc_solver = xc_functional_grad_wfc_op<Type, typename ct::ContainerToPsi<Device>::type>();
+    
+    xc_functional_grad_wfc_solver(
+        ik, ipol, npw, npwx, // Integers
+		tpiba,	// Double
+        gcar.data<Real>(),   // Array of Real
+        kvec_c.data<Real>(), // Array of double
+		rhog.data<Type>(), porter.data<Type>());    // Array of std::complex<double>
+
+    EXPECT_EQ(porter, expected_porter);
+
+
+	xc_functional_grad_wfc_solver(
+        ipol, nrxx,	// Integers
+		porter_after.data<Type>(), grad.data<Type>());	// Array of std::complex<double>
+
+    EXPECT_EQ(grad, expected_grad);
+}
+
+} // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_general/module_xc/kernels/xc_functional_op.cpp b/source/module_hamilt_general/module_xc/kernels/xc_functional_op.cpp
new file mode 100644
index 0000000000..d553764a42
--- /dev/null
+++ b/source/module_hamilt_general/module_xc/kernels/xc_functional_op.cpp
@@ -0,0 +1,49 @@
+#include <module_hamilt_general/module_xc/kernels/xc_functional_op.h>
+
+namespace hamilt {
+
+template <typename T, typename Device>
+void xc_functional_grad_wfc_op<T, Device>::operator()(
+    const int& ik,
+    const int& pol,
+    const int& npw,
+    const int& npwx,
+	const Real& tpiba,
+    const Real * gcar,
+    const Real * kvec_c,
+    const T * rhog,
+    T* porter)
+{
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 1024)
+#endif
+    for(int ig = 0; ig < npw; ig++) {
+		// the formula is : rho(r)^prime = \int iG * rho(G)e^{iGr} dG
+		// double kplusg = wfc_basis->getgpluskcar(ik,ig)[ipol] * tpiba;
+        Real kplusg = (gcar[(ik * npwx + ig) * 3 + pol] +
+                       kvec_c[ik * 3 + pol]) * tpiba;
+                       
+		// calculate the charge density gradient in reciprocal space.
+		porter[ig] = T(0.0, kplusg) * rhog[ig];
+	}
+}
+
+template <typename T, typename Device>
+void xc_functional_grad_wfc_op<T, Device>::operator()(
+    const int& ipol,
+    const int& nrxx,
+    const T * porter,
+    T* grad)
+{
+#ifdef _OPENMP
+#pragma omp parallel for schedule(static, 1024)
+#endif
+    for (int ir = 0; ir < nrxx; ++ir) {
+        grad[ipol * nrxx + ir] = porter[ir];
+	}
+}
+
+template struct xc_functional_grad_wfc_op<std::complex<float> , psi::DEVICE_CPU>;
+template struct xc_functional_grad_wfc_op<std::complex<double>, psi::DEVICE_CPU>;
+
+} // namespace hamilt   
\ No newline at end of file
diff --git a/source/module_hamilt_general/module_xc/kernels/xc_functional_op.h b/source/module_hamilt_general/module_xc/kernels/xc_functional_op.h
new file mode 100644
index 0000000000..1c83947182
--- /dev/null
+++ b/source/module_hamilt_general/module_xc/kernels/xc_functional_op.h
@@ -0,0 +1,32 @@
+#ifndef MODULE_HAMILT_GENERAL_MODULE_XC_KERNELS_H_
+#define MODULE_HAMILT_GENERAL_MODULE_XC_KERNELS_H_
+
+#include <complex>
+#include <module_psi/psi.h>
+#include <module_base/macros.h>
+
+namespace hamilt {
+
+template <typename T, typename Device>
+struct xc_functional_grad_wfc_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(
+        const int& ik,
+        const int& pol,
+        const int& npw,
+        const int& npwx,
+        const Real& tpiba,
+        const Real * gcar,
+        const Real * kvec_c,
+        const T * rhog,
+        T* porter);
+    
+    void operator()(
+        const int& ipol,
+        const int& nrxx,
+        const T * porter,
+        T* grad);
+};
+
+} // namespace hamilt
+#endif // MODULE_HAMILT_GENERAL_MODULE_XC_KERNELS_H_
\ No newline at end of file
diff --git a/source/module_hamilt_general/module_xc/test/CMakeLists.txt b/source/module_hamilt_general/module_xc/test/CMakeLists.txt
index eca867f8be..ee263a6d5c 100644
--- a/source/module_hamilt_general/module_xc/test/CMakeLists.txt
+++ b/source/module_hamilt_general/module_xc/test/CMakeLists.txt
@@ -254,24 +254,30 @@ AddTest(
 
 AddTest(
   TARGET XCTest_GRADCORR
-  LIBS MPI::MPI_CXX Libxc::xc ${math_libs}
+  LIBS MPI::MPI_CXX Libxc::xc ${math_libs} psi device container
   SOURCES test_xc3.cpp ../xc_functional_gradcorr.cpp ../xc_functional.cpp
     ../xc_functional_wrapper_xc.cpp ../xc_functional_wrapper_gcxc.cpp
     ../xc_functional_wrapper_tauxc.cpp
     ../xc_funct_corr_gga.cpp ../xc_funct_corr_lda.cpp ../xc_funct_exch_gga.cpp
     ../xc_funct_exch_lda.cpp ../xc_funct_hcth.cpp
     ../../../module_base/matrix.cpp
+    ../../../module_base/memory.cpp
+    ../../../module_base/libm/branred.cpp
+    ../../../module_base/libm/sincos.cpp
 )
 
 AddTest(
   TARGET XCTest_GRADWFC
-  LIBS MPI::MPI_CXX Libxc::xc ${math_libs}
+  LIBS MPI::MPI_CXX Libxc::xc ${math_libs} psi device container
   SOURCES test_xc3.cpp ../xc_functional_gradcorr.cpp ../xc_functional.cpp
     ../xc_functional_wrapper_xc.cpp ../xc_functional_wrapper_gcxc.cpp
     ../xc_functional_wrapper_tauxc.cpp
     ../xc_funct_corr_gga.cpp ../xc_funct_corr_lda.cpp ../xc_funct_exch_gga.cpp
     ../xc_funct_exch_lda.cpp ../xc_funct_hcth.cpp
     ../../../module_base/matrix.cpp
+    ../../../module_base/memory.cpp
+    ../../../module_base/libm/branred.cpp
+    ../../../module_base/libm/sincos.cpp
 )
 
 AddTest(
@@ -285,19 +291,23 @@ AddTest(
 
 AddTest(
   TARGET XCTest_VXC
-  LIBS MPI::MPI_CXX Libxc::xc ${math_libs} formatter
+  LIBS MPI::MPI_CXX Libxc::xc ${math_libs} psi device container formatter
   SOURCES test_xc5.cpp ../xc_functional_gradcorr.cpp ../xc_functional.cpp
     ../xc_functional_wrapper_xc.cpp ../xc_functional_wrapper_gcxc.cpp
     ../xc_functional_wrapper_tauxc.cpp
     ../xc_funct_corr_gga.cpp ../xc_funct_corr_lda.cpp ../xc_funct_exch_gga.cpp
     ../xc_funct_exch_lda.cpp ../xc_funct_hcth.cpp
-    ../../../module_base/matrix.cpp ../../../module_base/timer.cpp
     ../xc_functional_vxc.cpp
+    ../../../module_base/matrix.cpp
+    ../../../module_base/memory.cpp
+    ../../../module_base/timer.cpp
+    ../../../module_base/libm/branred.cpp
+    ../../../module_base/libm/sincos.cpp
 )
 
 AddTest(
   TARGET XCTest_VXC_Libxc
-  LIBS MPI::MPI_CXX Libxc::xc ${math_libs} formatter
+  LIBS MPI::MPI_CXX Libxc::xc ${math_libs} psi device container formatter
   SOURCES test_xc5.cpp ../xc_functional_gradcorr.cpp ../xc_functional.cpp
     ../xc_functional_wrapper_xc.cpp ../xc_functional_wrapper_gcxc.cpp
     ../xc_functional_wrapper_tauxc.cpp
@@ -305,11 +315,16 @@ AddTest(
     ../xc_funct_exch_lda.cpp ../xc_funct_hcth.cpp
     ../../../module_base/matrix.cpp ../../../module_base/timer.cpp
     ../xc_functional_vxc.cpp
+    ../../../module_base/matrix.cpp
+    ../../../module_base/memory.cpp
+    ../../../module_base/timer.cpp
+    ../../../module_base/libm/branred.cpp
+    ../../../module_base/libm/sincos.cpp
 )
 
 AddTest(
   TARGET XCTest_VXC_meta
-  LIBS MPI::MPI_CXX Libxc::xc ${math_libs} formatter
+  LIBS MPI::MPI_CXX Libxc::xc ${math_libs} psi device container formatter
   SOURCES test_xc5.cpp ../xc_functional_gradcorr.cpp ../xc_functional.cpp
     ../xc_functional_wrapper_xc.cpp ../xc_functional_wrapper_gcxc.cpp
     ../xc_functional_wrapper_tauxc.cpp
@@ -317,4 +332,10 @@ AddTest(
     ../xc_funct_exch_lda.cpp ../xc_funct_hcth.cpp
     ../../../module_base/matrix.cpp ../../../module_base/timer.cpp
     ../xc_functional_vxc.cpp
-)
\ No newline at end of file
+    ../../../module_base/matrix.cpp
+    ../../../module_base/memory.cpp
+    ../../../module_base/timer.cpp
+    ../../../module_base/libm/branred.cpp
+    ../../../module_base/libm/sincos.cpp
+)
+
diff --git a/source/module_hamilt_general/module_xc/test/test_xc3.cpp b/source/module_hamilt_general/module_xc/test/test_xc3.cpp
index 7b49fea0bc..601afc8863 100644
--- a/source/module_hamilt_general/module_xc/test/test_xc3.cpp
+++ b/source/module_hamilt_general/module_xc/test/test_xc3.cpp
@@ -147,9 +147,13 @@ class XCTest_GRADWFC : public testing::Test
     protected:
 
         std::complex<double> * grad = nullptr;
+        ModuleBase::Vector3<double> * gcar_wrapper = nullptr;
+        ModuleBase::Vector3<double> * kvec_c_wrapper = nullptr;
         ~XCTest_GRADWFC()
         {
-            delete[] grad;
+            delete [] grad;
+            delete [] gcar_wrapper;
+            delete [] kvec_c_wrapper;
         }
 
         void SetUp()
@@ -157,8 +161,17 @@ class XCTest_GRADWFC : public testing::Test
             ModulePW::PW_Basis_K rhopw;
             rhopw.npwk = new int[1];
             rhopw.npwk[0] = 5;
+            rhopw.npwk_max = 5;
             rhopw.nmaxgr = 5;
             rhopw.nrxx = 5;
+            rhopw.nks = 1;
+            gcar_wrapper = new ModuleBase::Vector3<double>[rhopw.npwk_max];
+            for (int ii = 0; ii < rhopw.npwk_max; ii++)
+                gcar_wrapper[ii] = ModuleBase::Vector3<double>(0,0,0);
+            kvec_c_wrapper = new ModuleBase::Vector3<double>(1,2,3);
+
+            rhopw.gcar = gcar_wrapper;
+            rhopw.kvec_c = kvec_c_wrapper;
 
             std::complex<double> rhog[5];
             for (int i=0;i<5;i++)
@@ -169,7 +182,7 @@ class XCTest_GRADWFC : public testing::Test
 
             grad = new std::complex<double>[15];
 
-            XC_Functional::grad_wfc(rhog, 0, grad, &rhopw, tpiba);
+            XC_Functional::grad_wfc<std::complex<double>, psi::DEVICE_CPU>(0, tpiba, &rhopw, rhog, grad);
         }
 };
 
diff --git a/source/module_hamilt_general/module_xc/test/test_xc5.cpp b/source/module_hamilt_general/module_xc/test/test_xc5.cpp
index ec9ac075ed..bd2f87da8a 100644
--- a/source/module_hamilt_general/module_xc/test/test_xc5.cpp
+++ b/source/module_hamilt_general/module_xc/test/test_xc5.cpp
@@ -15,16 +15,6 @@
 // v_xc_libxc, called by v_xc, when we use functionals from LIBXC
 // v_xc_meta, unified interface of mGGA functionals
 
-template<>
-void Parallel_Reduce::reduce_pool<double>(double& object)
-{
-#ifdef __MPI
-	double swap = object;
-	MPI_Allreduce(&swap , &object , 1, MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD);
-#endif
-    return;
-}
-
 class XCTest_VXC : public testing::Test
 {
     protected:
diff --git a/source/module_hamilt_general/module_xc/test/xc3_mock.h b/source/module_hamilt_general/module_xc/test/xc3_mock.h
index 8ff382b95a..48f5ce2bc7 100644
--- a/source/module_hamilt_general/module_xc/test/xc3_mock.h
+++ b/source/module_hamilt_general/module_xc/test/xc3_mock.h
@@ -1,6 +1,6 @@
 // I'm mocking FFT here because it is not possible to write
 // unit tests with FFT
-
+#include <fstream>
 namespace ModulePW
 {
     PW_Basis::PW_Basis(){};
@@ -63,17 +63,73 @@ namespace ModulePW
             out[i] = -ModuleBase::IMAG_UNIT * in[i];
         }
     }
-        template void PW_Basis_K::recip2real(const std::complex<double>* in,
-                                             std::complex<double>* out,
-                                             const int ik,
-                                             const bool add,
-                                             const double factor) const;
+    template void PW_Basis_K::recip2real(const std::complex<double>* in,
+                                         std::complex<double>* out,
+                                         const int ik,
+                                         const bool add,
+                                         const double factor) const;
 
-        ModuleBase::Vector3<double> PW_Basis_K::getgpluskcar(int, int) const
-        {
+    ModuleBase::Vector3<double> PW_Basis_K::getgpluskcar(int, int) const
+    {
         ModuleBase::Vector3<double> x = {1,2,3};
         return x;
-    };
+    }
+
+    
+    template <typename FPTYPE, typename Device>
+    void PW_Basis_K::real_to_recip(const Device* ctx,
+                       const std::complex<FPTYPE>* in,
+                       std::complex<FPTYPE>* out,
+                       const int ik,
+                       const bool add,
+                       const FPTYPE factor) const // in:(nplane,nx*ny)  ; out(nz, ns)
+    {
+        for (int i=0;i<nrxx;i++)
+        {
+            out[i] = in[i];
+        }
+    }
+    template <typename FPTYPE, typename Device>
+    void PW_Basis_K::recip_to_real(const Device* ctx,
+                       const std::complex<FPTYPE>* in,
+                       std::complex<FPTYPE>* out,
+                       const int ik,
+                       const bool add,
+                       const FPTYPE factor) const
+    {
+        for (int i = 0; i < nrxx; i++)
+        {
+            out[i] = -ModuleBase::IMAG_UNIT * in[i];
+        }
+    }
+
+    template void PW_Basis_K::real_to_recip<double, psi::DEVICE_CPU>(const psi::DEVICE_CPU* ctx,
+                                                                     const std::complex<double>* in,
+                                                                     std::complex<double>* out,
+                                                                     const int ik,
+                                                                     const bool add,
+                                                                     const double factor) const;
+    template void PW_Basis_K::recip_to_real<double, psi::DEVICE_CPU>(const psi::DEVICE_CPU* ctx,
+                                                                     const std::complex<double>* in,
+                                                                     std::complex<double>* out,
+                                                                     const int ik,
+                                                                     const bool add,
+                                                                     const double factor) const;
+#if __CUDA || __ROCM                                                                     
+    template void PW_Basis_K::real_to_recip<double, psi::DEVICE_GPU>(const psi::DEVICE_GPU* ctx,
+                                                                     const std::complex<double>* in,
+                                                                     std::complex<double>* out,
+                                                                     const int ik,
+                                                                     const bool add,
+                                                                     const double factor) const;
+
+    template void PW_Basis_K::recip_to_real<double, psi::DEVICE_GPU>(const psi::DEVICE_GPU* ctx,
+                                                                     const std::complex<double>* in,
+                                                                     std::complex<double>* out,
+                                                                     const int ik,
+                                                                     const bool add,
+                                                                     const double factor) const;
+#endif 
 
     FFT::FFT(){};
     FFT::~FFT(){};
@@ -89,6 +145,7 @@ namespace ModulePW
 namespace ModuleBase
 {
     void WARNING_QUIT(const std::string &file,const std::string &description) {return ;}
+    void WARNING(const std::string &file,const std::string &description) {};
     
     void Matrix3::Identity(){};
 
@@ -97,6 +154,7 @@ namespace ModuleBase
 
     void TITLE(const std::string &class_function_name,bool disable){};
     void TITLE(const std::string &class_name,const std::string &function_name,bool disable){};
+
 }
 
 namespace GlobalV
@@ -105,9 +163,12 @@ namespace GlobalV
     bool CAL_STRESS = 0;
     int CAL_FORCE = 0;
     int NSPIN;
+    int NPOL;
     double XC_TEMPERATURE;
     bool DOMAG;
     bool DOMAG_Z;
+    std::ofstream ofs_device;
+    std::ofstream ofs_running;
 }
 
 namespace GlobalC
@@ -139,3 +200,31 @@ void UnitCell::cal_ux()
 InfoNonlocal::InfoNonlocal(){};
 InfoNonlocal::~InfoNonlocal(){};
 #endif
+
+namespace Parallel_Reduce
+{
+    /// reduce in all process
+    template<typename T>
+    void reduce_all(T& object){};
+    template<typename T>
+    void reduce_all(T* object, const int n){};
+    template<typename T>
+    void reduce_pool(T& object){};
+    template<typename T>
+    void reduce_pool(T* object, const int n){};
+
+    template<>
+    void Parallel_Reduce::reduce_pool<double>(double& object)
+    {
+    #ifdef __MPI
+    	double swap = object;
+    	MPI_Allreduce(&swap , &object , 1, MPI_DOUBLE , MPI_SUM , MPI_COMM_WORLD);
+    #endif
+        return;
+    }
+    template void reduce_all<double>(double& object);
+    template void reduce_all<double>(double* object, const int n);
+    template void reduce_pool<float>(float& object);
+    template void reduce_pool<float>(float* object, const int n);
+    template void reduce_pool<double>(double* object, const int n);
+}
\ No newline at end of file
diff --git a/source/module_hamilt_general/module_xc/xc_functional.h b/source/module_hamilt_general/module_xc/xc_functional.h
index b32352ccaa..71ce3785cb 100644
--- a/source/module_hamilt_general/module_xc/xc_functional.h
+++ b/source/module_hamilt_general/module_xc/xc_functional.h
@@ -10,7 +10,7 @@
 #else
 #include "xc_funcs.h"
 #endif	// ifdef USE_LIBXC
-
+#include "module_base/macros.h"
 #include "module_base/global_function.h"
 #include "module_base/global_variable.h"
 #include "module_base/vector3.h"
@@ -19,6 +19,7 @@
 #include "module_basis/module_pw/pw_basis_k.h"
 #include "module_elecstate/module_charge/charge.h"
 #include "module_cell/unitcell.h"
+
 class XC_Functional
 {
 	public:
@@ -209,11 +210,14 @@ class XC_Functional
                          const UnitCell* ucell,
                          std::vector<double>& stress_gga,
                          const bool is_stress = 0);
-    static void grad_wfc(const std::complex<double>* rhog,
-                         const int ik,
-                         std::complex<double>* grad,
-                         const ModulePW::PW_Basis_K* wfc_basis,
-                         const double tpiba);
+	template <typename T, typename Device,
+          typename Real = typename GetTypeReal<T>::type>
+	static void grad_wfc(
+	    const int ik,
+	    const Real tpiba,
+	    const ModulePW::PW_Basis_K* wfc_basis,
+		const T* rhog,
+	    T* grad);
     static void grad_rho(const std::complex<double>* rhog,
                          ModuleBase::Vector3<double>* gdr,
                          const ModulePW::PW_Basis* rho_basis,
diff --git a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
index 8fba1870c6..75acbbcfe4 100644
--- a/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
+++ b/source/module_hamilt_general/module_xc/xc_functional_gradcorr.cpp
@@ -9,8 +9,14 @@
 //  and gives the spin up and spin down components of the charge.
 
 #include "xc_functional.h"
+#include "module_base/timer.h"
 #include "module_basis/module_pw/pw_basis_k.h"
 
+#include <ATen/core/tensor.h>
+#include <ATen/core/tensor_map.h>
+#include <ATen/core/tensor_types.h>
+#include <module_hamilt_general/module_xc/kernels/xc_functional_op.h>
+
 // from gradcorr.f90
 void XC_Functional::gradcorr(double &etxc, double &vtxc, ModuleBase::matrix &v,
 	const Charge* const chr, ModulePW::PW_Basis* rhopw, const UnitCell *ucell,
@@ -584,41 +590,46 @@ void XC_Functional::gradcorr(double &etxc, double &vtxc, ModuleBase::matrix &v,
 	return;
 }
 
-void XC_Functional::grad_wfc(const std::complex<double>* rhog,
-                             const int ik,
-                             std::complex<double>* grad,
-                             const ModulePW::PW_Basis_K* wfc_basis,
-                             const double tpiba)
+template <typename T, typename Device, typename Real>
+void XC_Functional::grad_wfc(
+    const int ik,
+    const Real tpiba,
+    const ModulePW::PW_Basis_K* wfc_basis,
+	const T* rhog,
+    T* grad)
 {
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
 	const int npw_k = wfc_basis->npwk[ik];
-	std::complex<double> *Porter = new std::complex<double> [wfc_basis->nmaxgr];
+	
+	auto porter = std::move(ct::Tensor(
+        ct::DataTypeToEnum<T>::value, ct::DeviceTypeToEnum<ct_Device>::value, {wfc_basis->nmaxgr}));
+	auto gcar = ct::TensorMap(
+		&wfc_basis->gcar[0][0], ct::DataType::DT_DOUBLE, ct::DeviceType::CpuDevice, {wfc_basis->nks * wfc_basis->npwk_max, 3}).to_device<ct_Device>();
+	auto kvec_c = ct::TensorMap(
+		&wfc_basis->kvec_c[0][0],ct::DataType::DT_DOUBLE, ct::DeviceType::CpuDevice, {wfc_basis->nks, 3}).to_device<ct_Device>();
+	
+	auto xc_functional_grad_wfc_solver 
+		= hamilt::xc_functional_grad_wfc_op<T, Device>();
 
-	for(int ipol=0; ipol<3; ipol++)
-	{
-#ifdef _OPENMP
-#pragma omp parallel for schedule(static, 1024)
-#endif
-		for(int ig=0; ig<npw_k; ++ig)
-		{
-			// the formula is : rho(r)^prime = \int iG * rho(G)e^{iGr} dG
-			double kplusg = wfc_basis->getgpluskcar(ik,ig)[ipol] * tpiba;
-			// calculate the charge density gradient in reciprocal space.
-			Porter[ig] = std::complex<double>(0.0,kplusg) * rhog[ig];
-		}
+	for(int ipol=0; ipol<3; ipol++) {
+		xc_functional_grad_wfc_solver(
+            ik, ipol, npw_k, wfc_basis->npwk_max, // Integers
+			tpiba,	// Double
+            gcar.template data<Real>(),   // Array of Real
+            kvec_c.template data<Real>(), // Array of double
+			rhog, porter.data<T>());    // Array of std::complex<double>
 
 		// bring the gdr from G --> R
-		wfc_basis->recip2real(Porter, Porter, ik);
+		Device * ctx = nullptr;
+		wfc_basis->recip_to_real(ctx, porter.data<T>(), porter.data<T>(), ik);
 
-#ifdef _OPENMP
-#pragma omp parallel for schedule(static, 1024)
-#endif
-        for (int ir = 0; ir < wfc_basis->nrxx; ++ir)
-            grad[ipol * wfc_basis->nrxx + ir] = Porter[ir];
-    }//end loop ipol
-	delete[] Porter;
-	return;
+		xc_functional_grad_wfc_solver(
+            ipol, wfc_basis->nrxx,	// Integers
+			porter.data<T>(), grad);	// Array of std::complex<double>
+    }
 }
 
+
 void XC_Functional::grad_rho(const std::complex<double>* rhog,
                              ModuleBase::Vector3<double>* gdr,
                              const ModulePW::PW_Basis* rho_basis,
@@ -736,3 +747,7 @@ void XC_Functional::noncolin_rho(double *rhoout1, double *rhoout2, double *neg,
 	return;
 }
 
+template void XC_Functional::grad_wfc<std::complex<double>, psi::DEVICE_CPU, double>(const int ik, const double tpiba, const ModulePW::PW_Basis_K* wfc_basis, const std::complex<double>* rhog, std::complex<double>* grad);
+#if __CUDA || __ROCM
+template void XC_Functional::grad_wfc<std::complex<double>, psi::DEVICE_GPU, double>(const int ik, const double tpiba, const ModulePW::PW_Basis_K* wfc_basis, const std::complex<double>* rhog, std::complex<double>* grad);
+#endif // __CUDA || __ROCM
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index 43fbabe0b2..505e2ce6ee 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -3,6 +3,7 @@
 
 #include <complex>
 #include <thrust/complex.h>
+#include <module_base/macros.h>
 
 #include <cuda_runtime.h>
 
@@ -21,6 +22,28 @@ void warp_reduce(FPTYPE & val) {
     }
 }
 
+template <typename T>
+__global__ void cal_stress_mgga(
+    const int spin,
+    const int nrxx,
+    const T w1,
+    const thrust::complex<T> * gradwfc,
+    T * crosstaus)
+{
+    int idx = threadIdx.x + blockIdx.x * blockDim.x;
+    if (idx >= nrxx) { return; }
+    int ipol = 0;
+    for (int ix = 0; ix < 3; ix++) {
+        for (int iy = 0; iy < ix + 1; iy++) {
+            crosstaus[spin * nrxx * 6 + ipol * nrxx + idx] 
+                += 2.0 * w1 
+                * (gradwfc[ix * nrxx + idx].real() * gradwfc[iy*nrxx + idx].real()
+                +  gradwfc[ix * nrxx + idx].imag() * gradwfc[iy*nrxx + idx].imag());
+            ipol += 1;
+        }
+    }
+}
+
 template <typename FPTYPE>
 __global__ void cal_dbecp_noevc_nl(
         const int ipol,
@@ -213,10 +236,27 @@ void cal_stress_nl_op<FPTYPE, psi::DEVICE_GPU>::operator() (
              stress);// array of data
 }
 
-template struct cal_dbecp_noevc_nl_op<float, psi::DEVICE_GPU>;
-template struct cal_stress_nl_op<float, psi::DEVICE_GPU>;
+template <typename T, typename Device>
+void cal_stress_mgga_op<T, Device>::operator()(
+    const int& spin,
+    const int& nrxx,
+    const Real& w1,
+    const T * gradwfc,
+    Real * crosstaus)
+{
+    auto gradwfc_ = reinterpret_cast<const thrust::complex<Real>*>(gradwfc);
+    const int block = (nrxx + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+    cal_stress_mgga<Real><<<block, THREADS_PER_BLOCK>>>(
+        spin, nrxx, w1, gradwfc_, crosstaus);
+}
+
+template struct cal_stress_mgga_op<std::complex<float>,  psi::DEVICE_GPU>;
+template struct cal_stress_mgga_op<std::complex<double>, psi::DEVICE_GPU>;
 
+template struct cal_dbecp_noevc_nl_op<float, psi::DEVICE_GPU>;
 template struct cal_dbecp_noevc_nl_op<double, psi::DEVICE_GPU>;
+
+template struct cal_stress_nl_op<float, psi::DEVICE_GPU>;
 template struct cal_stress_nl_op<double, psi::DEVICE_GPU>;
 
 }  // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index 872db30dba..dfa6836df2 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -1,6 +1,8 @@
 #include "module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 
+#include <iomanip>
+
 namespace hamilt{
 
 template <typename FPTYPE>
@@ -132,6 +134,31 @@ struct cal_stress_nl_op<FPTYPE, psi::DEVICE_CPU> {
     }
 };
 
+template <typename T, typename Device>
+void cal_stress_mgga_op<T, Device>::operator()(
+    const int& spin,
+    const int& nrxx,
+    const Real& w1,
+    const T * gradwfc,
+    Real * crosstaus)
+{
+    for (int ir = 0; ir < nrxx; ir++) {
+        int ipol = 0;
+        for (int ix = 0; ix < 3; ix++) {
+            for (int iy = 0; iy < ix + 1; iy++) {
+                crosstaus[spin * nrxx * 6 + ipol * nrxx + ir] 
+                    += 2.0 * w1 
+                    * (gradwfc[ix*nrxx + ir].real() * gradwfc[iy*nrxx + ir].real()
+                    +  gradwfc[ix*nrxx + ir].imag() * gradwfc[iy*nrxx + ir].imag());
+                ipol += 1;
+            }
+        }
+    }
+}
+
+template struct cal_stress_mgga_op<std::complex<float>,  psi::DEVICE_CPU>;
+template struct cal_stress_mgga_op<std::complex<double>, psi::DEVICE_CPU>;
+
 template struct cal_dbecp_noevc_nl_op<float, psi::DEVICE_CPU>;
 template struct cal_stress_nl_op<float, psi::DEVICE_CPU>;
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index 931653d79a..d9a9d0f04a 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -3,6 +3,8 @@
 
 #include "module_psi/psi.h"
 #include <complex>
+#include <module_base/macros.h>
+
 
 namespace hamilt {
 
@@ -101,6 +103,17 @@ namespace hamilt {
                         FPTYPE* stress);
     };
 
+template <typename T, typename Device>
+struct cal_stress_mgga_op {
+    using Real = typename GetTypeReal<T>::type;
+    void operator()(
+        const int& spin,
+        const int& nrxx,
+        const Real& w1,
+        const T * gradwfc,
+        Real * crosstaus);
+};
+
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
 template <typename FPTYPE>
 struct cal_dbecp_noevc_nl_op<FPTYPE, psi::DEVICE_GPU> {
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/CMakeLists.txt b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/CMakeLists.txt
index d502e5a43f..582d17b5bc 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/CMakeLists.txt
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/CMakeLists.txt
@@ -1,11 +1,10 @@
 remove_definitions(-D__LCAO)
 remove_definitions(-D__DEEPKS)
-remove_definitions(-D__CUDA)
-remove_definitions(-D__ROCM)
 
 AddTest(
     TARGET Hamilt_Kernels_UTs
-    LIBS ${math_libs} device base
+    LIBS ${math_libs} device base container
     SOURCES ekinetic_op_test.cpp nonlocal_op_test.cpp veff_op_test.cpp meta_op_test.cpp
     force_op_test.cpp stress_op_test.cpp wf_op_test.cpp vnl_op_test.cpp
+    stress_op_mgga_test.cpp
 )
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_mgga_test.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_mgga_test.cpp
new file mode 100644
index 0000000000..d8f8a9c4c9
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/test/stress_op_mgga_test.cpp
@@ -0,0 +1,45 @@
+#include <module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h>
+
+#include <base/utils/gtest.h>
+#include <ATen/core/tensor.h>
+
+namespace hamilt {
+
+template<typename T>
+class StressMggaTest : public testing::Test {
+public:
+    StressMggaTest() = default;
+
+    ~StressMggaTest() override = default;
+};
+
+TYPED_TEST_SUITE(StressMggaTest, base::utils::ComplexTypes);
+
+TYPED_TEST(StressMggaTest, cal_stress_mgga_op) {
+    using Type = typename std::tuple_element<0, decltype(TypeParam())>::type;
+    using Device = typename std::tuple_element<1, decltype(TypeParam())>::type;
+    using Real = typename GetTypeReal<Type>::type;
+    
+    const int nrxx = 4;
+    const int nspin = 0;
+    const Real w1 = 0.00031685874122631746;
+
+    ct::Tensor gradwfc = std::move(ct::Tensor(
+               {static_cast<Type>( 0.0000001909712), static_cast<Type>(-0.0000001687233), static_cast<Type>(-0.0000002392604),
+                static_cast<Type>( 0.0000000154653), static_cast<Type>( 0.0000004211339), static_cast<Type>( 0.0000007242030),
+                static_cast<Type>( 0.0000007559914), static_cast<Type>( 0.0000005232479), static_cast<Type>( 0.0000001903915),
+                static_cast<Type>(-0.0000000207916), static_cast<Type>( 0.0000000366767), static_cast<Type>( 0.0000003353377)}).to_device<Device>());
+
+    ct::Tensor crosstaus = std::move(ct::Tensor(
+    ct::DataTypeToEnum<Real>::value, ct::DeviceTypeToEnum<Device>::value, {nrxx * 6}));
+    crosstaus.zero();
+    ct::Tensor expected_crosstaus = crosstaus;
+    expected_crosstaus.zero();
+
+    auto cal_stress_mgga_solver = cal_stress_mgga_op<Type, typename ct::ContainerToPsi<Device>::type>();
+    cal_stress_mgga_solver(nspin, nrxx, w1, gradwfc.data<Type>(), crosstaus.data<Real>());
+
+    EXPECT_EQ(crosstaus, expected_crosstaus);
+}
+
+} // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
index 6e960727c8..5d51e529e3 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
@@ -129,7 +129,7 @@ class Stress_Func
                      const Charge* const chr,
                      K_Vectors* p_kv,
                      ModulePW::PW_Basis_K* wfc_basis,
-                     const psi::Psi<complex<FPTYPE>>* psi_in = nullptr); // gga part in PW basis
+                     const psi::Psi<complex<FPTYPE>, Device>* psi_in); // gga part in PW basis
 
     // 7) the stress from the non-local pseudopotentials
     void stress_nl(ModuleBase::matrix& sigma,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_mgga.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_mgga.cpp
index 549d0599a9..cac6d8d369 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func_mgga.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_mgga.cpp
@@ -3,6 +3,9 @@
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "stress_func.h"
 
+#include <ATen/core/tensor.h>
+#include <ATen/core/tensor_types.h>
+
 // calculate the Pulay term of mGGA stress correction in PW
 template <typename FPTYPE, typename Device>
 void Stress_Func<FPTYPE, Device>::stress_mgga(ModuleBase::matrix& sigma,
@@ -11,7 +14,7 @@ void Stress_Func<FPTYPE, Device>::stress_mgga(ModuleBase::matrix& sigma,
                                               const Charge* const chr,
                                               K_Vectors* p_kv,
                                               ModulePW::PW_Basis_K* wfc_basis,
-                                              const psi::Psi<complex<FPTYPE>>* psi_in)
+                                              const psi::Psi<complex<FPTYPE>, Device>* psi_in)
 {
     ModuleBase::timer::tick("Stress_Func", "stress_mgga");
 
@@ -22,69 +25,50 @@ void Stress_Func<FPTYPE, Device>::stress_mgga(ModuleBase::matrix& sigma,
     const int nrxx = wfc_basis->nrxx;
 
     std::complex<FPTYPE>* psi;
-    int ipol2xy[3][3];
+    int ipol2xy[3][3] = {{0, 1, 3}, {1, 2, 4}, {3, 4, 5}};
     FPTYPE sigma_mgga[3][3];
 
-    std::complex<FPTYPE>* gradwfc = new std::complex<FPTYPE>[nrxx * 3];
-    ModuleBase::GlobalFunc::ZEROS(gradwfc, nrxx * 3);
+    using ct_Device = typename ct::PsiToContainer<Device>::type;
 
-    FPTYPE** crosstaus = new FPTYPE*[GlobalV::NSPIN];
-    for (int is = 0; is < GlobalV::NSPIN; ++is)
-    {
-        crosstaus[is] = new FPTYPE[nrxx * 6];
-        ModuleBase::GlobalFunc::ZEROS(crosstaus[is], 6*nrxx);
-    }
+    auto gradwfc = ct::Tensor(
+        ct::DataTypeToEnum<std::complex<FPTYPE>>::value,
+        ct::DeviceTypeToEnum<ct_Device>::value,
+        {nrxx * 3});
+    auto crosstaus = ct::Tensor(
+        ct::DataTypeToEnum<FPTYPE>::value,
+        ct::DeviceTypeToEnum<ct_Device>::value,
+        {GlobalV::NSPIN, nrxx * 6});
+    crosstaus.zero(); // Must be zeroed out 
 
+    auto cal_stress_mgga_solver = hamilt::cal_stress_mgga_op<std::complex<FPTYPE>, Device>();
     for (int ik = 0; ik < p_kv->nks; ik++)
     {
         if (GlobalV::NSPIN == 2)
             current_spin = p_kv->isk[ik];
         const int npw = p_kv->ngk[ik];
-        psi = new complex<FPTYPE>[npw];
 
         for (int ibnd = 0; ibnd < GlobalV::NBANDS; ibnd++)
         {
             const FPTYPE w1 = wg(ik, ibnd) / GlobalC::ucell.omega;
-            const std::complex<FPTYPE>* ppsi = nullptr;
-            ppsi = &(psi_in[0](ik, ibnd, 0));
-            for (int ig = 0; ig < npw; ig++)
-            {
-                psi[ig] = ppsi[ig];
-            }
-            XC_Functional::grad_wfc(psi, ik, gradwfc, wfc_basis, GlobalC::ucell.tpiba);
-
-            int ipol = 0;
-            for (int ix = 0; ix < 3; ix++)
-            {
-                for (int iy = 0; iy < ix + 1; iy++)
-                {
-                    ipol2xy[ix][iy] = ipol;
-                    ipol2xy[iy][ix] = ipol;
-                    for (int ir = 0; ir < nrxx; ir++)
-                    {
-                        crosstaus[current_spin][ipol*nrxx + ir] += 2.0 * w1
-                                                             * (gradwfc[ix*nrxx + ir].real() * gradwfc[iy*nrxx + ir].real()
-                                                                + gradwfc[ix*nrxx + ir].imag() * gradwfc[iy*nrxx + ir].imag());
-                    }
-                    ipol += 1;
-                }
-            }
+            const std::complex<FPTYPE>* psi = &psi_in[0](ik, ibnd, 0);
+            XC_Functional::grad_wfc<std::complex<FPTYPE>, Device>(ik, GlobalC::ucell.tpiba, wfc_basis, psi, gradwfc.data<std::complex<FPTYPE>>());
+            cal_stress_mgga_solver(
+                current_spin, nrxx, w1, gradwfc.data<std::complex<FPTYPE>>(), crosstaus.data<FPTYPE>());
         } // band loop
-        delete[] psi;
+        // delete[] psi;
     } // k loop
-
+    auto crosstaus_host = crosstaus.to_device<ct::DEVICE_CPU>();
+    auto crosstaus_pack = crosstaus_host.accessor<FPTYPE, 2>();
 #ifdef __MPI
     for (int is = 0; is < GlobalV::NSPIN; ++is)
     {
         for (int ipol = 0; ipol < 6; ++ipol)
         {
-            chr->reduce_diff_pools(crosstaus[is] + ipol * nrxx);
+            chr->reduce_diff_pools(&crosstaus_pack[is][ipol * nrxx]);
         }
     }
 #endif
 
-    delete[] gradwfc;
-
     for(int ix = 0; ix < 3; ix++)
     {
         for(int iy = 0; iy < 3; iy++)
@@ -105,19 +89,13 @@ void Stress_Func<FPTYPE, Device>::stress_mgga(ModuleBase::matrix& sigma,
                 for (int ir = 0; ir < nrxx; ir++)
                 {
                     FPTYPE x
-                        = v_ofk(is, ir) * (chr->kin_r[is][ir] * delta + crosstaus[is][ipol2xy[ix][iy] * nrxx + ir]);
+                        = v_ofk(is, ir) * (chr->kin_r[is][ir] * delta + crosstaus_pack[is][ipol2xy[ix][iy] * nrxx + ir]);
                     sigma_mgga[ix][iy] += x;
                 }
             }
         }
     }
 
-    for (int is = 0; is < GlobalV::NSPIN; is++)
-    {
-        delete[] crosstaus[is];
-    }
-    delete[] crosstaus;
-
 #ifdef __MPI
     for (int l = 0; l < 3; l++)
     {
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
index 5386d11fcd..8e537053fb 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
@@ -85,7 +85,7 @@ void Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
                           pelec->charge,
                           p_kv,
                           wfc_basis,
-                          psi_in);
+                          d_psi_in);
     }
 
     // local contribution

From 8fd89806e7426b3b7147568a8de56f5aeb202467 Mon Sep 17 00:00:00 2001
From: wqzhou <33364058+WHUweiqingzhou@users.noreply.github.com>
Date: Fri, 12 Jan 2024 17:01:10 +0800
Subject: [PATCH 2/3] add a UnitTest for AutosetMag in read_atom_position()
 (#3471)

---
 source/module_cell/test/unitcell_test.cpp | 54 +++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/source/module_cell/test/unitcell_test.cpp b/source/module_cell/test/unitcell_test.cpp
index a4a944eb2d..1bc4da54a1 100644
--- a/source/module_cell/test/unitcell_test.cpp
+++ b/source/module_cell/test/unitcell_test.cpp
@@ -1385,6 +1385,60 @@ TEST_F(UcellTest,ReadAtomPositionsCAU)
 	remove("read_atom_positions.warn");
 }
 
+TEST_F(UcellTest,ReadAtomPositionsAutosetMag)
+{
+	std::string fn = "./support/STRU_MgO";
+	std::ifstream ifa(fn.c_str());
+	std::ofstream ofs_running;
+	std::ofstream ofs_warning;
+	ofs_running.open("read_atom_positions.tmp");
+	ofs_warning.open("read_atom_positions.warn");
+	//mandatory preliminaries
+	ucell->ntype = 2;
+	ucell->atoms = new Atom[ucell->ntype];
+	ucell->set_atom_flag = true;
+	GlobalV::test_pseudo_cell = 2;
+	GlobalV::BASIS_TYPE = "lcao";
+	GlobalV::deepks_setorb = true;
+	GlobalV::NSPIN = 2;
+	EXPECT_NO_THROW(ucell->read_atom_species(ifa,ofs_running));
+	EXPECT_DOUBLE_EQ(ucell->latvec.e11,4.27957);
+	EXPECT_DOUBLE_EQ(ucell->latvec.e22,4.27957);
+	EXPECT_DOUBLE_EQ(ucell->latvec.e33,4.27957);
+	//mandatory preliminaries
+	delete[] ucell->magnet.start_magnetization;
+	ucell->magnet.start_magnetization = new double[ucell->ntype];
+	ucell->read_atom_positions(ifa,ofs_running,ofs_warning);
+	for (int it = 0;it < ucell->ntype; it++)
+	{
+		for (int ia = 0;ia < ucell->atoms[it].na; ia++)
+		{
+			EXPECT_DOUBLE_EQ(ucell->atoms[it].mag[ia],1.0);
+			EXPECT_DOUBLE_EQ(ucell->atoms[it].m_loc_[ia].x,1.0);
+		}
+	}
+	//for nspin == 4
+	GlobalV::NSPIN = 4;
+	delete[] ucell->magnet.start_magnetization;
+	ucell->magnet.start_magnetization = new double[ucell->ntype];
+	ucell->read_atom_positions(ifa,ofs_running,ofs_warning);
+	for (int it = 0;it < ucell->ntype; it++)
+	{
+		for (int ia = 0;ia < ucell->atoms[it].na; ia++)
+		{
+			EXPECT_DOUBLE_EQ(ucell->atoms[it].mag[ia],sqrt(pow(1.0,2)+pow(1.0,2)+pow(1.0,2)));
+			EXPECT_DOUBLE_EQ(ucell->atoms[it].m_loc_[ia].x,1.0);
+			EXPECT_DOUBLE_EQ(ucell->atoms[it].m_loc_[ia].y,1.0);
+			EXPECT_DOUBLE_EQ(ucell->atoms[it].m_loc_[ia].z,1.0);
+		}
+	}
+	ofs_running.close();
+	ofs_warning.close();
+	ifa.close();
+	remove("read_atom_positions.tmp");
+	remove("read_atom_positions.warn");
+}
+
 TEST_F(UcellTest,ReadAtomPositionsWarning1)
 {
 	std::string fn = "./support/STRU_MgO_WarningC1";

From 8a969c746e9c39048de7d2db760993713f90ecef Mon Sep 17 00:00:00 2001
From: Jie Li <76780849+jieli-matrix@users.noreply.github.com>
Date: Fri, 12 Jan 2024 17:01:47 +0800
Subject: [PATCH 3/3] add precision tests for zeros and derivatives (#3445)

---
 source/module_base/test/math_sphbes_test.cpp | 410 +++++++++++--------
 1 file changed, 248 insertions(+), 162 deletions(-)

diff --git a/source/module_base/test/math_sphbes_test.cpp b/source/module_base/test/math_sphbes_test.cpp
index b79e3f8da7..521d4dc2f4 100644
--- a/source/module_base/test/math_sphbes_test.cpp
+++ b/source/module_base/test/math_sphbes_test.cpp
@@ -1,27 +1,28 @@
-#include"../math_sphbes.h"
-#include<iostream>
-#include<fstream>
+#include "../math_sphbes.h"
+
+#include <fstream>
+#include <iostream>
 
 #ifdef __MPI
-#include"mpi.h"
+#include "mpi.h"
 #endif
 
-#include"gtest/gtest.h"
+#include "gtest/gtest.h"
 
 #define doublethreshold 1e-7
 #define MAX(x, y) ((x) > (y) ? (x) : (y))
 
 /************************************************
-*  unit test of class Integral
-***********************************************/
+ *  unit test of class Integral
+ ***********************************************/
 
 /**
  * Note: this unit test try to ensure the invariance
  * of the spherical Bessel produced by class Sphbes,
- * and the reference results are produced by ABACUS 
+ * and the reference results are produced by ABACUS
  * at 2022-1-27.
- * 
- * Tested function: 
+ *
+ * Tested function:
  *      - Spherical_Bessel.
  *      - Spherical_Bessel_Roots
  *      - overloading of Spherical_Bessel. This funnction sets sjp[i] to 1.0 when i < msh.
@@ -32,173 +33,175 @@
 double mean(const double* vect, const int totN)
 {
     double meanv = 0.0;
-    for (int i=0; i< totN; ++i) {meanv += vect[i]/totN;}
+    for (int i = 0; i < totN; ++i)
+    {
+        meanv += vect[i] / totN;
+    }
     return meanv;
 }
 
 class Sphbes : public testing::Test
 {
-    protected:
-
-    int     msh =   700;
-    int     l0  =   0;
-    int     l1  =   1;
-    int     l2  =   2;
-    int     l3  =   3;
-    int     l4  =   4;
-    int     l5  =   5;
-    int     l6  =   6;
-    int     l7  =   7;
-    double  q   =   1.0;
-    double  *r  =   new double[msh];       
-    double  *jl =   new double[msh];
-    double  *djl =   new double[msh];
+  protected:
+    int msh = 700;
+    int l0 = 0;
+    int l1 = 1;
+    int l2 = 2;
+    int l3 = 3;
+    int l4 = 4;
+    int l5 = 5;
+    int l6 = 6;
+    int l7 = 7;
+    double q = 1.0;
+    double* r = new double[msh];
+    double* jl = new double[msh];
+    double* djl = new double[msh];
 
     void SetUp()
     {
-        for(int i=0; i<msh; ++i) {r[i] = 0.01*(i);}
+        for (int i = 0; i < msh; ++i)
+        {
+            r[i] = 0.01 * (i);
+        }
     }
 
     void TearDown()
     {
-        delete [] r;
-        delete [] jl;
-        delete [] djl;
-    }       
+        delete[] r;
+        delete[] jl;
+        delete[] djl;
+    }
 };
 
-TEST_F(Sphbes,Constructor)
+TEST_F(Sphbes, Constructor)
 {
     EXPECT_NO_THROW(ModuleBase::Sphbes sb);
 }
 
-TEST_F(Sphbes,SphericalBessel)
+TEST_F(Sphbes, SphericalBessel)
 {
-    //int l = 0;
-    ModuleBase::Sphbes::Spherical_Bessel(msh,r,q,l0,jl);
-    //reference result is from bessel_test.cpp which is calculated by
-    //ModuleBase::Sph_Bessel_Recursive::D1
-    EXPECT_NEAR(mean(jl,msh)/0.2084468748396,   1.0,doublethreshold); 
-
-
-    //int l = 1;
-    ModuleBase::Sphbes::Spherical_Bessel(msh,r,q,l1,jl);
-    //reference result is from bessel_test.cpp which is calculated by
-    //ModuleBase::Sph_Bessel_Recursive::D1
-    EXPECT_NEAR(mean(jl,msh)/0.12951635180384,   1.0,doublethreshold);
-
-
-    //int l = 2;
-    ModuleBase::Sphbes::Spherical_Bessel(msh,r,q,l2,jl);
-    //the result from bessel_test.cpp calculated by
-    //ModuleBase::Sph_Bessel_Recursive::D1 is 0.124201140093879
-    //reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
-    EXPECT_NEAR(mean(jl,msh)/0.12420114009271901456,   1.0,doublethreshold);
-
-    //int l = 3;
-    ModuleBase::Sphbes::Spherical_Bessel(msh,r,q,l3,jl);
-    //the result from bessel_test.cpp calculated by
-    //ModuleBase::Sph_Bessel_Recursive::D1 is 0.118268654505568
-    //reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
-    EXPECT_NEAR(mean(jl,msh)/0.11826865448477598408,   1.0,doublethreshold);
-
-
-    //int l = 4;
-    ModuleBase::Sphbes::Spherical_Bessel(msh,r,q,l4,jl);
-    //the result from bessel_test.cpp calculated by
-    //ModuleBase::Sph_Bessel_Recursive::D1 is 0.0933871035384385
-    //reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
-    EXPECT_NEAR(mean(jl,msh)/0.093387100084701621383,   1.0,doublethreshold);
-
-
-    //int l = 5;
-    ModuleBase::Sphbes::Spherical_Bessel(msh,r,q,l5,jl);
-    //the result from bessel_test.cpp calculated by
-    //ModuleBase::Sph_Bessel_Recursive::D1 is 0.0603800487910689
-    //reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
-    EXPECT_NEAR(mean(jl,msh)/0.060380048719821471925,   1.0,doublethreshold);
-
-
-    //int l = 6;
-    ModuleBase::Sphbes::Spherical_Bessel(msh,r,q,l6,jl);
-    //the result from bessel_test.cpp calculated by
-    //ModuleBase::Sph_Bessel_Recursive::D1 is 0.0327117051555907
-    //reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
-    EXPECT_NEAR(mean(jl,msh)/0.032711705053977857549,   1.0,doublethreshold);
-
-
-    //int l = 7;
-    ModuleBase::Sphbes::Spherical_Bessel(msh,r,q,l7,jl);
-    //the result from bessel_test.cpp calculated by
-    //ModuleBase::Sph_Bessel_Recursive::D1 is 0.0152155566653926
-    //reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
-    EXPECT_NEAR(mean(jl,msh)/0.015215556095798710851,   1.0,doublethreshold);
+    // int l = 0;
+    ModuleBase::Sphbes::Spherical_Bessel(msh, r, q, l0, jl);
+    // reference result is from bessel_test.cpp which is calculated by
+    // ModuleBase::Sph_Bessel_Recursive::D1
+    EXPECT_NEAR(mean(jl, msh) / 0.2084468748396, 1.0, doublethreshold);
+
+    // int l = 1;
+    ModuleBase::Sphbes::Spherical_Bessel(msh, r, q, l1, jl);
+    // reference result is from bessel_test.cpp which is calculated by
+    // ModuleBase::Sph_Bessel_Recursive::D1
+    EXPECT_NEAR(mean(jl, msh) / 0.12951635180384, 1.0, doublethreshold);
+
+    // int l = 2;
+    ModuleBase::Sphbes::Spherical_Bessel(msh, r, q, l2, jl);
+    // the result from bessel_test.cpp calculated by
+    // ModuleBase::Sph_Bessel_Recursive::D1 is 0.124201140093879
+    // reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
+    EXPECT_NEAR(mean(jl, msh) / 0.12420114009271901456, 1.0, doublethreshold);
+
+    // int l = 3;
+    ModuleBase::Sphbes::Spherical_Bessel(msh, r, q, l3, jl);
+    // the result from bessel_test.cpp calculated by
+    // ModuleBase::Sph_Bessel_Recursive::D1 is 0.118268654505568
+    // reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
+    EXPECT_NEAR(mean(jl, msh) / 0.11826865448477598408, 1.0, doublethreshold);
+
+    // int l = 4;
+    ModuleBase::Sphbes::Spherical_Bessel(msh, r, q, l4, jl);
+    // the result from bessel_test.cpp calculated by
+    // ModuleBase::Sph_Bessel_Recursive::D1 is 0.0933871035384385
+    // reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
+    EXPECT_NEAR(mean(jl, msh) / 0.093387100084701621383, 1.0, doublethreshold);
+
+    // int l = 5;
+    ModuleBase::Sphbes::Spherical_Bessel(msh, r, q, l5, jl);
+    // the result from bessel_test.cpp calculated by
+    // ModuleBase::Sph_Bessel_Recursive::D1 is 0.0603800487910689
+    // reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
+    EXPECT_NEAR(mean(jl, msh) / 0.060380048719821471925, 1.0, doublethreshold);
+
+    // int l = 6;
+    ModuleBase::Sphbes::Spherical_Bessel(msh, r, q, l6, jl);
+    // the result from bessel_test.cpp calculated by
+    // ModuleBase::Sph_Bessel_Recursive::D1 is 0.0327117051555907
+    // reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
+    EXPECT_NEAR(mean(jl, msh) / 0.032711705053977857549, 1.0, doublethreshold);
+
+    // int l = 7;
+    ModuleBase::Sphbes::Spherical_Bessel(msh, r, q, l7, jl);
+    // the result from bessel_test.cpp calculated by
+    // ModuleBase::Sph_Bessel_Recursive::D1 is 0.0152155566653926
+    // reference result is calculated by Sphbes::Spherical_Bessel(msh,r,q,l,jl)
+    EXPECT_NEAR(mean(jl, msh) / 0.015215556095798710851, 1.0, doublethreshold);
 }
 
-TEST_F(Sphbes,dSpherical_Bessel_dx)
+TEST_F(Sphbes, dSpherical_Bessel_dx)
 {
     double djl0;
-    for(int il = 0 ; il <= l7 ; ++il)
-    {   
-        if(il == 1) djl0 = 1.0/3.0;
-        else        djl0 = 0.0;
-        ModuleBase::Sphbes::dSpherical_Bessel_dx(msh,r,q,il,djl);
-        ModuleBase::Sphbes::Spherical_Bessel(msh,r,q,il,jl);
+    for (int il = 0; il <= l7; ++il)
+    {
+        if (il == 1)
+            djl0 = 1.0 / 3.0;
+        else
+            djl0 = 0.0;
+        ModuleBase::Sphbes::dSpherical_Bessel_dx(msh, r, q, il, djl);
+        ModuleBase::Sphbes::Spherical_Bessel(msh, r, q, il, jl);
         EXPECT_NEAR(djl[0], djl0, 1e-8);
-        for(int i = 1; i < msh - 1; ++i)
+        for (int i = 1; i < msh - 1; ++i)
         {
-            if(jl[i-1] < 1e-8) continue;
-            double djl_diff = (jl[i+1] - jl[i-1])/(q*(r[i+1] - r[i-1]));
+            if (jl[i - 1] < 1e-8)
+                continue;
+            double djl_diff = (jl[i + 1] - jl[i - 1]) / (q * (r[i + 1] - r[i - 1]));
             EXPECT_NEAR(djl[i], djl_diff, 1e-4);
         }
-        ModuleBase::Sphbes::dSpherical_Bessel_dx(msh,r,0,il,djl);
-        for(int i = 0 ; i < msh ; ++i)
+        ModuleBase::Sphbes::dSpherical_Bessel_dx(msh, r, 0, il, djl);
+        for (int i = 0; i < msh; ++i)
         {
             EXPECT_NEAR(djl[i], djl0, 1e-8);
         }
     }
 }
 
-TEST_F(Sphbes,SphericalBesselRoots)
+TEST_F(Sphbes, SphericalBesselRoots)
 {
     int neign = 100;
-    double **eign = new double*[8];
-    for(int i=0;i<8;++i)
+    double** eign = new double*[8];
+    for (int i = 0; i < 8; ++i)
     {
         eign[i] = new double[neign];
-        ModuleBase::Sphbes::Spherical_Bessel_Roots(neign,i,1.0e-12,eign[i],10.0);
+        ModuleBase::Sphbes::Spherical_Bessel_Roots(neign, i, 1.0e-12, eign[i], 10.0);
     }
 
-    EXPECT_NEAR(eign[0][0]/0.31415926535899563188,       1.0,doublethreshold);
-    EXPECT_NEAR(eign[0][99]/31.415926535896932847,       1.0,doublethreshold);
-    EXPECT_NEAR(mean(eign[0],100)/15.865042900628463229, 1.0,doublethreshold);
-    EXPECT_NEAR(eign[1][0]/0.44934094579091843347,       1.0,doublethreshold);
-    EXPECT_NEAR(eign[1][99]/31.572689440204385392,       1.0,doublethreshold);
-    EXPECT_NEAR(mean(eign[1],100)/16.020655759558295017, 1.0,doublethreshold);
-    EXPECT_NEAR(eign[2][0]/0.57634591968946913276,       1.0,doublethreshold);
-    EXPECT_NEAR(eign[2][99]/31.729140298172534784,       1.0,doublethreshold);
-    EXPECT_NEAR(mean(eign[2],100)/16.175128483074864505, 1.0,doublethreshold);
-    EXPECT_NEAR(eign[3][0]/0.69879320005004752492,       1.0,doublethreshold);
-    EXPECT_NEAR(eign[3][99]/31.885283678838447941,       1.0,doublethreshold);
-    EXPECT_NEAR(mean(eign[3],100)/16.328616567969248763, 1.0,doublethreshold);
-    EXPECT_NEAR(eign[4][0]/0.81825614525711076741,       1.0,doublethreshold);
-    EXPECT_NEAR(eign[4][99]/32.041124042016576823,       1.0,doublethreshold);
-    EXPECT_NEAR(mean(eign[4],100)/16.481221742387987206, 1.0,doublethreshold);
-    EXPECT_NEAR(eign[5][0]/0.93558121110426506473,       1.0,doublethreshold);
-    EXPECT_NEAR(eign[5][99]/32.196665741899131774,       1.0,doublethreshold);
-    EXPECT_NEAR(mean(eign[5],100)/16.633019118735202113, 1.0,doublethreshold);
-    EXPECT_NEAR(eign[6][0]/1.051283540809391015,         1.0,doublethreshold);
-    EXPECT_NEAR(eign[6][99]/32.351913030537232885,       1.0,doublethreshold);
-    EXPECT_NEAR(mean(eign[6],100)/16.784067905062840964, 1.0,doublethreshold);
-    EXPECT_NEAR(eign[7][0]/1.1657032192516516567,        1.0,doublethreshold);
-    EXPECT_NEAR(eign[7][99]/32.506870061157627561,       1.0,doublethreshold);
-    EXPECT_NEAR(mean(eign[7],100)/16.934416735327332049, 1.0,doublethreshold);
-    
-    for(int i=0;i<8;++i) delete [] eign[i];
-    delete [] eign;
+    EXPECT_NEAR(eign[0][0] / 0.31415926535899563188, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[0][99] / 31.415926535896932847, 1.0, doublethreshold);
+    EXPECT_NEAR(mean(eign[0], 100) / 15.865042900628463229, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[1][0] / 0.44934094579091843347, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[1][99] / 31.572689440204385392, 1.0, doublethreshold);
+    EXPECT_NEAR(mean(eign[1], 100) / 16.020655759558295017, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[2][0] / 0.57634591968946913276, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[2][99] / 31.729140298172534784, 1.0, doublethreshold);
+    EXPECT_NEAR(mean(eign[2], 100) / 16.175128483074864505, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[3][0] / 0.69879320005004752492, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[3][99] / 31.885283678838447941, 1.0, doublethreshold);
+    EXPECT_NEAR(mean(eign[3], 100) / 16.328616567969248763, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[4][0] / 0.81825614525711076741, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[4][99] / 32.041124042016576823, 1.0, doublethreshold);
+    EXPECT_NEAR(mean(eign[4], 100) / 16.481221742387987206, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[5][0] / 0.93558121110426506473, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[5][99] / 32.196665741899131774, 1.0, doublethreshold);
+    EXPECT_NEAR(mean(eign[5], 100) / 16.633019118735202113, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[6][0] / 1.051283540809391015, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[6][99] / 32.351913030537232885, 1.0, doublethreshold);
+    EXPECT_NEAR(mean(eign[6], 100) / 16.784067905062840964, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[7][0] / 1.1657032192516516567, 1.0, doublethreshold);
+    EXPECT_NEAR(eign[7][99] / 32.506870061157627561, 1.0, doublethreshold);
+    EXPECT_NEAR(mean(eign[7], 100) / 16.934416735327332049, 1.0, doublethreshold);
+
+    for (int i = 0; i < 8; ++i)
+        delete[] eign[i];
+    delete[] eign;
 }
 
-
 // TEST_F(Sphbes, SeriesAndRecurrence)
 // {
 //     // This test checks whether Spherical_Bessel and sphbesj agree with each other
@@ -256,41 +259,46 @@ TEST_F(Sphbes, SphericalBesselPrecisionGrid)
     double* Y = new double[nr * (l_hi - l_lo + 1) + 10];
 
     // case 0: x = 0, l = 0
-    EXPECT_NEAR(ModuleBase::Sphbes::sphbesj(0,0), 1.0, 1e-12);
+    EXPECT_NEAR(ModuleBase::Sphbes::sphbesj(0, 0), 1.0, 1e-12);
     // case 1: x = 0, l = 1, ... , 11
     for (int l = 1; l <= l_hi; ++l)
     {
-        EXPECT_NEAR(ModuleBase::Sphbes::sphbesj(l,0), 0.0, 1e-12);
+        EXPECT_NEAR(ModuleBase::Sphbes::sphbesj(l, 0), 0.0, 1e-12);
     }
     // case 2: wide range of x and l
     double y;
     // read reference data
     std::ifstream fin("data/bjo.bin", std::ios::binary);
     int i = 0;
-    while (fin.read(reinterpret_cast<char*>(&y), sizeof(double))){
-        Y[i] = y; ++i;
+    while (fin.read(reinterpret_cast<char*>(&y), sizeof(double)))
+    {
+        Y[i] = y;
+        ++i;
     }
     fin.close();
 
-    for(int i = 0; i < nr; ++i){
+    for (int i = 0; i < nr; ++i)
+    {
         r[i] = (i + 1) * dr;
     }
 
     // test for new sphbesj
     for (int l = l_lo; l <= l_hi; ++l)
     {
-        for(int i = 0; i< nr; ++i){
+        for (int i = 0; i < nr; ++i)
+        {
             EXPECT_NEAR(ModuleBase::Sphbes::sphbesj(l, r[i] * q), Y[l * nr + i], 1e-12);
         }
-    }   
+    }
     // test for old Bessel
-    // most of l cases precision failed to achieve 1e-12 
+    // most of l cases precision failed to achieve 1e-12
     double* jl_old = new double[nr + 10];
     for (int l = l_lo; l <= l_hi; ++l)
     {
         ModuleBase::Sphbes::Spherical_Bessel(nr, r, q, l, jl_old);
         double errs = 0.0;
-        for(int i = 0; i< nr; ++i){
+        for (int i = 0; i < nr; ++i)
+        {
             errs += MAX(jl_old[i] - Y[l * nr + i], Y[l * nr + i] - jl_old[i]);
         }
         // std::cout << " l = " << l << ", errors: " << std::scientific << errs / nr << std::endl;
@@ -313,20 +321,24 @@ TEST_F(Sphbes, SphericalBesselPrecisionNearZero)
     double y;
     std::ifstream fin("data/bjxo.bin", std::ios::binary);
     int i = 0;
-    while (fin.read(reinterpret_cast<char*>(&y), sizeof(double))){
-        Y[i] = y; ++i;
+    while (fin.read(reinterpret_cast<char*>(&y), sizeof(double)))
+    {
+        Y[i] = y;
+        ++i;
     }
     fin.close();
     // generate x
     x[0] = 1.0 / (1 << 5);
-    for(int i = 1; i < n; i++){
-        x[i] = x[i-1] / 2;
+    for (int i = 1; i < n; i++)
+    {
+        x[i] = x[i - 1] / 2;
     }
 
     // test for sphbesj near zero
     for (int l = l_lo; l <= l_hi; ++l)
     {
-        for(int i = 0; i< n; ++i){
+        for (int i = 0; i < n; ++i)
+        {
             EXPECT_NEAR(ModuleBase::Sphbes::sphbesj(l, x[i]), Y[l * n + i], 1e-12);
         }
     }
@@ -351,29 +363,103 @@ TEST_F(Sphbes, Zeros)
     }
 }
 
-int main(int argc, char **argv)
+TEST_F(Sphbes, ZerosOld)
+{
+    // This test checks whether Spherical_Bessel_Roots properly computes the zeros of sphbesj.
+
+    int lmax = 7;
+    int nzeros = 50;
+    double* zeros = new double[nzeros];
+    for (int l = 0; l <= lmax; ++l)
+    {
+        ModuleBase::Sphbes::Spherical_Bessel_Roots(nzeros, l, 1e-7, zeros, 1.0);
+        for (int i = 0; i < nzeros; ++i)
+        {
+            EXPECT_LT(std::abs(ModuleBase::Sphbes::sphbesj(l, zeros[i])), 1e-7);
+        }
+    }
+}
+
+TEST_F(Sphbes, Derivatives)
+{
+    int lmax = 20;
+    int numr = 20;
+    double* r = new double[numr];
+    double* djl = new double[numr];
+    double q = 0.001;
+    r[0] = 1.0;
+    for (int i = 0; i < numr; ++i)
+    {
+        r[i + 1] = r[i] * 2.0;
+    }
+
+    for (int l = 0; l <= lmax; ++l)
+    {
+        ModuleBase::Sphbes::dsphbesj(numr, r, q, l, djl);
+        for (int i = 0; i < numr; ++i)
+        {
+            double h = 1e-8;
+            EXPECT_LT(
+                abs(djl[i] * 2 * h
+                    - (ModuleBase::Sphbes::sphbesj(l, q * r[i] + h) - ModuleBase::Sphbes::sphbesj(l, q * r[i] - h))),
+                1e-14);
+        }
+    }
+}
+
+TEST_F(Sphbes, DerivativesOld)
+{
+    int lmax = 20;
+    int numr = 20;
+    double* r = new double[numr];
+    double* djl = new double[numr];
+    double q = 0.001;
+    r[0] = 1.0;
+    for (int i = 0; i < numr; ++i)
+    {
+        r[i + 1] = r[i] * 2.0;
+    }
+
+    for (int l = 0; l < lmax; l++)
+    {
+        ModuleBase::Sphbes::dSpherical_Bessel_dx(numr, r, q, l, djl);
+        for (int i = 0; i < numr; i++)
+        {
+            double h = 1e-8;
+            double errs
+                = abs(djl[i] * 2 * h
+                      - (ModuleBase::Sphbes::sphbesj(l, q * r[i] + h) - ModuleBase::Sphbes::sphbesj(l, q * r[i] - h)));
+            if (errs > 1e-14)
+            {
+                std::cout << "l = " << l << ", r = " << r[i] << ", errs = " << errs << std::endl;
+            }
+        }
+    }
+}
+
+int main(int argc, char** argv)
 {
 #ifdef __MPI
-	MPI_Init(&argc, &argv);
+    MPI_Init(&argc, &argv);
 #endif
 
-	testing::InitGoogleTest(&argc, argv);
+    testing::InitGoogleTest(&argc, argv);
     int result = RUN_ALL_TESTS();
 #ifdef __MPI
-	MPI_Finalize();
+    MPI_Finalize();
 #endif
-	return result;
+    return result;
 }
 
-TEST_F(Sphbes,SphericalBesselsjp)
+TEST_F(Sphbes, SphericalBesselsjp)
 {
     int iii;
-    double  *sjp = new double[msh];
-    ModuleBase::Sphbes::Spherical_Bessel(msh,r,q,l0,jl,sjp);
-    EXPECT_NEAR(mean(jl,msh)/0.2084468748396,1.0,doublethreshold);
-    for(int iii = 0 ; iii <msh ; ++iii)
-    {   
+    double* sjp = new double[msh];
+    ModuleBase::Sphbes::Spherical_Bessel(msh, r, q, l0, jl, sjp);
+    EXPECT_NEAR(mean(jl, msh) / 0.2084468748396, 1.0, doublethreshold);
+    for (int iii = 0; iii < msh; ++iii)
+    {
         EXPECT_EQ(sjp[iii], 1.0);
     }
-    delete [] sjp;
+    delete[] sjp;
 }