From fbb6f2cccd2643405dc8ce7bed57582235f9c4b8 Mon Sep 17 00:00:00 2001
From: Zhao Tianqi <hongriTianqi@users.noreply.github.com>
Date: Mon, 22 Apr 2024 10:23:02 +0800
Subject: [PATCH 01/18] Feature: output mag force in spin constraint
 calculation (#4037)

* Feature: output mag force in spin constraint calculation

* add ut for print_Mag_Force
---
 source/module_esolver/esolver_ks_lcao.cpp     |  1 +
 .../module_deltaspin/lambda_loop_helper.cpp   |  4 +-
 .../module_deltaspin/spin_constrain.cpp       | 53 +++++++++++++++++--
 .../module_deltaspin/spin_constrain.h         |  3 ++
 .../test/cal_mw_helper_test.cpp               |  6 ++-
 .../test/lambda_loop_helper_test.cpp          |  5 ++
 .../test/spin_constrain_test.cpp              |  6 ++-
 7 files changed, 69 insertions(+), 9 deletions(-)
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index d142ae8e24..949e6c4ba7 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -1131,6 +1131,7 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(const int istep)
     {
         SpinConstrain<TK, psi::DEVICE_CPU>& sc = SpinConstrain<TK, psi::DEVICE_CPU>::getScInstance();
         sc.cal_MW(istep, &(this->LM), true);
+        sc.print_Mag_Force();
     }
 
     if (!GlobalV::CAL_FORCE && !GlobalV::CAL_STRESS)
diff --git a/source/module_hamilt_lcao/module_deltaspin/lambda_loop_helper.cpp b/source/module_hamilt_lcao/module_deltaspin/lambda_loop_helper.cpp
index dcebe43f7f..38a53b3e68 100644
--- a/source/module_hamilt_lcao/module_deltaspin/lambda_loop_helper.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/lambda_loop_helper.cpp
@@ -4,8 +4,8 @@
 template <>
 void SpinConstrain<std::complex<double>, psi::DEVICE_CPU>::print_termination()
 {
-    print_2d("after-optimization spin: (print in the inner loop): ", this->Mi_, this->nspin_);
-    print_2d("after-optimization lambda: (print in the inner loop): ", this->lambda_, this->nspin_);
+    print_2d("after-optimization spin (uB): (print in the inner loop): ", this->Mi_, this->nspin_);
+    print_2d("after-optimization lambda (Ry/uB): (print in the inner loop): ", this->lambda_, this->nspin_);
     std::cout << "Inner optimization for lambda ends." << std::endl;
     std::cout << "===============================================================================" << std::endl;
 }
diff --git a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp
index 8fce1484fc..e2c73335c3 100644
--- a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp
@@ -1,4 +1,5 @@
 #include "spin_constrain.h"
+#include "module_base/formatter_fmt.h"
 
 #include <cmath>
 
@@ -541,20 +542,66 @@ void SpinConstrain<FPTYPE, Device>::print_Mi(bool print)
     int nat = this->get_nat();
     if (print)
     {
+        formatter::Fmt fmt;
+        fmt.set_width(20);
+        fmt.set_precision(10);
+        fmt.set_fillChar(' ');
+        fmt.set_fixed(false);
+        fmt.set_right(true);
+        fmt.set_error(false);
+        std::cout << "Total Magnetism (uB): " << std::endl;
         for (int iat = 0; iat < nat; ++iat)
         {
             if (this->nspin_ == 2)
             {
-                std::cout << "Total Magnetism on atom: " << iat << " " << std::setprecision(10) << " (" << Mi_[iat].z << ")" << std::endl;
+                std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(Mi_[iat].z) << std::endl;
             }
             else if (this->nspin_ ==4)
             {
-                std::cout << "Total Magnetism on atom: " << iat << " " << std::setprecision(10) << " (" << Mi_[iat].x
-                        << ", " << Mi_[iat].y << ", " << Mi_[iat].z << ")" << std::endl;
+                std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(Mi_[iat].x) << fmt.format(Mi_[iat].y) << fmt.format(Mi_[iat].z) << std::endl;
             }
         }
     }
 }
 
+/// print magnetic force (defined as \frac{\delta{L}}/{\delta{Mi}} = -lambda[iat])
+template <typename FPTYPE, typename Device>
+void SpinConstrain<FPTYPE, Device>::print_Mag_Force()
+{
+    this->check_atomCounts();
+    int nat = this->get_nat();
+    formatter::Fmt fmt;
+    fmt.set_width(20);
+    fmt.set_precision(10);
+    fmt.set_fillChar(' ');
+    fmt.set_fixed(false);
+    fmt.set_right(true);
+    fmt.set_error(false);
+    std::cout << "Final optimal lambda (Ry/uB): " << std::endl;
+    for (int iat = 0; iat < nat; ++iat)
+    {
+        if (this->nspin_ == 2)
+        {
+            std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(lambda_[iat].z) << std::endl;
+        }
+        else if (this->nspin_ ==4)
+        {
+            std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(lambda_[iat].x) << fmt.format(lambda_[iat].y) << fmt.format(lambda_[iat].z) << std::endl;
+        }
+    }
+    std::cout << "Magnetic force (Ry/uB): " << std::endl;
+    for (int iat = 0; iat < nat; ++iat)
+    {
+        if (this->nspin_ == 2)
+        {
+            std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(-lambda_[iat].z) << std::endl;
+        }
+        else if (this->nspin_ ==4)
+        {
+            std::cout << "ATOM " << std::left << std::setw(6) << iat << fmt.format(-lambda_[iat].x) << fmt.format(-lambda_[iat].y) << fmt.format(-lambda_[iat].z) << std::endl;
+        }
+    }
+}
+
 template class SpinConstrain<std::complex<double>, psi::DEVICE_CPU>;
 template class SpinConstrain<double, psi::DEVICE_CPU>;
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
index 3ec76ac04f..a966467586 100644
--- a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
+++ b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
@@ -87,6 +87,9 @@ class SpinConstrain
   /// print mi
   void print_Mi(bool print = false);
 
+  /// print magnetic force, defined as \frac{\delta{L}}/{\delta{Mi}} = -lambda[iat])
+  void print_Mag_Force();
+
   /// collect_mw from matrix multiplication result
   void collect_MW(ModuleBase::matrix& MecMulP, const ModuleBase::ComplexMatrix& mud, int nw, int isk);
 
diff --git a/source/module_hamilt_lcao/module_deltaspin/test/cal_mw_helper_test.cpp b/source/module_hamilt_lcao/module_deltaspin/test/cal_mw_helper_test.cpp
index 628cc4f2c1..4c8bb14dac 100644
--- a/source/module_hamilt_lcao/module_deltaspin/test/cal_mw_helper_test.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/test/cal_mw_helper_test.cpp
@@ -66,7 +66,8 @@ TEST_F(SpinConstrainTest, CalculateMW)
     testing::internal::CaptureStdout();
     sc.print_Mi(true);
     std::string output = testing::internal::GetCapturedStdout();
-    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism on atom: 0  (2, 3, 4)"));
+    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism (uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 0         2.0000000000e+00    3.0000000000e+00    4.0000000000e+00"));
 }
 
 TEST_F(SpinConstrainTest, CollectMW)
@@ -141,7 +142,8 @@ TEST_F(SpinConstrainTest, CalculateMWS2)
     testing::internal::CaptureStdout();
     sc.print_Mi(true);
     std::string output = testing::internal::GetCapturedStdout();
-    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism on atom: 0  (-1)"));
+    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism (uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 0        -1.0000000000e+00"));
 }
 
 TEST_F(SpinConstrainTest, CollectMWS2)
diff --git a/source/module_hamilt_lcao/module_deltaspin/test/lambda_loop_helper_test.cpp b/source/module_hamilt_lcao/module_deltaspin/test/lambda_loop_helper_test.cpp
index d5d4e42144..a7183263fe 100644
--- a/source/module_hamilt_lcao/module_deltaspin/test/lambda_loop_helper_test.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/test/lambda_loop_helper_test.cpp
@@ -40,10 +40,15 @@ TEST_F(SpinConstrainTest, PrintTermination)
     sc.set_sc_lambda(sc_lambda.data(), 1);
     testing::internal::CaptureStdout();
     sc.print_termination();
+    sc.print_Mag_Force();
     std::string output = testing::internal::GetCapturedStdout();
     EXPECT_THAT(output, testing::HasSubstr("Inner optimization for lambda ends."));
     EXPECT_THAT(output, testing::HasSubstr("ATOM 1         0.0000000000e+00    0.0000000000e+00    0.0000000000e+00"));
     EXPECT_THAT(output, testing::HasSubstr("ATOM 1         1.0000000000e+00    2.0000000000e+00    3.0000000000e+00"));
+    EXPECT_THAT(output, testing::HasSubstr("Final optimal lambda (Ry/uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 1         1.0000000000e+00    2.0000000000e+00    3.0000000000e+00"));
+    EXPECT_THAT(output, testing::HasSubstr("Magnetic force (Ry/uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 0        -1.0000000000e+00   -2.0000000000e+00   -3.0000000000e+00"));
 }
 
 TEST_F(SpinConstrainTest, CheckRmsStop)
diff --git a/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp b/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp
index e23add1002..cd1f7c2956 100644
--- a/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp
@@ -398,10 +398,12 @@ TYPED_TEST(SpinConstrainTest, PrintMi)
     testing::internal::CaptureStdout();
     this->sc.print_Mi(true);
     std::string output = testing::internal::GetCapturedStdout();
-    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism on atom: 0  (0, 0, 0)"));
+    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism (uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 0         0.0000000000e+00    0.0000000000e+00    0.0000000000e+00"));
     this->sc.set_nspin(2);
      testing::internal::CaptureStdout();
     this->sc.print_Mi(true);
     output = testing::internal::GetCapturedStdout();
-    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism on atom: 0  (0)"));
+    EXPECT_THAT(output, testing::HasSubstr("Total Magnetism (uB):"));
+    EXPECT_THAT(output, testing::HasSubstr("ATOM 0         0.0000000000e+00"));
 }
\ No newline at end of file

From 6eb39e8431adb480c58f10ca71b5425a73998d28 Mon Sep 17 00:00:00 2001
From: Chun Cai <amoycaic@gmail.com>
Date: Mon, 22 Apr 2024 10:28:55 +0800
Subject: [PATCH 02/18] CI: update cuda tests (#4032)

Co-authored-by: Haozhi Han <haozhi.han@stu.pku.edu.cn>
---
 .github/workflows/cuda.yml | 81 ++++----------------------------------
 1 file changed, 7 insertions(+), 74 deletions(-)

diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 2832a3e02a..6df2577bb8 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -4,88 +4,21 @@ on:
   workflow_dispatch:
 
 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: us-east-2
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: machulav/ec2-github-runner@v2
-        with:
-          mode: start
-          github-token: ${{ secrets.PAT }}
-          ec2-image-id: ami-04cd9fec4a7a39019
-          ec2-instance-type: g4dn.xlarge
-          subnet-id: subnet-72d3e53e
-          security-group-id: sg-06b0c93122c08aeab
-
   test:
-    name: Do the job on the runner
-    needs: start-runner # required to start the main job when the runner is ready
-    runs-on: ${{ needs.start-runner.outputs.label }} # run the job on the newly created runner
+    name: Test on CUDA Build
+    runs-on: nvidia
     container:
       image: ghcr.io/deepmodeling/abacus-cuda
       options: --gpus all
     steps:
       - name: Checkout
         uses: actions/checkout@v4
-      - name: Build cuSolver
+        with:
+          submodules: recursive
+      - name: Build
         run: |
           nvidia-smi
-          cmake -B build -DUSE_CUSOLVER_LCAO=ON
+          cmake -B build -DUSE_CUDA=ON -DBUILD_TESTING=ON
           cmake --build build -j4
           cmake --install build
-          cmake -B build -DBUILD_TESTING=ON
-          cmake --build build -j4 --target hsolver_diago
-      - name: Test e2e
-        run: |
-          export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-          cd tests/integrate
-          echo "ks_solver cusolver" >> ./270_NO_MD_2O/INPUT
-          ./Autotest.sh -r 270_NO_MD_2O
-      - name: Test UT
-        run: |
-          cd source/src_pdiag/test/
-          cp ../../../build/source/src_pdiag/test/hsolver_diago .
-          ./hsolver_diago
-          bash diago_parallel_test.sh
-      - name: Test performance
-        run: |
-          cd examples/performance
-          ls -d P1*lcao* > allcase
-          sed -i '/ks_solver/d' P1*lcao*/INPUT
-          sed -i '$a ks_solver cusolver' P1*lcao*/INPUT
-          bash run.sh
-          cat sumall.dat
-
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner # required to get output from the start-runner job
-      - test # required to wait when the main job is done
-    runs-on: ubuntu-latest
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: us-east-2
-      - name: Stop EC2 runner
-        uses: machulav/ec2-github-runner@v2
-        with:
-          mode: stop
-          github-token: ${{ secrets.PAT }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
+      # TODO: add tests

From f709ba050552ebf03a3bcddf4361da5eec581630 Mon Sep 17 00:00:00 2001
From: wqzhou <33364058+WHUweiqingzhou@users.noreply.github.com>
Date: Mon, 22 Apr 2024 13:27:22 +0800
Subject: [PATCH 03/18] add some docs (#4039)

---
 docs/advanced/scf/converge.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/advanced/scf/converge.md b/docs/advanced/scf/converge.md
index f74da82522..4f77f2f3b7 100644
--- a/docs/advanced/scf/converge.md
+++ b/docs/advanced/scf/converge.md
@@ -10,6 +10,8 @@ For each of the mixing types, we also provide variables for controlling relevant
 
 `mixing_ndim` is the mixing dimensions in DIIS (broyden or pulay) mixing. Gerenally, a larger `mixing_ndim` leads to a better convergence. the default choice `mixing_ndim=8` should work fine in most cases. For `mixing_type`, the default choice is `broyden`, which is slightly better than `Pulay` typically. Besides that, a large `mixing_beta` means a larger change in electron density for each SCF step. For well-behaved systems, a larger `mixing_beta` leads to faster convergence. However, for some difficult cases, a smaller `mixing_beta` is preferred to avoid numerical instabilities.
 
+For most isolated systems, Kerker preconditioning is unnecessary. You can turn off it by setting `mixing_gg0  0.0` to get a faster convergence.
+
 For non-spin-polarized calculations, the default choices usually achieve convergence. If convergence issue arises in metallic systems, you can try different value of Kerker preconditioning [mixing_gg0](../input_files/input-main.md#mixing_gg0) and [mixing_gg0_min](../input_files/input-main.md#mixing_gg0_min), and try to reduce `mixing_beta`, which is 0.8 defaultly for `nspin=1`.
 
 For magnetic calculations, `mixing_beta_mag` and `mixing_gg0_mag` are activated. Considering collinear calculations, you can rely on the default value for most cases. If convergence issue arises, you can try to reduce `mixing_beta` and `mixing_beta_mag` together. For non-collinear calculations, tradtional broyden usually works, especially for a given magnetic configuration. If one is not interested in the energies of a given magnetic configuration but wants to determine the ground state by relaxing the magnetic moments’ directions, the standard Broyden mixing algorithm sometimes fails to find the correct magnetic configuration. If so, we can set [mixing_angle=1.0](../input_files/input-main.md#mixing_angle), which is a promising mixing method proposed by J. Phys. Soc. Jpn. 82 (2013) 114706.

From 25244770a1a8ea0cd999e9b99b084f8262cefa02 Mon Sep 17 00:00:00 2001
From: wqzhou <33364058+WHUweiqingzhou@users.noreply.github.com>
Date: Tue, 23 Apr 2024 14:34:02 +0800
Subject: [PATCH 04/18] fix final mag in STRU_ION_D (#4044)

---
 source/module_cell/read_atoms.cpp    | 30 +++++++++++++++++-----------
 source/module_cell/unitcell.h        |  2 +-
 source/module_io/mulliken_charge.cpp | 16 +++++++++++++++
 3 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/source/module_cell/read_atoms.cpp b/source/module_cell/read_atoms.cpp
index c5836b8aea..acf9a42600 100644
--- a/source/module_cell/read_atoms.cpp
+++ b/source/module_cell/read_atoms.cpp
@@ -1003,6 +1003,7 @@ void UnitCell::print_stru_file(const std::string &fn, const int &type, const int
 
 	if(type==1)
 	{
+		int nat_tmp = 0;
 		ofs << "Cartesian" << std::endl;
 		for(int it=0; it<ntype; it++)
 		{
@@ -1028,28 +1029,31 @@ void UnitCell::print_stru_file(const std::string &fn, const int &type, const int
                     ofs << context.str();
                 }
 
-                if (GlobalV::NSPIN == 2)
+                if (GlobalV::NSPIN == 2 && GlobalV::out_mul)
                 {
                     // output magnetic information
                     ofs << " mag ";
                     context.set_context("double_w6_f2");
-                    context << atoms[it].mag[ia];
+                    context << atom_mulliken[nat_tmp][1];
                     ofs << context.str();
                 }
-                else if (GlobalV::NSPIN == 4)
+                else if (GlobalV::NSPIN == 4 && GlobalV::out_mul)
                 {
                     // output magnetic information
                     ofs << " mag ";
-                    context.set_context("vector3d");
-                    context << atoms[it].m_loc_[ia].x << " " << atoms[it].m_loc_[ia].y << " " << atoms[it].m_loc_[ia].z;
-                    ofs << context.str();
+					ofs << std::fixed << std::setprecision(5);
+                    ofs << std::setw(8) << atom_mulliken[nat_tmp][1] 
+						<< std::setw(8) << atom_mulliken[nat_tmp][2] 
+						<< std::setw(8) << atom_mulliken[nat_tmp][3];
                 }
                 ofs << std::endl;
+				nat_tmp++;
 			}
 		}
 	}
 	else if(type==2)
 	{
+		int nat_tmp = 0;
 		ofs << "Direct" << std::endl;
 		for(int it=0; it<ntype; it++)
 		{
@@ -1075,23 +1079,25 @@ void UnitCell::print_stru_file(const std::string &fn, const int &type, const int
                     ofs << context.str();
                 }
 
-                if (GlobalV::NSPIN == 2)
+                if (GlobalV::NSPIN == 2 && GlobalV::out_mul)
                 {
                     // output magnetic information
                     ofs << " mag ";
                     context.set_context("double_w6_f2");
-                    context << atoms[it].mag[ia];
+                    context << atom_mulliken[nat_tmp][1];
                     ofs << context.str();
                 }
-                else if (GlobalV::NSPIN == 4)
+                else if (GlobalV::NSPIN == 4 && GlobalV::out_mul)
                 {
                     // output magnetic information
                     ofs << " mag ";
-                    context.set_context("vector3d");
-                    context << atoms[it].m_loc_[ia].x << " " << atoms[it].m_loc_[ia].y << " " << atoms[it].m_loc_[ia].z;
-                    ofs << context.str();
+					ofs << std::fixed << std::setprecision(5);
+                    ofs << std::setw(8) << atom_mulliken[nat_tmp][1] 
+						<< std::setw(8) << atom_mulliken[nat_tmp][2] 
+						<< std::setw(8) << atom_mulliken[nat_tmp][3];
                 }
                 ofs << std::endl;
+				nat_tmp++;
 			}
 		}
 	}
diff --git a/source/module_cell/unitcell.h b/source/module_cell/unitcell.h
index 6223e3b9f6..1d1e520da9 100644
--- a/source/module_cell/unitcell.h
+++ b/source/module_cell/unitcell.h
@@ -23,7 +23,7 @@ class UnitCell
     Magnetism magnet;  // magnetism Yu Liu 2021-07-03
     void cal_ux();
     bool judge_parallel(double a[3],ModuleBase::Vector3<double> b);
-	double *atom_mag;
+    std::vector<std::vector<double>> atom_mulliken;  //[nat][nspin]
 	int n_mag_at;
 
     std::string& Coordinate = lat.Coordinate;
diff --git a/source/module_io/mulliken_charge.cpp b/source/module_io/mulliken_charge.cpp
index 72b0046cc3..60983030cb 100644
--- a/source/module_io/mulliken_charge.cpp
+++ b/source/module_io/mulliken_charge.cpp
@@ -312,6 +312,16 @@ void ModuleIO::out_mulliken(const int& step, LCAO_Matrix* LM, const elecstate::E
 		os << "Decomposed Mulliken populations" << std::endl;
         GlobalV::ofs_running << std::endl <<std::endl;
 
+        // alloacte space for ucell.atom_mulliken
+        if (GlobalC::ucell.atom_mulliken.empty())
+        {
+            GlobalC::ucell.atom_mulliken.resize(GlobalC::ucell.nat);
+            for (int iat = 0; iat < GlobalC::ucell.nat; iat++)
+            {
+                GlobalC::ucell.atom_mulliken[iat].resize(GlobalV::NSPIN);
+            }
+        }
+
         for (size_t i = 0; i != GlobalC::ucell.nat; ++i)
         {
             double total_charge = 0.0, atom_mag = 0.0;
@@ -428,6 +438,8 @@ void ModuleIO::out_mulliken(const int& step, LCAO_Matrix* LM, const elecstate::E
                 os << "Total Charge on atom:  " << GlobalC::ucell.atoms[t].label <<  std::setw(20) << total_charge <<std::endl;
                 os << "Total Magnetism on atom:  " << GlobalC::ucell.atoms[t].label <<  std::setw(20) << ModuleIO::output_cut(atom_mag) <<std::endl;
                 GlobalV::ofs_running << "Total Magnetism on atom:  " << GlobalC::ucell.atoms[t].label <<  std::setw(20) << std::setprecision(10) << atom_mag <<std::endl;
+                GlobalC::ucell.atom_mulliken[i][0] = total_charge;
+                GlobalC::ucell.atom_mulliken[i][1] = atom_mag;
             }
             else if (GlobalV::NSPIN==4)
             {
@@ -443,6 +455,10 @@ void ModuleIO::out_mulliken(const int& step, LCAO_Matrix* LM, const elecstate::E
                 GlobalV::ofs_running << "Total Magnetism on atom:  " << GlobalC::ucell.atoms[t].label <<  std::setw(10)
                 << "("  << std::setprecision(10) << spin2 << ", " << std::setprecision(10) << spin3 << ", " << std::setprecision(10) << spin4 << ")"
                 <<std::endl;
+                GlobalC::ucell.atom_mulliken[i][0] = spin1;
+                GlobalC::ucell.atom_mulliken[i][1] = spin2;
+                GlobalC::ucell.atom_mulliken[i][2] = spin3;
+                GlobalC::ucell.atom_mulliken[i][3] = spin4;
             }
             os << std::endl <<std::endl;
         }

From bec5e64685a0255588b7ba955963571521cb0cc0 Mon Sep 17 00:00:00 2001
From: Zhao Tianqi <hongriTianqi@users.noreply.github.com>
Date: Wed, 24 Apr 2024 12:32:28 +0800
Subject: [PATCH 05/18] Test: case test for noncolin force (#4048)

---
 tests/integrate/204_NO_KP_NC/INPUT            |   2 +
 tests/integrate/204_NO_KP_NC/STRU             |   2 +-
 tests/integrate/204_NO_KP_NC/result.ref       |   8 +-
 .../204_NO_KP_NC_deltaspin/mulliken.txt.ref   | 168 +++++++++---------
 .../204_NO_KP_NC_deltaspin/result.ref         |   7 +-
 5 files changed, 96 insertions(+), 91 deletions(-)

diff --git a/tests/integrate/204_NO_KP_NC/INPUT b/tests/integrate/204_NO_KP_NC/INPUT
index 8886378fdf..4bb8c943c4 100644
--- a/tests/integrate/204_NO_KP_NC/INPUT
+++ b/tests/integrate/204_NO_KP_NC/INPUT
@@ -23,3 +23,5 @@ smearing_sigma			0.002
 
 #Parameters (5.Mixing)
 
+
+cal_force 1
diff --git a/tests/integrate/204_NO_KP_NC/STRU b/tests/integrate/204_NO_KP_NC/STRU
index 73cf74c4f9..cd92e43261 100644
--- a/tests/integrate/204_NO_KP_NC/STRU
+++ b/tests/integrate/204_NO_KP_NC/STRU
@@ -18,5 +18,5 @@ Direct
 Si	// Element type	
 0.0	// magnetism
 2	
-0.00 0.00 0.00 1 1 1 mag 1.0 angle1 90 angle2 0
+0.10 0.00 0.00 1 1 1 mag 1.0 angle1 90 angle2 0
 0.25 0.25 0.25 1 1 1 mag 1.0 angle1 90 angle2 180
diff --git a/tests/integrate/204_NO_KP_NC/result.ref b/tests/integrate/204_NO_KP_NC/result.ref
index e0ce28aca8..d1c497de0c 100644
--- a/tests/integrate/204_NO_KP_NC/result.ref
+++ b/tests/integrate/204_NO_KP_NC/result.ref
@@ -1,3 +1,5 @@
-etotref -211.1858438233880
-etotperatomref -105.5929219117
-totaltimeref 1.3488
+etotref -210.3233812890469
+etotperatomref -105.1616906445
+totalforceref 17.244244
+totaltimeref 28.3911
+23.58
diff --git a/tests/integrate/204_NO_KP_NC_deltaspin/mulliken.txt.ref b/tests/integrate/204_NO_KP_NC_deltaspin/mulliken.txt.ref
index bffad6b08a..168c5723a3 100644
--- a/tests/integrate/204_NO_KP_NC_deltaspin/mulliken.txt.ref
+++ b/tests/integrate/204_NO_KP_NC_deltaspin/mulliken.txt.ref
@@ -3,92 +3,92 @@ CALCULATE THE MULLIkEN ANALYSIS FOR EACH ATOM
  Total charge:	32
 Decomposed Mulliken populations
 0                 Zeta of Fe                        Spin 1                        Spin 2                        Spin 3                        Spin 4
-s                        0                           1.317                       0.06196                       -0.2625                      -0.07949
-  sum over m                                         1.317                       0.06196                       -0.2625                      -0.07949
-s                        1                           1.726                      -0.01809                       0.09886                      -0.01413
-  sum over m                                         1.726                      -0.01809                       0.09886                      -0.01413
-s                        2                         0.03246                      -0.04153                        0.2209                      -0.02228
-  sum over m                                       0.03246                      -0.04153                        0.2209                      -0.02228
-s                        3                        -0.02921                      0.005609                        -0.025                     -0.005114
-  sum over m                                      -0.02921                      0.005609                        -0.025                     -0.005114
-  sum over m+zeta                                    3.046                      0.007945                        0.0323                        -0.121
-pz                        0                           2.034                     -0.001186                      0.005932                    -3.981e-06
-px                        0                           2.033                     -0.001283                      0.006419                    -3.989e-06
-py                        0                           2.033                     -0.001188                      0.005944                    -3.979e-06
-  sum over m                                           6.1                     -0.003658                        0.0183                    -1.195e-05
-pz                        1                        -0.02621                     0.0005578                     -0.002789                             0
-px                        1                        -0.02639                     0.0006107                     -0.003054                             0
-py                        1                        -0.02603                     0.0005536                     -0.002768                             0
-  sum over m                                      -0.07863                      0.001722                     -0.008611                             0
-  sum over m+zeta                                    6.021                     -0.001936                      0.009684                    -1.277e-05
-dz^2                        0                           1.964                     0.0008269                     -0.004128                    -1.088e-05
-dxz                        0                           1.044                         0.156                       -0.7849                    -0.0003055
-dyz                        0                          0.9592                        0.1564                       -0.7869                    -0.0003096
-dx^2-y^2                        0                           1.967                      0.000752                     -0.003754                    -1.059e-05
-dxy                        0                           1.055                        0.1558                       -0.7835                    -0.0003047
-  sum over m                                         6.988                        0.4698                        -2.363                    -0.0009413
-dz^2                        1                         0.03863                    -0.0008716                      0.004365                    -1.357e-05
-dxz                        1                        -0.03708                     -0.004148                       0.02101                     1.956e-05
-dyz                        1                        -0.03373                     -0.004494                       0.02274                     1.968e-05
-dx^2-y^2                        1                         0.03943                    -0.0009117                      0.004566                    -1.471e-05
-dxy                        1                        -0.03733                     -0.004056                       0.02055                     1.945e-05
-  sum over m                                      -0.03008                      -0.01448                       0.07324                     3.041e-05
-  sum over m+zeta                                    6.958                        0.4553                         -2.29                    -0.0009109
-fz^3                        0                       -0.007044                     0.0007552                     -0.003776                    -1.406e-06
-fxz^2                        0                       -0.002046                     0.0002628                     -0.001314                             0
-fyz^2                        0                        -0.00273                       0.00029                      -0.00145                             0
-fzx^2-zy^2                        0                       5.811e-05                             0                     3.451e-06                             0
-fxyz                        0                        1.14e-05                     1.249e-06                    -6.306e-06                             0
-fx^3-3*xy^2                        0                       -0.003379                     0.0004381                      -0.00219                             0
-f3yx^2-y^3                        0                        -0.00407                     0.0004626                     -0.002313                             0
-  sum over m                                       -0.0192                      0.002209                      -0.01105                    -4.307e-06
-  sum over m+zeta                                  -0.0192                      0.002209                      -0.01105                    -4.307e-06
-Total Charge on atom:  Fe               16.01
-Total Magnetism on atom:  Fe                   (0.4635, -2.259, -0.1219)
+s                        0                           1.317                       0.05552                        0.2843                       0.02903
+  sum over m                                         1.317                       0.05552                        0.2843                       0.02903
+s                        1                           1.726                      -0.01923                      -0.09498                      0.005159
+  sum over m                                         1.726                      -0.01923                      -0.09498                      0.005159
+s                        2                         0.03246                      -0.04333                       -0.2148                      0.008137
+  sum over m                                       0.03246                      -0.04333                       -0.2148                      0.008137
+s                        3                        -0.02921                      0.005194                       0.02641                      0.001867
+  sum over m                                      -0.02921                      0.005194                       0.02641                      0.001867
+  sum over m+zeta                                    3.046                     -0.001842                     0.0009368                       0.04419
+pz                        0                           2.034                     -0.001185                     -0.005932                     1.545e-06
+px                        0                           2.033                     -0.001283                     -0.006419                     1.538e-06
+py                        0                           2.033                     -0.001188                     -0.005944                     1.543e-06
+  sum over m                                           6.1                     -0.003656                      -0.01829                     4.626e-06
+pz                        1                        -0.02622                     0.0005602                      0.002791                             0
+px                        1                        -0.02639                     0.0006145                      0.003054                             0
+py                        1                        -0.02603                     0.0005563                       0.00277                             0
+  sum over m                                      -0.07864                      0.001731                      0.008615                             0
+  sum over m+zeta                                    6.021                     -0.001925                      -0.00968                     5.611e-06
+dz^2                        0                           1.964                     0.0008273                      0.004131                     4.077e-06
+dxz                        0                           1.044                        0.1755                        0.7507                      0.002258
+dyz                        0                          0.9544                        0.1768                        0.7532                      0.002329
+dx^2-y^2                        0                           1.967                     0.0007523                      0.003756                     3.978e-06
+dxy                        0                           1.055                        0.1751                        0.7495                      0.002251
+  sum over m                                         6.984                         0.529                         2.261                      0.006846
+dz^2                        1                         0.03863                    -0.0008699                     -0.004363                     5.197e-06
+dxz                        1                        -0.03759                     -0.005346                      -0.01936                    -0.0001322
+dyz                        1                        -0.03407                     -0.005734                      -0.02118                    -0.0001342
+dx^2-y^2                        1                         0.03943                    -0.0009093                     -0.004564                     5.691e-06
+dxy                        1                        -0.03787                     -0.005246                       -0.0189                    -0.0001314
+  sum over m                                      -0.03146                      -0.01811                      -0.06836                     -0.000387
+  sum over m+zeta                                    6.952                        0.5109                         2.193                      0.006459
+fz^3                        0                       -0.007049                     0.0007578                      0.003775                             0
+fxz^2                        0                       -0.002045                     0.0002638                      0.001312                             0
+fyz^2                        0                       -0.002729                     0.0002912                      0.001448                             0
+fzx^2-zy^2                        0                       6.273e-05                             0                    -6.642e-06                             0
+fxyz                        0                       1.153e-05                     1.446e-06                     5.675e-06                             0
+fx^3-3*xy^2                        0                        -0.00338                       0.00044                      0.002189                             0
+f3yx^2-y^3                        0                        -0.00407                     0.0004646                      0.002311                             0
+  sum over m                                       -0.0192                      0.002219                       0.01103                     2.581e-06
+  sum over m+zeta                                  -0.0192                      0.002219                       0.01103                     2.581e-06
+Total Charge on atom:  Fe                  16
+Total Magnetism on atom:  Fe                   (0.5093, 2.195, 0.05066)
 
 
 1                 Zeta of Fe                        Spin 1                        Spin 2                        Spin 3                        Spin 4
-s                        0                           1.275                       0.04699                       -0.2823                       0.07949
-  sum over m                                         1.275                       0.04699                       -0.2823                       0.07949
-s                        1                           1.755                      -0.01866                       0.08491                       0.01412
-  sum over m                                         1.755                      -0.01866                       0.08491                       0.01412
-s                        2                        -0.02899                      -0.04221                        0.1978                       0.02226
-  sum over m                                      -0.02899                      -0.04221                        0.1978                       0.02226
-s                        3                        -0.04712                       0.00595                      -0.03281                      0.005133
-  sum over m                                      -0.04712                       0.00595                      -0.03281                      0.005133
-  sum over m+zeta                                    2.954                     -0.007928                      -0.03239                         0.121
-pz                        0                           2.032                     -0.001371                       0.00685                     3.967e-06
-px                        0                           2.025                    -0.0009218                      0.004606                     3.958e-06
-py                        0                           2.032                     -0.001333                      0.006664                     3.965e-06
-  sum over m                                         6.089                     -0.003626                       0.01812                     1.189e-05
-pz                        1                        -0.02529                     0.0005803                     -0.002904                             0
-px                        1                        -0.01606                     0.0001295                    -0.0006492                             0
-py                        1                        -0.02466                     0.0005625                     -0.002815                             0
-  sum over m                                      -0.06602                      0.001272                     -0.006367                             0
-  sum over m+zeta                                    6.023                     -0.002353                       0.01175                      1.25e-05
-dz^2                        0                           1.957                      0.001154                     -0.005778                     1.149e-05
-dxz                        0                           1.091                        0.1517                       -0.7637                    -8.462e-05
-dyz                        0                          0.9556                        0.1553                       -0.7815                    -8.443e-05
-dx^2-y^2                        0                           1.947                      0.001648                     -0.008249                     1.233e-05
-dxy                        0                           1.106                        0.1508                       -0.7591                    -8.432e-05
-  sum over m                                         7.056                        0.4606                        -2.318                    -0.0002295
-dz^2                        1                         0.03925                     -0.001067                      0.005328                     1.289e-05
-dxz                        1                        -0.03558                     -0.002824                       0.01439                       2.5e-06
-dyz                        1                        -0.03117                     -0.003962                        0.0201                     2.798e-06
-dx^2-y^2                        1                         0.04266                     -0.001401                      0.006997                      1.29e-05
-dxy                        1                        -0.03637                     -0.002747                       0.01401                     2.475e-06
-  sum over m                                      -0.02122                        -0.012                       0.06082                     3.356e-05
-  sum over m+zeta                                    7.035                        0.4486                        -2.257                     -0.000196
-fz^3                        0                       -0.006615                     0.0007206                     -0.003605                     1.352e-06
-fxz^2                        0                       -0.001955                     0.0002554                     -0.001278                             0
-fyz^2                        0                       -0.002684                     0.0002735                     -0.001368                             0
-fzx^2-zy^2                        0                       9.383e-05                      1.68e-05                    -8.473e-05                             0
-fxyz                        0                       2.053e-05                      3.66e-06                    -1.839e-05                             0
-fx^3-3*xy^2                        0                       -0.003204                     0.0004266                     -0.002134                             0
-f3yx^2-y^3                        0                       -0.003695                     0.0004558                     -0.002281                             0
-  sum over m                                      -0.01804                      0.002152                      -0.01077                     4.022e-06
-  sum over m+zeta                                 -0.01804                      0.002152                      -0.01077                     4.022e-06
-Total Charge on atom:  Fe               15.99
-Total Magnetism on atom:  Fe                   (0.4405, -2.289, 0.1208)
+s                        0                           1.275                       0.05341                        0.2605                      -0.02903
+  sum over m                                         1.275                       0.05341                        0.2605                      -0.02903
+s                        1                           1.755                      -0.01752                      -0.08879                     -0.005156
+  sum over m                                         1.755                      -0.01752                      -0.08879                     -0.005156
+s                        2                        -0.02898                       -0.0404                       -0.2039                      -0.00813
+  sum over m                                      -0.02898                       -0.0404                       -0.2039                      -0.00813
+s                        3                        -0.04711                      0.006367                       0.03139                     -0.001874
+  sum over m                                      -0.04711                      0.006367                       0.03139                     -0.001874
+  sum over m+zeta                                    2.954                      0.001862                    -0.0008532                      -0.04419
+pz                        0                           2.032                     -0.001369                     -0.006852                    -1.367e-06
+px                        0                           2.025                    -0.0009208                     -0.004608                    -1.387e-06
+py                        0                           2.032                     -0.001332                     -0.006666                    -1.366e-06
+  sum over m                                         6.089                     -0.003622                      -0.01813                    -4.119e-06
+pz                        1                        -0.02528                     0.0005889                      0.002889                             0
+px                        1                        -0.01606                     0.0001369                     0.0006408                             0
+py                        1                        -0.02466                      0.000571                      0.002802                             0
+  sum over m                                        -0.066                      0.001297                      0.006331                     2.367e-06
+  sum over m+zeta                                    6.023                     -0.002325                      -0.01179                    -1.753e-06
+dz^2                        0                           1.957                      0.001158                      0.005774                    -3.913e-06
+dxz                        0                           1.097                        0.1724                        0.7275                      0.002311
+dyz                        0                          0.9509                        0.1759                        0.7475                      0.002269
+dx^2-y^2                        0                           1.947                      0.001654                      0.008245                    -4.075e-06
+dxy                        0                           1.113                        0.1714                        0.7227                      0.002304
+  sum over m                                         7.065                        0.5225                         2.212                      0.006876
+dz^2                        1                         0.03925                     -0.001062                     -0.005333                    -4.383e-06
+dxz                        1                         -0.0366                     -0.003947                      -0.01263                    -0.0001213
+dyz                        1                        -0.03157                     -0.005197                      -0.01856                    -0.0001267
+dx^2-y^2                        1                         0.04266                     -0.001394                     -0.007002                    -4.206e-06
+dxy                        1                        -0.03743                     -0.003854                      -0.01222                    -0.0001203
+  sum over m                                      -0.02369                      -0.01545                      -0.05575                    -0.0003768
+  sum over m+zeta                                    7.041                        0.5071                         2.156                      0.006499
+fz^3                        0                       -0.006614                     0.0007261                      0.003596                             0
+fxz^2                        0                       -0.001954                     0.0002565                      0.001276                             0
+fyz^2                        0                       -0.002684                     0.0002742                      0.001366                             0
+fzx^2-zy^2                        0                        9.09e-05                      1.99e-05                     8.018e-05                             0
+fxyz                        0                       2.062e-05                     4.102e-06                     1.816e-05                             0
+fx^3-3*xy^2                        0                       -0.003203                     0.0004291                       0.00213                             0
+f3yx^2-y^3                        0                       -0.003698                     0.0004635                      0.002271                             0
+  sum over m                                      -0.01804                      0.002174                       0.01074                             0
+  sum over m+zeta                                 -0.01804                      0.002174                       0.01074                             0
+Total Charge on atom:  Fe                  16
+Total Magnetism on atom:  Fe                   (0.5088, 2.154, -0.03769)
 
 
diff --git a/tests/integrate/204_NO_KP_NC_deltaspin/result.ref b/tests/integrate/204_NO_KP_NC_deltaspin/result.ref
index 8a17a1fada..5db4c4ed06 100644
--- a/tests/integrate/204_NO_KP_NC_deltaspin/result.ref
+++ b/tests/integrate/204_NO_KP_NC_deltaspin/result.ref
@@ -1,4 +1,5 @@
-etotref -6844.326716364628
-etotperatomref -3422.1633581823
+etotref -6844.685232778927
+etotperatomref -3422.3426163895
 Compare_mulliken_pass 0
-totaltimeref 36.59
+totaltimeref 28.5759
+14.63

From 92ef679df24c406d83f5130ea655560e058fb6b6 Mon Sep 17 00:00:00 2001
From: wqzhou <33364058+WHUweiqingzhou@users.noreply.github.com>
Date: Fri, 26 Apr 2024 12:24:41 +0800
Subject: [PATCH 06/18] Fix: add a check_rho() to avoid negative charge density
 (#4053)

* add a check_rho() to avoid negative rho_up/rho_dn

* add mocks in UTs

* change warningquit to warning about rho

* Update source/module_elecstate/module_charge/charge.cpp

Co-authored-by: kirk0830 <67682086+kirk0830@users.noreply.github.com>

* Update source/module_elecstate/module_charge/charge.cpp

Co-authored-by: kirk0830 <67682086+kirk0830@users.noreply.github.com>

---------

Co-authored-by: kirk0830 <67682086+kirk0830@users.noreply.github.com>
---
 source/module_elecstate/elecstate.cpp         |  1 +
 .../module_elecstate/module_charge/charge.cpp | 41 +++++++++++++++++--
 .../module_elecstate/module_charge/charge.h   |  6 ++-
 source/module_elecstate/test/charge_test.cpp  |  6 +--
 .../test/elecstate_base_test.cpp              |  3 ++
 .../test/elecstate_pw_test.cpp                |  3 ++
 6 files changed, 51 insertions(+), 9 deletions(-)

diff --git a/source/module_elecstate/elecstate.cpp b/source/module_elecstate/elecstate.cpp
index 393c2d07d5..6a8af42fb7 100644
--- a/source/module_elecstate/elecstate.cpp
+++ b/source/module_elecstate/elecstate.cpp
@@ -218,6 +218,7 @@ void ElecState::init_scf(const int istep, const ModuleBase::ComplexMatrix& struc
     if (istep == 0)
     {
         this->charge->init_rho(this->eferm, strucfac, this->bigpw->nbz, this->bigpw->bz);
+        this->charge->check_rho(); // check the rho
     }
 
     // renormalize the charge density
diff --git a/source/module_elecstate/module_charge/charge.cpp b/source/module_elecstate/module_charge/charge.cpp
index b1c7dee91d..e659c46213 100644
--- a/source/module_elecstate/module_charge/charge.cpp
+++ b/source/module_elecstate/module_charge/charge.cpp
@@ -700,7 +700,7 @@ void Charge::save_rho_before_sum_band(void)
     return;
 }
 
-double Charge::check_ne(const double* rho_in) const
+double Charge::cal_rho2ne(const double* rho_in) const
 {
     double ne = 0.0;
     for (int ir = 0; ir < this->rhopw->nrxx; ir++)
@@ -711,12 +711,45 @@ double Charge::check_ne(const double* rho_in) const
     Parallel_Reduce::reduce_pool(ne);
 #endif
     ne = ne * elecstate::get_ucell_omega() / (double)this->rhopw->nxyz;
-    std::cout << std::setprecision(10);
-    std::cout << " check the electrons number from rho, ne =" << ne << std::endl;
-    std::cout << std::setprecision(6);
+
     return ne;
 }
 
+void Charge::check_rho()
+{
+    if (this->nspin==1 || this->nspin==4)
+    {
+        double ne = 0.0;
+        ne = this->cal_rho2ne(rho[0]);
+        if (std::abs(ne - GlobalV::nelec) > 1.0e-6)
+        {
+            ModuleBase::WARNING("Charge", "Charge is not equal to the number of electrons!");
+        }
+    }
+    else if (this->nspin == 2)
+    {
+        // for spin up
+        double ne_up = 0.0;
+        ne_up = this->cal_rho2ne(rho[0]);
+        if (ne_up < 0.0)
+        {
+            ModuleBase::WARNING_QUIT("Charge", "Number of spin-down electrons set in starting magnetization exceeds all available.");
+        }
+        // for spin down
+        double ne_dn = 0.0;
+        ne_dn = this->cal_rho2ne(rho[1]);
+        if (ne_dn < 0.0)
+        {
+            ModuleBase::WARNING_QUIT("Charge", "Number of spin-up electrons set in starting magnetization exceeds all available.");
+        }
+        // for total charge
+        if (std::abs(ne_up + ne_dn - GlobalV::nelec) > 1.0e-6)
+        {
+            ModuleBase::WARNING("Charge", "Charge is not equal to the number of electrons!");
+        }
+    }
+}
+
 // LiuXh add 20180619
 void Charge::init_final_scf()
 {
diff --git a/source/module_elecstate/module_charge/charge.h b/source/module_elecstate/module_charge/charge.h
index 58bc2ef615..71fdc25120 100644
--- a/source/module_elecstate/module_charge/charge.h
+++ b/source/module_elecstate/module_charge/charge.h
@@ -98,9 +98,11 @@ class Charge
         double *rhocg
     ) const;
 
-	double check_ne(const double *rho_in) const;
+	double cal_rho2ne(const double *rho_in) const;
 
-    void init_final_scf(); //LiuXh add 20180619
+  void check_rho(); // to check whether the charge density is normal
+
+  void init_final_scf(); //LiuXh add 20180619
 
 	public:
     /**
diff --git a/source/module_elecstate/test/charge_test.cpp b/source/module_elecstate/test/charge_test.cpp
index 4c50265284..37cdd0ab05 100644
--- a/source/module_elecstate/test/charge_test.cpp
+++ b/source/module_elecstate/test/charge_test.cpp
@@ -64,7 +64,7 @@ void Set_GlobalV_Default()
  *     - calculate \sum_{is}^nspin \sum_{ir}^nrxx rho[is][ir]
  *   - RenormalizeRho: Charge::renormalize_rho()
  *     - renormalize rho so as to ensure the sum of rho equals to total number of electrons
- *   - CheckNe: Charge::check_ne()
+ *   - CheckNe: Charge::cal_rho2ne()
  *     - check the total number of electrons summed from rho[is]
  *   - SaveRhoBeforeSumBand: Charge::save_rho_before_sum_band()
  *     - meaning as the function name
@@ -185,7 +185,7 @@ TEST_F(ChargeTest, CheckNe)
     elecstate::tmp_ucell_omega = ucell->omega;
     charge->renormalize_rho();
     EXPECT_NEAR(charge->sum_rho(), 8.0, 1e-10);
-    EXPECT_NEAR(charge->check_ne(charge->rho[0]), 8.0, 1e-10);
+    EXPECT_NEAR(charge->cal_rho2ne(charge->rho[0]), 8.0, 1e-10);
 }
 
 TEST_F(ChargeTest, SaveRhoBeforeSumBand)
@@ -207,7 +207,7 @@ TEST_F(ChargeTest, SaveRhoBeforeSumBand)
     elecstate::tmp_ucell_omega = ucell->omega;
     charge->renormalize_rho();
     charge->save_rho_before_sum_band();
-    EXPECT_NEAR(charge->check_ne(charge->rho_save[0]), 8.0, 1e-10);
+    EXPECT_NEAR(charge->cal_rho2ne(charge->rho_save[0]), 8.0, 1e-10);
 }
 
 TEST_F(ChargeTest, InitFinalScf)
diff --git a/source/module_elecstate/test/elecstate_base_test.cpp b/source/module_elecstate/test/elecstate_base_test.cpp
index 2f5ee12e6d..33b831e6a9 100644
--- a/source/module_elecstate/test/elecstate_base_test.cpp
+++ b/source/module_elecstate/test/elecstate_base_test.cpp
@@ -74,6 +74,9 @@ void Charge::set_rhopw(ModulePW::PW_Basis*)
 void Charge::renormalize_rho()
 {
 }
+void Charge::check_rho()
+{
+}
 
 /************************************************
  *  unit test of elecstate.cpp
diff --git a/source/module_elecstate/test/elecstate_pw_test.cpp b/source/module_elecstate/test/elecstate_pw_test.cpp
index ea859072ad..b82dd3cef8 100644
--- a/source/module_elecstate/test/elecstate_pw_test.cpp
+++ b/source/module_elecstate/test/elecstate_pw_test.cpp
@@ -144,6 +144,9 @@ void Charge::set_rhopw(ModulePW::PW_Basis*)
 void Charge::renormalize_rho()
 {
 }
+void Charge::check_rho()
+{
+}
 
 void Set_GlobalV_Default()
 {

From 7f84a0996a90e439c95a34e560e5fe33b452ac6c Mon Sep 17 00:00:00 2001
From: Peng Xingliang <91927439+pxlxingliang@users.noreply.github.com>
Date: Fri, 26 Apr 2024 19:07:47 +0800
Subject: [PATCH 07/18] update version (#4061)

---
 source/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/version.h b/source/version.h
index db044fd958..3be59f6341 100644
--- a/source/version.h
+++ b/source/version.h
@@ -1,3 +1,3 @@
 #ifndef VERSION
-#define VERSION "v3.6.1"
+#define VERSION "v3.6.2"
 #endif

From f372175d267b92e99e20c668e50111c5acae4860 Mon Sep 17 00:00:00 2001
From: GRYS <57103981+grysgreat@users.noreply.github.com>
Date: Sat, 27 Apr 2024 21:07:24 +0800
Subject: [PATCH 08/18] doc: add docs about using MPI in docker. (#4060)

* add docs.

* Update docs/community/faq.md

Co-authored-by: Chun Cai <amoycaic@gmail.com>

* Update docs/community/faq.md

Co-authored-by: Chun Cai <amoycaic@gmail.com>

---------

Co-authored-by: Chun Cai <amoycaic@gmail.com>
---
 docs/community/faq.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/community/faq.md b/docs/community/faq.md
index e2a1cdee02..724519a35e 100644
--- a/docs/community/faq.md
+++ b/docs/community/faq.md
@@ -80,6 +80,8 @@ If the program prompt something like "KILLED BY SIGNAL: 9 (Killed)", it may be c
 
 If the error message is "Segmentation fault", or there is no enough information on the error, please feel free to submit an issue.
 
+**4. Error "Read -1" when using mpirun in docker environment**
+This is a [known issue](https://github.com/open-mpi/ompi/issues/4948) of OpenMPI running in a docker container. In this case, please set environment variable `OMPI_MCA_btl_vader_single_copy_mechanism=none`.
 ## Miscellaneous
 
 **1. How to visualize charge density file?**

From b74f2a0b67fb190bc8d35a5d88f3ece6cdec94f0 Mon Sep 17 00:00:00 2001
From: Wenfei Li <38569667+wenfei-li@users.noreply.github.com>
Date: Sun, 28 Apr 2024 10:39:53 +0800
Subject: [PATCH 09/18] Feature : descriptors for equivariant DeePKS (#3894)

* Feature : add interfaces for calculating equivariant pdm

* add priting of equiv descriptors

* modify calculation of new pdm

* modify calculation of new pdm (multi-k)

---------

Co-authored-by: wenfei-li <liwenfei@gmail.com>
---
 .../module_esolver/esolver_ks_lcao_elec.cpp   |   2 +-
 .../esolver_ks_lcao_tmpfunc.cpp               |   8 +-
 .../hamilt_lcaodft/FORCE_gamma.cpp            |   2 +-
 .../hamilt_lcaodft/FORCE_k.cpp                |   4 +-
 .../operator_lcao/deepks_lcao.cpp             |  14 +-
 .../module_deepks/LCAO_deepks.cpp             |  47 ++--
 .../module_deepks/LCAO_deepks.h               |  23 +-
 .../module_deepks/LCAO_deepks_interface.cpp   |   6 +-
 .../module_deepks/LCAO_deepks_io.cpp          |  36 ++-
 .../module_deepks/LCAO_deepks_pdm.cpp         | 211 +++++++++++++-----
 .../module_deepks/LCAO_deepks_torch.cpp       |  74 ++++--
 .../module_deepks/test/LCAO_deepks_test.cpp   |   6 +-
 12 files changed, 313 insertions(+), 120 deletions(-)

diff --git a/source/module_esolver/esolver_ks_lcao_elec.cpp b/source/module_esolver/esolver_ks_lcao_elec.cpp
index 5eeaaa2e63..35ee6df306 100644
--- a/source/module_esolver/esolver_ks_lcao_elec.cpp
+++ b/source/module_esolver/esolver_ks_lcao_elec.cpp
@@ -689,7 +689,7 @@ void ESolver_KS_LCAO<TK, TR>::nscf(void)
         const elecstate::DensityMatrix<TK, double>* dm
             = dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM();
         this->dpks_cal_projected_DM(dm);
-        GlobalC::ld.cal_descriptor(); // final descriptor
+        GlobalC::ld.cal_descriptor(GlobalC::ucell.nat); // final descriptor
         GlobalC::ld.cal_gedm(GlobalC::ucell.nat);
     }
 #endif
diff --git a/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp b/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp
index a8ffb911f7..95fd4312a0 100644
--- a/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp
+++ b/source/module_esolver/esolver_ks_lcao_tmpfunc.cpp
@@ -93,9 +93,7 @@ namespace ModuleESolver
         GlobalC::ld.cal_projected_DM_k(dm,
             GlobalC::ucell,
             GlobalC::ORB,
-            GlobalC::GridD,
-            this->kv.nks,
-            this->kv.kvec_d);
+            GlobalC::GridD);
     }
 
 
@@ -106,9 +104,7 @@ namespace ModuleESolver
         GlobalC::ld.cal_projected_DM_k(dm,
             GlobalC::ucell,
             GlobalC::ORB,
-            GlobalC::GridD,
-            this->kv.nks,
-            this->kv.kvec_d);
+            GlobalC::GridD);
     }
 #endif
 }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
index a6670fdc9a..0751a9e46f 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_gamma.cpp
@@ -73,7 +73,7 @@ void Force_LCAO_gamma::ftable_gamma(const bool isforce,
     {
         const std::vector<std::vector<double>>& dm_gamma = DM->get_DMK_vector();
         GlobalC::ld.cal_projected_DM(DM, GlobalC::ucell, GlobalC::ORB, GlobalC::GridD);
-        GlobalC::ld.cal_descriptor();
+        GlobalC::ld.cal_descriptor(GlobalC::ucell.nat);
         GlobalC::ld.cal_gedm(GlobalC::ucell.nat);
         GlobalC::ld.cal_f_delta_gamma(
 				dm_gamma, 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
index 2dea6d49f3..736813ccbf 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_k.cpp
@@ -124,8 +124,8 @@ void Force_LCAO_k::ftable_k(const bool isforce,
     if (GlobalV::deepks_scf)
     {
         const std::vector<std::vector<std::complex<double>>>& dm_k = DM->get_DMK_vector();
-        GlobalC::ld.cal_projected_DM_k(DM, GlobalC::ucell, GlobalC::ORB, GlobalC::GridD, kv.nks, kv.kvec_d);
-        GlobalC::ld.cal_descriptor();
+        GlobalC::ld.cal_projected_DM_k(DM, GlobalC::ucell, GlobalC::ORB, GlobalC::GridD);
+        GlobalC::ld.cal_descriptor(GlobalC::ucell.nat);
         GlobalC::ld.cal_gedm(GlobalC::ucell.nat);
 
         GlobalC::ld.cal_f_delta_k(dm_k,
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
index c5e3174539..b61a4b248c 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
@@ -156,7 +156,7 @@ void DeePKS<OperatorLCAO<double, double>>::contributeHR()
             *this->ucell,
             GlobalC::ORB,
             GlobalC::GridD);
-        GlobalC::ld.cal_descriptor();        
+        GlobalC::ld.cal_descriptor(this->ucell->nat);        
         GlobalC::ld.cal_gedm(this->ucell->nat);
         //GlobalC::ld.add_v_delta(*this->ucell,
         //    GlobalC::ORB,
@@ -189,10 +189,8 @@ void DeePKS<OperatorLCAO<std::complex<double>, double>>::contributeHR()
         GlobalC::ld.cal_projected_DM_k(this->DM,
             *this->ucell,
             GlobalC::ORB,
-            GlobalC::GridD,
-            this->nks,
-            this->kvec_d);
-        GlobalC::ld.cal_descriptor();
+            GlobalC::GridD);
+        GlobalC::ld.cal_descriptor(this->ucell->nat);
         // calculate dE/dD
         GlobalC::ld.cal_gedm(this->ucell->nat);
 
@@ -230,10 +228,8 @@ void DeePKS<OperatorLCAO<std::complex<double>, std::complex<double>>>::contribut
         GlobalC::ld.cal_projected_DM_k(this->DM,
             *this->ucell,
             GlobalC::ORB,
-            GlobalC::GridD,
-            this->nks,
-            this->kvec_d);
-        GlobalC::ld.cal_descriptor();
+            GlobalC::GridD);
+        GlobalC::ld.cal_descriptor(this->ucell->nat);
         // calculate dE/dD
         GlobalC::ld.cal_gedm(this->ucell->nat);
 
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
index 18192ab8a8..a29c0754fa 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.cpp
@@ -87,17 +87,36 @@ void LCAO_Deepks::init(
     assert(nm >= 0);
     assert(tot_inl_per_atom >= 0);
     
-    const int tot_inl = tot_inl_per_atom * nat;
+    int tot_inl = tot_inl_per_atom * nat;
+
+    if(if_equiv) tot_inl = nat;
 
     this->lmaxd = lm;
     this->nmaxd = nm;
-    this->inlmax = tot_inl;
+    
     GlobalV::ofs_running << " lmax of descriptor = " << this->lmaxd << std::endl;
     GlobalV::ofs_running << " nmax of descriptor= " << nmaxd << std::endl;
-	GlobalV::ofs_running << " total basis (all atoms) for descriptor= " << std::endl;
-    
-    //init pdm**
-    const int pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+
+    int pdm_size = 0;
+    this->inlmax = tot_inl;
+    if(!if_equiv)
+    {
+        GlobalV::ofs_running << " total basis (all atoms) for descriptor= " << std::endl;
+
+        //init pdm**
+        pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+    }
+    else
+    {
+        for(int il = 0; il < this->lmaxd + 1; il++)
+        {
+            pdm_size += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+        }
+        pdm_size = pdm_size * pdm_size;
+        this->des_per_atom=pdm_size;
+        GlobalV::ofs_running << " Equivariant version, size of pdm matrices : " << pdm_size << std::endl;
+    }
+
     this->pdm = new double* [this->inlmax];
     for (int inl = 0;inl < this->inlmax;inl++)
     {
@@ -106,15 +125,17 @@ void LCAO_Deepks::init(
     }
 
     // cal n(descriptor) per atom , related to Lmax, nchi(L) and m. (not total_nchi!)
-	this->des_per_atom=0; // mohan add 2021-04-21
-    for (int l = 0; l <= this->lmaxd; l++)
+    if(!if_equiv)
     {
-        this->des_per_atom += orb.Alpha[0].getNchi(l) * (2 * l + 1);
-    }
-
-    this->n_descriptor = nat * this->des_per_atom;
+        this->des_per_atom=0; // mohan add 2021-04-21
+        for (int l = 0; l <= this->lmaxd; l++)
+        {
+            this->des_per_atom += orb.Alpha[0].getNchi(l) * (2 * l + 1);
+        }
+        this->n_descriptor = nat * this->des_per_atom;
 
-    this->init_index(ntype, nat, na, tot_inl, orb);
+        this->init_index(ntype, nat, na, tot_inl, orb);
+    }
     this->allocate_nlm(nat);
 
     this->pv = &pv_in;
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
index dbf6ee26ad..1ec77ed412 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks.h
@@ -93,6 +93,8 @@ class LCAO_Deepks
     int nks_V_delta = 0;
 
     bool init_pdm = false; //for DeePKS NSCF calculation
+
+    bool if_equiv = false; //equivariant version
     
 	// deep neural network module that provides corrected Hamiltonian term and
 	// related derivatives.
@@ -106,7 +108,7 @@ class LCAO_Deepks
     std::vector<std::map<key_tuple,std::unordered_map<int,std::vector<std::vector<double>>>>> nlm_save_k;
 
     // projected density matrix
-	double** pdm;	//[tot_Inl][2l+1][2l+1]	caoyu modified 2021-05-07
+	double** pdm;	//[tot_Inl][2l+1][2l+1]	caoyu modified 2021-05-07; if equivariant version: [nat][nlm*nlm]
 	std::vector<torch::Tensor> pdm_tensor;
 
 	// descriptors
@@ -287,11 +289,20 @@ class LCAO_Deepks
         const elecstate::DensityMatrix<std::complex<double>, double>* dm,
         const UnitCell &ucell,
         const LCAO_Orbitals &orb,
-        Grid_Driver& GridD,
-        const int nks,
-        const std::vector<ModuleBase::Vector3<double>> &kvec_d);
+        Grid_Driver& GridD);
     void check_projected_dm(void);
 
+    void cal_projected_DM_equiv(
+        const elecstate::DensityMatrix<double, double>* dm,
+        const UnitCell &ucell,
+        const LCAO_Orbitals &orb,
+        Grid_Driver& GridD);
+    void cal_projected_DM_k_equiv(
+        const elecstate::DensityMatrix<std::complex<double>, double>* dm,
+        const UnitCell &ucell,
+        const LCAO_Orbitals &orb,
+        Grid_Driver& GridD);
+        
     //calculate the gradient of pdm with regard to atomic positions
     //d/dX D_{Inl,mm'}
     void cal_gdmx(//const ModuleBase::matrix& dm,
@@ -439,10 +450,12 @@ class LCAO_Deepks
 
     ///Calculates descriptors
     ///which are eigenvalues of pdm in blocks of I_n_l
-	void cal_descriptor(void);
+	void cal_descriptor(const int nat);
     ///print descriptors based on LCAO basis
     void check_descriptor(const UnitCell &ucell);
 
+    void cal_descriptor_equiv(const int nat);
+
     ///calculates gradient of descriptors w.r.t atomic positions
     ///----------------------------------------------------
     ///m, n: 2*l+1
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
index a6b14dcb58..6c0f72e242 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_interface.cpp
@@ -88,7 +88,7 @@ void LCAO_Deepks_Interface::out_deepks_labels(double etot,
         // this part is for integrated test of deepks
         ld->cal_projected_DM(dm, ucell, orb, GridD);
         ld->check_projected_dm(); // print out the projected dm for NSCF calculaiton
-        ld->cal_descriptor();     // final descriptor
+        ld->cal_descriptor(nat);     // final descriptor
         ld->check_descriptor(ucell);
 
         if (GlobalV::deepks_out_labels)
@@ -188,9 +188,9 @@ void LCAO_Deepks_Interface::out_deepks_labels(double etot,
     {
         // this part is for integrated test of deepks
         // so it is printed no matter even if deepks_out_labels is not used
-        ld->cal_projected_DM_k(dm, ucell, orb, GridD, nks, kvec_d);
+        ld->cal_projected_DM_k(dm, ucell, orb, GridD);
         ld->check_projected_dm(); // print out the projected dm for NSCF calculaiton
-        ld->cal_descriptor();     // final descriptor
+        ld->cal_descriptor(nat);     // final descriptor
         ld->check_descriptor(ucell);
 
         if (GlobalV::deepks_out_labels)
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
index 3d50ad646c..646e5c0a63 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_io.cpp
@@ -65,19 +65,39 @@ void LCAO_Deepks::save_npy_d(const int nat)
     ModuleBase::TITLE("LCAO_Deepks", "save_npy_d");
     if(GlobalV::MY_RANK!=0) return;
     //save descriptor in .npy format
-    vector<double> npy_des;
-    for (int inl = 0;inl < inlmax;++inl)
+    if(!if_equiv)
     {
-        int nm = 2*inl_l[inl] + 1;
-        for(int im=0;im<nm;im++)
+        vector<double> npy_des;
+        for (int inl = 0;inl < inlmax;++inl)
         {
-            npy_des.push_back(this->d_tensor[inl].index({im}).item().toDouble());
+            int nm = 2*inl_l[inl] + 1;
+            for(int im=0;im<nm;im++)
+            {
+                npy_des.push_back(this->d_tensor[inl].index({im}).item().toDouble());
+            }
+        }
+        const long unsigned dshape[] = {static_cast<unsigned long>(nat), static_cast<unsigned long>(this->des_per_atom)};
+        if (GlobalV::MY_RANK == 0)
+        {
+            npy::SaveArrayAsNumpy("dm_eig.npy", false, 2, dshape, npy_des);
         }
     }
-    const long unsigned dshape[] = {static_cast<unsigned long>(nat), static_cast<unsigned long>(this->des_per_atom)};
-    if (GlobalV::MY_RANK == 0)
+    else
     {
-        npy::SaveArrayAsNumpy("dm_eig.npy", false, 2, dshape, npy_des);
+        // a rather unnecessary way of writing this, but I'll do it for now
+        std::vector<double> npy_des;
+        for(int iat = 0; iat < nat; iat ++)
+        {
+            for(int i = 0; i < this->des_per_atom; i++)
+            {
+                npy_des.push_back(this->d_tensor[iat].index({i}).item().toDouble());
+            }
+        }
+        const long unsigned dshape[] = {static_cast<unsigned long>(nat), static_cast<unsigned long>(this->des_per_atom)};
+        if (GlobalV::MY_RANK == 0)
+        {
+            npy::SaveArrayAsNumpy("dm_eig.npy", false, 2, dshape, npy_des);
+        }        
     }
     return;
 }
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
index bf21360ebe..88a56582e6 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_pdm.cpp
@@ -34,7 +34,20 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<double, double
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_projected_DM");
 
-    const int pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+    int pdm_size;
+    if(!if_equiv)
+    {
+        pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+    }
+    else
+    {
+        int nproj = 0;
+        for(int il = 0; il < this->lmaxd + 1; il++)
+        {
+            nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+        }
+        pdm_size = nproj * nproj;
+    }
     if (GlobalV::init_chg == "file" && !this->init_pdm) //for DeePKS NSCF calculation 
     {
         std::ifstream ifs("pdm.dat");
@@ -80,23 +93,42 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<double, double
             //trace alpha orbital
             std::vector<int> trace_alpha_row;
             std::vector<int> trace_alpha_col;
-            int ib=0;
-            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+            if(!if_equiv)
             {
-                for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                int ib=0;
+                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
                 {
-                    const int inl = this->inl_index[T0](I0, L0, N0);
-                    const int nm = 2*L0+1;
-            
-                    for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
                     {
-                        for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                        const int inl = this->inl_index[T0](I0, L0, N0);
+                        const int nm = 2*L0+1;
+                
+                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                         {
-                            trace_alpha_row.push_back(ib+m1);
-                            trace_alpha_col.push_back(ib+m2);
+                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            {
+                                trace_alpha_row.push_back(ib+m1);
+                                trace_alpha_col.push_back(ib+m2);
+                            }
                         }
+                        ib+=nm;
+                    }
+                }
+            }
+            else
+            {
+                int nproj = 0;
+                for(int il = 0; il < this->lmaxd + 1; il++)
+                {
+                    nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+                }
+                for(int iproj = 0; iproj < nproj; iproj ++)
+                {
+                    for(int jproj = 0; jproj < nproj; jproj ++)
+                    {
+                        trace_alpha_row.push_back(iproj);
+                        trace_alpha_col.push_back(jproj);
                     }
-                    ib+=nm;
                 }
             }
             const int trace_alpha_size = trace_alpha_row.size();
@@ -199,25 +231,46 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<double, double
                         &row_size);
 				}//ad2
                 // do dot of g_1dmt and s_1t to get orbital_pdm_shell
-                int ib=0, index=0, inc=1;
-                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                if(!if_equiv)
                 {
-                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                    int ib=0, index=0, inc=1;
+                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
                     {
-                        const int inl = this->inl_index[T0](I0, L0, N0);
-                        const int nm = 2*L0+1;
-                
-                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                        for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
                         {
-                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            const int inl = this->inl_index[T0](I0, L0, N0);
+                            const int nm = 2*L0+1;
+                    
+                            for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                             {
-                                int ind = m1*nm + m2;
-                                pdm[inl][ind] += 
-                                    ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
-                                index++;
+                                for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                                {
+                                    int ind = m1*nm + m2;
+                                    pdm[inl][ind] += 
+                                        ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
+                                    index++;
+                                }
                             }
+                            ib+=nm;
+                        }
+                    }
+                }
+                else
+                {
+                    int index = 0, inc=1;
+                    int nproj = 0;
+                    for(int il = 0; il < this->lmaxd + 1; il++)
+                    {
+                        nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+                    }
+                    for(int iproj = 0; iproj < nproj; iproj ++)
+                    {
+                        for(int jproj = 0; jproj < nproj; jproj ++)
+                        {
+                            pdm[iat][iproj * nproj + jproj] += 
+                                ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
+                            index ++;
                         }
-                        ib+=nm;
                     }
                 }
 			}//ad1
@@ -234,11 +287,23 @@ void LCAO_Deepks::cal_projected_DM(const elecstate::DensityMatrix<double, double
 void LCAO_Deepks::cal_projected_DM_k(const elecstate::DensityMatrix<std::complex<double>, double>* dm,
     const UnitCell &ucell,
     const LCAO_Orbitals &orb,
-    Grid_Driver& GridD,
-    const int nks,
-    const std::vector<ModuleBase::Vector3<double>> &kvec_d)
+    Grid_Driver& GridD)
 {
-    const int pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+
+    int pdm_size;
+    if(!if_equiv)
+    {
+        pdm_size = (this->lmaxd * 2 + 1) * (this->lmaxd * 2 + 1);
+    }
+    else
+    {
+        int nproj = 0;
+        for(int il = 0; il < this->lmaxd + 1; il++)
+        {
+            nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+        }
+        pdm_size = nproj * nproj;        
+    }
 
     if (GlobalV::init_chg == "file" && !this->init_pdm) //for DeePKS NSCF calculation 
     {
@@ -285,25 +350,44 @@ void LCAO_Deepks::cal_projected_DM_k(const elecstate::DensityMatrix<std::complex
             //trace alpha orbital
             std::vector<int> trace_alpha_row;
             std::vector<int> trace_alpha_col;
-            int ib=0;
-            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+            if(!if_equiv)
             {
-                for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                int ib=0;
+                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
                 {
-                    const int inl = this->inl_index[T0](I0, L0, N0);
-                    const int nm = 2*L0+1;
-            
-                    for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
                     {
-                        for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                        const int inl = this->inl_index[T0](I0, L0, N0);
+                        const int nm = 2*L0+1;
+                
+                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                         {
-                            trace_alpha_row.push_back(ib+m1);
-                            trace_alpha_col.push_back(ib+m2);
+                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            {
+                                trace_alpha_row.push_back(ib+m1);
+                                trace_alpha_col.push_back(ib+m2);
+                            }
                         }
+                        ib+=nm;
                     }
-                    ib+=nm;
                 }
             }
+            else
+            {
+                int nproj = 0;
+                for(int il = 0; il < this->lmaxd + 1; il++)
+                {
+                    nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+                }
+                for(int iproj = 0; iproj < nproj; iproj ++)
+                {
+                    for(int jproj = 0; jproj < nproj; jproj ++)
+                    {
+                        trace_alpha_row.push_back(iproj);
+                        trace_alpha_col.push_back(jproj);
+                    }
+                }                
+            }
             const int trace_alpha_size = trace_alpha_row.size();
 
             for (int ad1=0; ad1<GridD.getAdjacentNum()+1 ; ++ad1)
@@ -411,26 +495,47 @@ void LCAO_Deepks::cal_projected_DM_k(const elecstate::DensityMatrix<std::complex
                         &row_size);
 				}//ad2
                 // do dot of g_1dmt and s_1t to get orbital_pdm_shell
-                int ib=0, index=0, inc=1;
-                for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+                if(!if_equiv)
                 {
-                    for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                    int ib=0, index=0, inc=1;
+                    for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
                     {
-                        const int inl = this->inl_index[T0](I0, L0, N0);
-                        const int nm = 2*L0+1;
-                
-                        for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                        for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
                         {
-                            for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                            const int inl = this->inl_index[T0](I0, L0, N0);
+                            const int nm = 2*L0+1;
+                    
+                            for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                             {
-                                int ind = m1*nm + m2;
-                                pdm[inl][ind] += 
-                                    ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
-                                index++;
+                                for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                                {
+                                    int ind = m1*nm + m2;
+                                    pdm[inl][ind] += 
+                                        ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
+                                    index++;
+                                }
                             }
+                            ib+=nm;
                         }
-                        ib+=nm;
                     }
+                }
+                else
+                {
+                    int index = 0, inc=1;
+                    int nproj = 0;
+                    for(int il = 0; il < this->lmaxd + 1; il++)
+                    {
+                        nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
+                    }
+                    for(int iproj = 0; iproj < nproj; iproj ++)
+                    {
+                        for(int jproj = 0; jproj < nproj; jproj ++)
+                        {
+                            pdm[iat][iproj * nproj + jproj] += 
+                                ddot_(&row_size, g_1dmt.data()+index*row_size, &inc, s_1t.data()+index*row_size, &inc);
+                            index ++;
+                        }
+                    }                    
                 }
 			}//ad1
         }//I0
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
index 57c7974e12..c7d81ac2d3 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_torch.cpp
@@ -30,12 +30,39 @@
 #include "module_base/libm/libm.h"
 #include "module_base/blas_connector.h"
 
+void LCAO_Deepks::cal_descriptor_equiv(const int nat)
+{
+    ModuleBase::TITLE("LCAO_Deepks", "cal_descriptor_equiv");
+    ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor_equiv");
+
+    // a rather unnecessary way of writing this, but I'll do it for now
+    if (!this->d_tensor.empty())
+    {
+        this->d_tensor.erase(this->d_tensor.begin(), this->d_tensor.end());
+    }
+
+    for(int iat = 0; iat < nat; iat++)
+    {
+        auto tmp = torch::zeros(des_per_atom,torch::kFloat64);
+        std::memcpy(tmp.data_ptr(),pdm[iat],sizeof(double)*tmp.numel());
+        this->d_tensor.push_back(tmp);
+    }
+
+    ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor_equiv");
+}
+
 //calculates descriptors from projected density matrices
-void LCAO_Deepks::cal_descriptor(void)
+void LCAO_Deepks::cal_descriptor(const int nat)
 {
     ModuleBase::TITLE("LCAO_Deepks", "cal_descriptor");
     ModuleBase::timer::tick("LCAO_Deepks", "cal_descriptor");
 
+    if(if_equiv)
+    {
+        this->cal_descriptor_equiv(nat);
+        return;
+    }
+
     //init pdm_tensor and d_tensor
     torch::Tensor tmp;
 
@@ -85,24 +112,41 @@ void LCAO_Deepks::check_descriptor(const UnitCell &ucell)
     if(GlobalV::MY_RANK!=0) return;
     std::ofstream ofs("descriptor.dat");
     ofs<<std::setprecision(10);
-    for (int it = 0; it < ucell.ntype; it++)
+    if(!if_equiv)
     {
-        for (int ia = 0; ia < ucell.atoms[it].na; ia++)
+        for (int it = 0; it < ucell.ntype; it++)
         {
-            int iat=ucell.itia2iat(it,ia);
-            ofs << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << std::endl;
-            int id = 0;
-            for(int inl=0;inl<inlmax/ucell.nat;inl++)
+            for (int ia = 0; ia < ucell.atoms[it].na; ia++)
             {
-                int nm = 2*inl_l[inl]+1;
-                for(int im=0;im<nm;im++)
+                int iat=ucell.itia2iat(it,ia);
+                ofs << ucell.atoms[it].label << " atom_index " << ia + 1 << " n_descriptor " << this->des_per_atom << std::endl;
+                int id = 0;
+                for(int inl=0;inl<inlmax/ucell.nat;inl++)
                 {
-                    const int ind=iat*inlmax/ucell.nat+inl;
-                    ofs << d_tensor[ind].index({im}).item().toDouble() << " ";
-                    if (id % 8 == 7) ofs << std::endl;
-                    id++;
-                }
-            }   
+                    int nm = 2*inl_l[inl]+1;
+                    for(int im=0;im<nm;im++)
+                    {
+                        const int ind=iat*inlmax/ucell.nat+inl;
+                        ofs << d_tensor[ind].index({im}).item().toDouble() << " ";
+                        if (id % 8 == 7) ofs << std::endl;
+                        id++;
+                    }
+                }   
+                ofs << std::endl << std::endl;
+            }
+        }
+    }
+    else
+    {
+        for(int iat = 0; iat < ucell.nat; iat ++)
+        {
+            int it = ucell.iat2it[iat];
+            ofs << ucell.atoms[it].label << " atom_index " << iat + 1 << " n_descriptor " << this->des_per_atom << std::endl;
+            for(int i = 0; i < this->des_per_atom; i ++)
+            {
+                ofs << this->pdm[iat][i] << " ";
+                if (i % 8 == 7) ofs << std::endl;
+            }
             ofs << std::endl << std::endl;
         }
     }
diff --git a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
index adb73fc568..f42ce77fc5 100644
--- a/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
+++ b/source/module_hamilt_lcao/module_deepks/test/LCAO_deepks_test.cpp
@@ -129,9 +129,7 @@ void test_deepks::check_pdm(void)
 		this->ld.cal_projected_DM_k(dm_k_new,
 			ucell,
 			ORB,
-            Test_Deepks::GridD,
-			kv.nkstot,
-			kv.kvec_d);		
+            Test_Deepks::GridD);		
 	}
 	this->ld.check_projected_dm();
 	this->compare_with_ref("pdm.dat","pdm_ref.dat");
@@ -187,7 +185,7 @@ void test_deepks::check_gdmx(void)
 
 void test_deepks::check_descriptor(void)
 {
-	this->ld.cal_descriptor();
+	this->ld.cal_descriptor(ucell.nat);
 	this->ld.check_descriptor(ucell);
 	this->compare_with_ref("descriptor.dat","descriptor_ref.dat");
 }

From c36e5170aa38537ea4b3d680f643502f0441df15 Mon Sep 17 00:00:00 2001
From: Wenfei Li <38569667+wenfei-li@users.noreply.github.com>
Date: Mon, 29 Apr 2024 14:14:01 +0800
Subject: [PATCH 10/18] Feature : reading and writing of h(r) and dm(r) in npz
 format (#4066)

* Feature : reading and writing of h(r) and dm(r) in npz format

* fix bug

* udpate makefile

* fix makefile and cmake

---------

Co-authored-by: wenfei-li <liwenfei@gmail.com>
---
 CMakeLists.txt                                |  22 +
 docs/advanced/input_files/input-main.md       |  14 +
 source/Makefile.Objects                       |   1 +
 source/module_base/global_variable.cpp        |   3 +
 source/module_base/global_variable.h          |   3 +
 source/module_elecstate/elecstate_lcao.cpp    |   7 +
 source/module_esolver/CMakeLists.txt          |   1 +
 source/module_esolver/esolver_ks.cpp          |  16 +
 source/module_esolver/esolver_ks_lcao.cpp     |  32 ++
 source/module_esolver/esolver_ks_lcao.h       |   3 +
 .../module_esolver/esolver_ks_lcao_elec.cpp   |  23 +
 source/module_esolver/io_npz.cpp              | 499 ++++++++++++++++++
 source/module_io/input.cpp                    |  32 +-
 source/module_io/input.h                      |   3 +
 source/module_io/input_conv.cpp               |   3 +
 source/module_io/parameter_pool.cpp           |  12 +
 source/module_io/test/input_conv_test.cpp     |   3 +
 source/module_io/test/write_input_test.cpp    |   3 +
 source/module_io/write_input.cpp              |   3 +
 19 files changed, 681 insertions(+), 2 deletions(-)
 create mode 100644 source/module_esolver/io_npz.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 44fcd8f094..6a820b8b63 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,7 @@ option(COMMIT_INFO "Print commit information in log" ON)
 option(ENABLE_FFT_TWO_CENTER "Enable FFT-based two-center integral method." ON)
 option(ENABLE_GOOGLEBENCH "Enable GOOGLE-benchmark usage." OFF)
 option(ENABLE_RAPIDJSON "Enable rapid-json usage." OFF)
+option(ENABLE_CNPY "Enable cnpy usage." OFF)
 
 # enable json support
 if(ENABLE_RAPIDJSON)
@@ -467,6 +468,27 @@ if(ENABLE_DEEPKS)
   add_compile_definitions(__DEEPKS)
 endif()
 
+if (ENABLE_CNPY)
+  find_path(cnpy_SOURCE_DIR
+    cnpy.h
+    HINTS ${libnpy_INCLUDE_DIR}
+  )
+  if(NOT cnpy_SOURCE_DIR)
+    include(FetchContent)
+    FetchContent_Declare(
+      cnpy
+      GIT_REPOSITORY https://github.com/rogersce/cnpy.git
+      GIT_PROGRESS TRUE
+    )
+    FetchContent_MakeAvailable(cnpy)
+  else()
+    include_directories(${cnpy_INCLUDE_DIR})
+  endif()
+  include_directories(${cnpy_SOURCE_DIR})
+  target_link_libraries(${ABACUS_BIN_NAME} cnpy)
+  add_compile_definitions(__USECNPY)
+endif()
+
 function(git_submodule_update)
   if(GIT_SUBMOD_RESULT EQUAL "0")
     message(DEBUG "Submodule init'ed")
diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index 16bebcab9d..f1345f04c4 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -1607,6 +1607,20 @@ These variables are used to control the output of properties.
 - **Description**: Whether to print the upper triangular part of the exchange-correlation matrices in **Kohn-Sham orbital representation** (unit: Ry): $\braket{\psi_i|V_\text{xc}^\text{(semi-)local}+V_\text{exx}+V_\text{DFTU}|\psi_j}$ for each k point into files in the directory `OUT.${suffix}`, which is useful for the subsequent GW calculation. (Note that currently DeePKS term is not included. ) The files are named `k-$k-Vxc`, the meaning of `$k`corresponding to k point and spin  is same as [hs_matrix.md](../elec_properties/hs_matrix.md#out_mat_hs).
 - **Default**: False
 
+### out_hr_npz/out_dm_npz
+
+- **Type**: Boolean
+- **Availability**: Numerical atomic orbital basis
+- **Description**: Whether to print Hamiltonian matrices H(R)/density matrics DM(R) in npz format. This feature does not work for gamma-only calculations. Currently only intended for internal usage.
+- **Default**: False
+
+### dm_to_rho
+
+- **Type**: Boolean
+- **Availability**: Numerical atomic orbital basis
+- **Description**: Reads density matrix DM(R) in npz format and creates electron density on grids. This feature does not work for gamma-only calculations. Only supports serial calculations. Currently only intended for internal usage.
+- **Default**: False
+
 ### out_app_flag
 
 - **Type**: Boolean
diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index 5416eb69e7..f61b7cced1 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -222,6 +222,7 @@ OBJS_ESOLVER_LCAO=esolver_ks_lcao.o\
       esolver_ks_lcao_elec.o\
       esolver_ks_lcao_tddft.o\
       esolver_ks_lcao_tmpfunc.o\
+      io_npz.o\
 
 OBJS_GINT=gint.o\
       gint_gamma.o\
diff --git a/source/module_base/global_variable.cpp b/source/module_base/global_variable.cpp
index b9ceb358da..84ae03d359 100644
--- a/source/module_base/global_variable.cpp
+++ b/source/module_base/global_variable.cpp
@@ -283,6 +283,9 @@ bool out_bandgap = false; // QO added for bandgap printing
 int out_interval = 1;    // convert from out_hsR_interval liuyu 2023-04-18
 
 bool out_mat_xc = false; // output Vxc in KS-wfc representation for GW calculation
+bool out_hr_npz = false;
+bool out_dm_npz = false;
+bool dm_to_rho = false; // reads dm in npz format, then prints density in cube format
 
 //==========================================================
 // Deltaspin related
diff --git a/source/module_base/global_variable.h b/source/module_base/global_variable.h
index a504307ce2..bf181c84d9 100644
--- a/source/module_base/global_variable.h
+++ b/source/module_base/global_variable.h
@@ -313,6 +313,9 @@ extern bool out_bandgap;
 extern int out_interval;
 
 extern bool out_mat_xc; // output Vxc in KS-wfc representation for GW calculation
+extern bool out_hr_npz; //writes h(r) in npz format
+extern bool out_dm_npz; //writes dm(r) in npz format
+extern bool dm_to_rho; //reads in dm(r) and creates density
 
 // Deltaspin related
 extern bool sc_mag_switch; // 0: no deltaspin; 1: constrain atomic magnetic moments;
diff --git a/source/module_elecstate/elecstate_lcao.cpp b/source/module_elecstate/elecstate_lcao.cpp
index a70153fe68..4fdbe229fb 100644
--- a/source/module_elecstate/elecstate_lcao.cpp
+++ b/source/module_elecstate/elecstate_lcao.cpp
@@ -93,6 +93,12 @@ void ElecStateLCAO<std::complex<double>>::psiToRho(const psi::Psi<std::complex<d
     ModuleBase::timer::tick("ElecStateLCAO", "psiToRho");
 
     this->calculate_weights();
+
+// the calculations of dm, and dm -> rho are, technically, two separate functionalities, as we cannot
+// rule out the possibility that we may have a dm from other sources, such as read from file.
+// However, since we are not separating them now, I opt to add a flag to control how dm is obtained as of now
+if(!GlobalV::dm_to_rho)
+{
     this->calEBand();
 
     ModuleBase::GlobalFunc::NOTE("Calculate the density matrix.");
@@ -130,6 +136,7 @@ void ElecStateLCAO<std::complex<double>>::psiToRho(const psi::Psi<std::complex<d
             this->print_psi(psi);
         }
     }
+}
     // old 2D-to-Grid conversion has been replaced by new Gint Refactor 2023/09/25
     //this->loc->cal_dk_k(*this->lowf->gridt, this->wg, (*this->klist));
     for (int is = 0; is < GlobalV::NSPIN; is++)
diff --git a/source/module_esolver/CMakeLists.txt b/source/module_esolver/CMakeLists.txt
index 978160295e..44fe0eabe1 100644
--- a/source/module_esolver/CMakeLists.txt
+++ b/source/module_esolver/CMakeLists.txt
@@ -18,6 +18,7 @@ if(ENABLE_LCAO)
       esolver_ks_lcao_elec.cpp
       esolver_ks_lcao_tddft.cpp
       esolver_ks_lcao_tmpfunc.cpp
+      io_npz.cpp
   )
 endif()
 
diff --git a/source/module_esolver/esolver_ks.cpp b/source/module_esolver/esolver_ks.cpp
index 59db2849fd..e69e61c296 100644
--- a/source/module_esolver/esolver_ks.cpp
+++ b/source/module_esolver/esolver_ks.cpp
@@ -390,6 +390,7 @@ void ESolver_KS<T, Device>::run(const int istep, UnitCell& ucell)
 		ModuleBase::timer::tick(this->classname, "run");
 
 		this->before_scf(istep); //Something else to do before the iter loop
+		if(GlobalV::dm_to_rho) return; //nothing further is needed
 
 		ModuleBase::GlobalFunc::DONE(GlobalV::ofs_running, "INIT SCF");
 
@@ -631,6 +632,21 @@ ModuleIO::Output_Rho ESolver_KS<T, Device>::create_Output_Rho(
 {
 	const int precision = 3;
 	std::string tag = "CHG";
+	if(GlobalV::dm_to_rho)
+	{
+		return ModuleIO::Output_Rho(this->pw_big,
+								this->pw_rhod,
+								is,
+								GlobalV::NSPIN,
+								pelec->charge->rho[is],
+								iter,
+								this->pelec->eferm.get_efval(is),
+								&(GlobalC::ucell),
+								GlobalV::global_out_dir,
+								precision,
+								tag,
+								prefix);
+	}
 	return ModuleIO::Output_Rho(this->pw_big,
 			this->pw_rhod,
 			is,
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index 949e6c4ba7..80c2d94c79 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -1116,6 +1116,38 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(const int istep)
     }
 #endif
 
+    if(GlobalV::out_hr_npz)
+    {
+        this->p_hamilt->updateHk(0); // first k point, up spin
+        hamilt::HamiltLCAO<std::complex<double>, double>* p_ham_lcao 
+            = dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt);
+        std::string zipname = "output_HR0.npz";
+        this->output_mat_npz(zipname,*(p_ham_lcao->getHR()));
+
+        if(GlobalV::NSPIN==2)
+        {
+            this->p_hamilt->updateHk(this->kv.nks/2); // the other half of k points, down spin
+            hamilt::HamiltLCAO<std::complex<double>, double>* p_ham_lcao 
+                = dynamic_cast<hamilt::HamiltLCAO<std::complex<double>, double>*>(this->p_hamilt);
+            zipname = "output_HR1.npz";
+            this->output_mat_npz(zipname,*(p_ham_lcao->getHR()));            
+        }
+    }
+
+    if(GlobalV::out_dm_npz)
+    {
+        const elecstate::DensityMatrix<TK, double>* dm
+            = dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM();
+        std::string zipname = "output_DM0.npz";
+        this->output_mat_npz(zipname,*(dm->get_DMR_pointer(1)));
+
+        if(GlobalV::NSPIN==2)
+        {
+            zipname = "output_DM1.npz";
+            this->output_mat_npz(zipname,*(dm->get_DMR_pointer(2)));       
+        }
+    }
+
     if (!md_skip_out(GlobalV::CALCULATION, istep, GlobalV::out_interval))
     {
         this->create_Output_Mat_Sparse(istep).write();
diff --git a/source/module_esolver/esolver_ks_lcao.h b/source/module_esolver/esolver_ks_lcao.h
index f579579cea..9886081e0f 100644
--- a/source/module_esolver/esolver_ks_lcao.h
+++ b/source/module_esolver/esolver_ks_lcao.h
@@ -115,6 +115,9 @@ namespace ModuleESolver
         /// @brief create ModuleIO::Output_Mat_Sparse object to output sparse density matrix of H, S, T, r
         ModuleIO::Output_Mat_Sparse<TK> create_Output_Mat_Sparse(int istep);
 
+        void read_mat_npz(std::string& zipname, hamilt::HContainer<double>& hR);
+        void output_mat_npz(std::string& zipname, const hamilt::HContainer<double>& hR);
+
         /// @brief check if skip the corresponding output in md calculation
         bool md_skip_out(std::string calculation, int istep, int interval);
 
diff --git a/source/module_esolver/esolver_ks_lcao_elec.cpp b/source/module_esolver/esolver_ks_lcao_elec.cpp
index 35ee6df306..3646858a49 100644
--- a/source/module_esolver/esolver_ks_lcao_elec.cpp
+++ b/source/module_esolver/esolver_ks_lcao_elec.cpp
@@ -329,6 +329,29 @@ void ESolver_KS_LCAO<TK, TR>::before_scf(int istep)
         ->get_DM()
         ->init_DMR(*(dynamic_cast<hamilt::HamiltLCAO<TK, TR>*>(this->p_hamilt)->getHR()));
 
+    if(GlobalV::dm_to_rho)
+    {
+        std::string zipname = "output_DM0.npz";
+        elecstate::DensityMatrix<TK, double>* dm
+            = dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM();
+        this->read_mat_npz(zipname,*(dm->get_DMR_pointer(1)));
+        if(GlobalV::NSPIN == 2)
+        {
+            zipname = "output_DM1.npz";
+            this->read_mat_npz(zipname,*(dm->get_DMR_pointer(2)));
+        }
+
+        this->pelec->psiToRho(*this->psi);
+
+        this->create_Output_Rho(0, istep).write();
+        if(GlobalV::NSPIN == 2)
+        {
+            this->create_Output_Rho(1, istep).write();
+        }
+
+        return;
+    }
+
     // the electron charge density should be symmetrized,
     // here is the initialization
     Symmetry_rho srho;
diff --git a/source/module_esolver/io_npz.cpp b/source/module_esolver/io_npz.cpp
new file mode 100644
index 0000000000..3415ae3004
--- /dev/null
+++ b/source/module_esolver/io_npz.cpp
@@ -0,0 +1,499 @@
+//Deals with io of dm(r)/h(r) in npz format
+
+#include "module_esolver/esolver_ks_lcao.h"
+
+#include "module_base/parallel_reduce.h"
+#include "module_cell/module_neighbor/sltk_atom_arrange.h"
+#include "module_cell/module_neighbor/sltk_grid_driver.h"
+#include "module_hamilt_general/module_xc/xc_functional.h"
+#include "module_hamilt_lcao/module_dftu/dftu.h"
+#include "module_hamilt_pw/hamilt_pwdft/global.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h"
+#ifdef __DEEPKS
+#include "module_hamilt_lcao/module_deepks/LCAO_deepks.h"	//caoyu add 2021-07-26
+#endif
+#include "module_base/timer.h"
+
+#ifdef __MPI
+#include <mpi.h>
+#include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
+#endif
+
+#ifdef __USECNPY
+#include "cnpy.h"
+#endif
+
+#include "module_base/element_name.h"
+
+namespace ModuleESolver
+{
+
+template <typename TK, typename TR>
+void ESolver_KS_LCAO<TK, TR>::read_mat_npz(std::string& zipname, hamilt::HContainer<double>& hR)
+{
+    ModuleBase::TITLE("LCAO_Hamilt","read_mat_npz");
+
+    const Parallel_Orbitals* paraV = this->LOWF.ParaV;
+
+#ifdef __USECNPY
+
+#ifdef __MPI
+
+    if(GlobalV::NPROC!=1)
+    {
+        std::cout << "read_mat_npz is not supported in NPI mode yet" << std::endl;
+        return;
+    }
+
+/*
+    hamilt::HContainer<double>* HR_serial;
+    Parallel_Orbitals serialV;
+    serialV.set_proc_dim(1,0);
+    serialV.mpi_create_cart(MPI_COMM_WORLD);
+    serialV.set_local2global(GlobalV::NLOCAL, GlobalV::NLOCAL, GlobalV::ofs_running, GlobalV::ofs_warning);
+    serialV.set_global2local(GlobalV::NLOCAL, GlobalV::NLOCAL, false, GlobalV::ofs_running);
+    serialV.set_atomic_trace(GlobalC::ucell.get_iat2iwt(), GlobalC::ucell.nat, GlobalV::NLOCAL);
+*/
+    if(GlobalV::MY_RANK == 0)
+    {
+        //HR_serial = new hamilt::HContainer<double>(&serialV);
+
+        cnpy::npz_t my_npz = cnpy::npz_load(zipname);
+        std::vector<std::string> fnames;
+
+        //check consistency
+        // 1. lattice vectors
+        double* lattice_vector = my_npz["lattice_vectors"].data<double>();
+        assert(std::abs(lattice_vector[0] - GlobalC::ucell.lat0 * GlobalC::ucell.a1.x) < 1e-6);
+        assert(std::abs(lattice_vector[1] - GlobalC::ucell.lat0 * GlobalC::ucell.a1.y) < 1e-6);
+        assert(std::abs(lattice_vector[2] - GlobalC::ucell.lat0 * GlobalC::ucell.a1.z) < 1e-6);
+        assert(std::abs(lattice_vector[3] - GlobalC::ucell.lat0 * GlobalC::ucell.a2.x) < 1e-6);
+        assert(std::abs(lattice_vector[4] - GlobalC::ucell.lat0 * GlobalC::ucell.a2.y) < 1e-6);
+        assert(std::abs(lattice_vector[5] - GlobalC::ucell.lat0 * GlobalC::ucell.a2.z) < 1e-6);
+        assert(std::abs(lattice_vector[6] - GlobalC::ucell.lat0 * GlobalC::ucell.a3.x) < 1e-6);
+        assert(std::abs(lattice_vector[7] - GlobalC::ucell.lat0 * GlobalC::ucell.a3.y) < 1e-6);
+        assert(std::abs(lattice_vector[8] - GlobalC::ucell.lat0 * GlobalC::ucell.a3.z) < 1e-6);
+
+        // 2. atoms
+        double* atom_info = my_npz["atom_info"].data<double>();
+        for(int iat = 0; iat < GlobalC::ucell.nat; ++iat)
+        {
+            const int it = GlobalC::ucell.iat2it[iat];
+            const int ia = GlobalC::ucell.iat2ia[iat];
+
+            //get atomic number (copied from write_cube.cpp)
+            std::string element = "";
+            element = GlobalC::ucell.atoms[it].label;
+			std::string::iterator temp = element.begin();
+			while (temp != element.end())
+			{
+				if ((*temp >= '1') && (*temp <= '9'))
+				{
+					temp = element.erase(temp);
+				}
+				else
+				{
+					temp++;
+				}
+			}
+            int z = 0;
+            for(int j=0; j!=ModuleBase::element_name.size(); j++)
+            {
+                if (element == ModuleBase::element_name[j])
+                {
+                    z=j+1;
+                    break;
+                }
+            }
+
+            assert(atom_info[iat*5] == it);
+            assert(atom_info[iat*5+1] == z);
+            //I will not be checking the coordinates for now in case the direct coordinates provided in the 
+            //npz file do not fall in the range [0,1); if a protocol is to be set in the future such that
+            //this could be guaranteed, then the following lines could be uncommented
+            //assert(std::abs(atom_info[iat*5+2] - GlobalC::ucell.atoms[it].taud[ia].x) < 1e-6);
+            //assert(std::abs(atom_info[iat*5+3] - GlobalC::ucell.atoms[it].taud[ia].y) < 1e-6);
+            //assert(std::abs(atom_info[iat*5+4] - GlobalC::ucell.atoms[it].taud[ia].z) < 1e-6);            
+        }
+
+        // 3. orbitals
+        for(int it = 0; it < GlobalC::ucell.ntype; ++it)
+        {
+            std::string filename="orbital_info_"+std::to_string(it);
+            double* orbital_info = my_npz[filename].data<double>();
+            for(int iw = 0; iw < GlobalC::ucell.atoms[it].nw; ++iw)
+            {
+                assert(orbital_info[iw*3] == GlobalC::ucell.atoms[it].iw2n[iw]);
+                assert(orbital_info[iw*3+1] == GlobalC::ucell.atoms[it].iw2l[iw]);
+                const int im = GlobalC::ucell.atoms[it].iw2m[iw];
+                const int m = (im % 2 == 0) ? -im/2 : (im+1)/2; // copied from LCAO_gen_fixedH.cpp
+                assert(orbital_info[iw*3+2] == m);
+            }
+        }
+
+        //starts reading the matrix
+        for(auto const& imap: my_npz)
+            fnames.push_back(imap.first);
+
+        for(int i = 0; i < fnames.size(); i ++)
+        {
+            if(fnames[i].find("mat_") == std::string::npos) continue;
+
+            std::vector<std::string> tokens;
+            std::string token;
+            std::stringstream fname_tmp(fnames[i]);
+            char delimiter = '_'; 
+        
+            while (std::getline(fname_tmp, token, delimiter)) { 
+                tokens.push_back(token); 
+            }
+            
+            cnpy::NpyArray arr = my_npz[fnames[i]];
+
+            int iat1 = std::stoi(tokens[1]);
+            int iat2 = std::stoi(tokens[2]);
+            int Rx = std::stoi(tokens[3]);
+            int Ry = std::stoi(tokens[4]);
+            int Rz = std::stoi(tokens[5]);
+
+            int it1 = GlobalC::ucell.iat2it[iat1];
+            int it2 = GlobalC::ucell.iat2it[iat2];
+
+            assert(arr.shape[0] == GlobalC::ucell.atoms[it1].nw);
+            assert(arr.shape[1] == GlobalC::ucell.atoms[it2].nw);
+            
+            //hamilt::AtomPair<double> tmp(iat1,iat2,Rx,Ry,Rz,&serialV);
+            //HR_serial->insert_pair(tmp);
+            hamilt::AtomPair<double> tmp(iat1,iat2,Rx,Ry,Rz,paraV);
+            hR.insert_pair(tmp);
+
+            // use symmetry : H_{mu,nu,R} = H_{nu,mu,-R}
+            if(Rx!=0 || Ry!=0 || Rz!=0)
+            {
+                //hamilt::AtomPair<double> tmp(iat2,iat1,-Rx,-Ry,-Rz,&serialV);
+                //HR_serial->insert_pair(tmp);
+                hamilt::AtomPair<double> tmp(iat2,iat1,-Rx,-Ry,-Rz,paraV);
+                hR.insert_pair(tmp);
+            }
+        }
+
+        //HR_serial->allocate();
+        hR.allocate();
+
+        for(int i = 0; i < fnames.size(); i ++)
+        {
+            if(fnames[i].find("mat_") == std::string::npos) continue;
+
+            std::vector<std::string> tokens;
+            std::string token;
+            std::stringstream fname_tmp(fnames[i]);
+            char delimiter = '_'; 
+        
+            while (std::getline(fname_tmp, token, delimiter)) { 
+                tokens.push_back(token); 
+            }
+            
+            cnpy::NpyArray arr = my_npz[fnames[i]];
+
+            int iat1 = std::stoi(tokens[1]);
+            int iat2 = std::stoi(tokens[2]);
+            int Rx = std::stoi(tokens[3]);
+            int Ry = std::stoi(tokens[4]);
+            int Rz = std::stoi(tokens[5]);
+
+            int it1 = GlobalC::ucell.iat2it[iat1];
+            int it2 = GlobalC::ucell.iat2it[iat2];
+
+            assert(arr.shape[0] == GlobalC::ucell.atoms[it1].nw);
+            assert(arr.shape[1] == GlobalC::ucell.atoms[it2].nw);
+
+            double* submat_read = arr.data<double>();
+            
+            //hamilt::BaseMatrix<double>* submat = HR_serial->find_matrix(iat1,iat2,Rx,Ry,Rz);
+            hamilt::BaseMatrix<double>* submat = hR.find_matrix(iat1,iat2,Rx,Ry,Rz);
+
+            for(int i = 0; i < arr.shape[0]; i ++)
+            {
+                for(int j = 0; j < arr.shape[1]; j ++)
+                {
+                    submat->add_element(i,j,submat_read[i*arr.shape[1]+j]);
+                }
+            }
+            
+            // use symmetry : H_{mu,nu,R} = H_{nu,mu,-R}
+            if(Rx!=0 || Ry!=0 || Rz!=0)
+            {
+                //hamilt::BaseMatrix<double>* submat = HR_serial->find_matrix(iat2,iat1,-Rx,-Ry,-Rz);
+                hamilt::BaseMatrix<double>* submat = hR.find_matrix(iat2,iat1,-Rx,-Ry,-Rz);
+                for(int i = 0; i < arr.shape[0]; i ++)
+                {
+                    for(int j = 0; j < arr.shape[1]; j ++)
+                    {
+                        submat->add_element(j,i,submat_read[i*arr.shape[1]+j]);
+                    }
+                }
+            }
+        }
+
+    }
+#else
+
+        cnpy::npz_t my_npz = cnpy::npz_load(zipname);
+        std::vector<std::string> fnames;
+
+        for(auto const& imap: my_npz)
+            fnames.push_back(imap.first);
+
+        for(int i = 0; i < fnames.size(); i ++)
+        {
+            if(fnames[i].find("mat_") == std::string::npos) continue;
+
+            std::vector<std::string> tokens;
+            std::string token;
+            std::stringstream fname_tmp(fnames[i]);
+            char delimiter = '_'; 
+        
+            while (std::getline(fname_tmp, token, delimiter)) { 
+                tokens.push_back(token); 
+            }
+            
+            cnpy::NpyArray arr = my_npz[fnames[i]];
+
+            int iat1 = std::stoi(tokens[1]);
+            int iat2 = std::stoi(tokens[2]);
+            int Rx = std::stoi(tokens[3]);
+            int Ry = std::stoi(tokens[4]);
+            int Rz = std::stoi(tokens[5]);
+
+            int it1 = GlobalC::ucell.iat2it[iat1];
+            int it2 = GlobalC::ucell.iat2it[iat2];
+
+            assert(arr.shape[0] == GlobalC::ucell.atoms[it1].nw);
+            assert(arr.shape[1] == GlobalC::ucell.atoms[it2].nw);
+            
+            hamilt::AtomPair<double> tmp(iat1,iat2,Rx,Ry,Rz,paraV);
+            hR->insert_pair(tmp);
+            // use symmetry : H_{mu,nu,R} = H_{nu,mu,-R}
+            if(Rx!=0 || Ry!=0 || Rz!=0)
+            {
+                hamilt::AtomPair<double> tmp(iat2,iat1,-Rx,-Ry,-Rz,paraV);
+                hR->insert_pair(tmp);
+            }
+        }
+
+        hR->allocate();
+
+        for(int i = 0; i < fnames.size(); i ++)
+        {
+            if(fnames[i].find("mat_") == std::string::npos) continue;
+
+            std::vector<std::string> tokens;
+            std::string token;
+            std::stringstream fname_tmp(fnames[i]);
+            char delimiter = '_'; 
+        
+            while (std::getline(fname_tmp, token, delimiter)) { 
+                tokens.push_back(token); 
+            }
+            
+            cnpy::NpyArray arr = my_npz[fnames[i]];
+
+            int iat1 = std::stoi(tokens[1]);
+            int iat2 = std::stoi(tokens[2]);
+            int Rx = std::stoi(tokens[3]);
+            int Ry = std::stoi(tokens[4]);
+            int Rz = std::stoi(tokens[5]);
+
+            int it1 = GlobalC::ucell.iat2it[iat1];
+            int it2 = GlobalC::ucell.iat2it[iat2];
+
+            assert(arr.shape[0] == GlobalC::ucell.atoms[it1].nw);
+            assert(arr.shape[1] == GlobalC::ucell.atoms[it2].nw);
+
+            double* submat_read = arr.data<double>();
+            
+            hamilt::BaseMatrix<double>* submat = hR->find_matrix(iat1,iat2,Rx,Ry,Rz);
+
+            for(int i = 0; i < arr.shape[0]; i ++)
+            {
+                for(int j = 0; j < arr.shape[1]; j ++)
+                {
+                    submat->add_element(i,j,submat_read[i*arr.shape[1]+j]);
+                }
+            }
+            
+            // use symmetry : H_{mu,nu,R} = H_{nu,mu,-R}
+            if(Rx!=0 || Ry!=0 || Rz!=0)
+            {
+                hamilt::BaseMatrix<double>* submat = hR->find_matrix(iat2,iat1,-Rx,-Ry,-Rz);
+                for(int i = 0; i < arr.shape[0]; i ++)
+                {
+                    for(int j = 0; j < arr.shape[1]; j ++)
+                    {
+                        submat->add_element(j,i,submat_read[i*arr.shape[1]+j]);
+                    }
+                }
+            }
+        }
+
+#endif
+
+#endif
+}
+
+template <typename TK, typename TR>
+void ESolver_KS_LCAO<TK, TR>::output_mat_npz(std::string& zipname, const hamilt::HContainer<double>& hR)
+{
+    ModuleBase::TITLE("LCAO_Hamilt","output_mat_npz");
+
+#ifdef __USECNPY
+    std::string filename = "";
+
+    if(GlobalV::MY_RANK == 0)
+    {
+        
+// first block: lattice vectors
+        filename = "lattice_vectors";
+        std::vector<double> lattice_vectors;
+        lattice_vectors.resize(9);
+        lattice_vectors[0] = GlobalC::ucell.lat0 * GlobalC::ucell.a1.x;
+        lattice_vectors[1] = GlobalC::ucell.lat0 * GlobalC::ucell.a1.y;
+        lattice_vectors[2] = GlobalC::ucell.lat0 * GlobalC::ucell.a1.z;
+        lattice_vectors[3] = GlobalC::ucell.lat0 * GlobalC::ucell.a2.x;
+        lattice_vectors[4] = GlobalC::ucell.lat0 * GlobalC::ucell.a2.y;
+        lattice_vectors[5] = GlobalC::ucell.lat0 * GlobalC::ucell.a2.z;
+        lattice_vectors[6] = GlobalC::ucell.lat0 * GlobalC::ucell.a3.x;
+        lattice_vectors[7] = GlobalC::ucell.lat0 * GlobalC::ucell.a3.y;
+        lattice_vectors[8] = GlobalC::ucell.lat0 * GlobalC::ucell.a3.z;
+
+        cnpy::npz_save(zipname,filename,lattice_vectors);
+
+// second block: atom info
+        filename = "atom_info";
+        double* atom_info = new double[GlobalC::ucell.nat*5];
+        for(int iat = 0; iat < GlobalC::ucell.nat; ++iat)
+        {
+            const int it = GlobalC::ucell.iat2it[iat];
+            const int ia = GlobalC::ucell.iat2ia[iat];
+
+            //get atomic number (copied from write_cube.cpp)
+            std::string element = "";
+            element = GlobalC::ucell.atoms[it].label;
+			std::string::iterator temp = element.begin();
+			while (temp != element.end())
+			{
+				if ((*temp >= '1') && (*temp <= '9'))
+				{
+					temp = element.erase(temp);
+				}
+				else
+				{
+					temp++;
+				}
+			}
+            int z = 0;
+            for(int j=0; j!=ModuleBase::element_name.size(); j++)
+            {
+                if (element == ModuleBase::element_name[j])
+                {
+                    z=j+1;
+                    break;
+                }
+            }
+
+            atom_info[iat*5] = it;
+            atom_info[iat*5+1] = z;
+            atom_info[iat*5+2] = GlobalC::ucell.atoms[it].taud[ia].x;
+            atom_info[iat*5+3] = GlobalC::ucell.atoms[it].taud[ia].y;
+            atom_info[iat*5+4] = GlobalC::ucell.atoms[it].taud[ia].z;
+        }
+        std::vector<size_t> shape={(size_t)GlobalC::ucell.nat,5};
+
+        cnpy::npz_save(zipname,filename,atom_info,shape,"a");
+        delete[] atom_info;
+
+//third block: orbital info
+        for(int it = 0; it < GlobalC::ucell.ntype; ++it)
+        {
+            filename="orbital_info_"+std::to_string(it);
+            double* orbital_info = new double[GlobalC::ucell.atoms[it].nw*3];
+            for(int iw = 0; iw < GlobalC::ucell.atoms[it].nw; ++iw)
+            {
+                orbital_info[iw*3] = GlobalC::ucell.atoms[it].iw2n[iw];
+                orbital_info[iw*3+1] = GlobalC::ucell.atoms[it].iw2l[iw];
+                const int im = GlobalC::ucell.atoms[it].iw2m[iw];
+                const int m = (im % 2 == 0) ? -im/2 : (im+1)/2; // copied from LCAO_gen_fixedH.cpp
+                orbital_info[iw*3+2] = m;
+            }
+            shape={(size_t)GlobalC::ucell.atoms[it].nw,3};
+
+            cnpy::npz_save(zipname,filename,orbital_info,shape,"a");
+        }
+    }
+
+//fourth block: hr(i0,jR)
+#ifdef __MPI
+    hamilt::HContainer<double>* HR_serial;
+    Parallel_Orbitals serialV;
+    serialV.set_global2local(GlobalV::NLOCAL, GlobalV::NLOCAL, false, GlobalV::ofs_running);
+    serialV.set_atomic_trace(GlobalC::ucell.get_iat2iwt(), GlobalC::ucell.nat, GlobalV::NLOCAL);
+    if(GlobalV::MY_RANK == 0)
+    {
+        HR_serial = new hamilt::HContainer<double>(&serialV);
+    }
+    hamilt::gatherParallels(hR, HR_serial, 0);
+
+    if(GlobalV::MY_RANK==0)
+    {
+        for(int iap=0;iap<HR_serial[0].size_atom_pairs();++iap)
+        {
+            int atom_i = HR_serial[0].get_atom_pair(iap).get_atom_i();
+            int atom_j = HR_serial[0].get_atom_pair(iap).get_atom_j();
+            if(atom_i > atom_j) continue;
+            int start_i = serialV.atom_begin_row[atom_i];
+            int start_j = serialV.atom_begin_col[atom_j];
+            int row_size = serialV.get_row_size(atom_i);
+            int col_size = serialV.get_col_size(atom_j);
+            for(int iR=0;iR<HR_serial[0].get_atom_pair(iap).get_R_size();++iR)
+            {
+                auto& matrix = HR_serial[0].get_atom_pair(iap).get_HR_values(iR);
+                int* r_index = HR_serial[0].get_atom_pair(iap).get_R_index(iR);
+                filename = "mat_"+std::to_string(atom_i)+"_"+std::to_string(atom_j)+"_"
+                    +std::to_string(r_index[0])+"_"+std::to_string(r_index[1])+"_"+std::to_string(r_index[2]);
+                std::vector<size_t> shape = {(size_t)row_size,(size_t)col_size};
+                cnpy::npz_save(zipname,filename,matrix.get_pointer(),shape,"a");
+            }
+        }
+
+    }
+#else
+    const Parallel_Orbitals* paraV = this->LM->ParaV;
+    auto row_indexes = paraV->get_indexes_row();
+    auto col_indexes = paraV->get_indexes_col();
+    for(int iap=0;iap<hR.size_atom_pairs();++iap)
+    {
+        int atom_i = hR.get_atom_pair(iap).get_atom_i();
+        int atom_j = hR.get_atom_pair(iap).get_atom_j();
+        int start_i = paraV->atom_begin_row[atom_i];
+        int start_j = paraV->atom_begin_col[atom_j];
+        int row_size = paraV->get_row_size(atom_i);
+        int col_size = paraV->get_col_size(atom_j);
+        for(int iR=0;iR<hR.get_atom_pair(iap).get_R_size();++iR)
+        {
+            auto& matrix = hR.get_atom_pair(iap).get_HR_values(iR);
+            int* r_index = hR.get_atom_pair(iap).get_R_index(iR);
+
+            filename = "mat_"+std::to_string(atom_i)+"_"+std::to_string(atom_j)+"_"
+                +std::to_string(r_index[0])+"_"+std::to_string(r_index[1])+"_"+std::to_string(r_index[2]);
+            std::vector<size_t> shape = {(size_t)row_size,(size_t)col_size};
+            cnpy::npz_save(zipname,filename,matrix.get_pointer(),shape,"a");
+        }
+    }
+#endif
+#endif
+}
+
+template class ESolver_KS_LCAO<double, double>;
+template class ESolver_KS_LCAO<std::complex<double>, double>;
+template class ESolver_KS_LCAO<std::complex<double>, std::complex<double>>;
+}
\ No newline at end of file
diff --git a/source/module_io/input.cpp b/source/module_io/input.cpp
index 60fc3ce47b..4fe9acac6d 100644
--- a/source/module_io/input.cpp
+++ b/source/module_io/input.cpp
@@ -346,6 +346,9 @@ void Input::Default(void)
     out_proj_band = 0;
     out_mat_hs = {0, 8};
     out_mat_xc = 0;
+    out_hr_npz = 0;
+    out_dm_npz = 0;
+    dm_to_rho = 0;
     cal_syns = 0;
     dmax = 0.01;
     out_mat_hs2 = 0; // LiuXh add 2019-07-15
@@ -1435,6 +1438,18 @@ bool Input::Read(const std::string& fn)
         {
             read_bool(ifs, out_mat_xc);
         }
+        else if (strcmp("out_hr_npz", word) == 0)
+        {
+            read_bool(ifs, out_hr_npz);
+        }
+        else if (strcmp("out_dm_npz", word) == 0)
+        {
+            read_bool(ifs, out_dm_npz);
+        }
+        else if (strcmp("dm_to_rho", word) == 0)
+        {
+            read_bool(ifs, dm_to_rho);
+        }
         else if (strcmp("out_interval", word) == 0)
         {
             read_value(ifs, out_interval);
@@ -2637,10 +2652,20 @@ bool Input::Read(const std::string& fn)
             gamma_only_local = 0;
         }
     }
-    if ((out_mat_r || out_mat_hs2 || out_mat_t || out_mat_dh) && gamma_only_local)
+    if ((out_mat_r || out_mat_hs2 || out_mat_t || out_mat_dh || out_hr_npz || out_dm_npz || dm_to_rho) && gamma_only_local)
     {
         ModuleBase::WARNING_QUIT("Input",
-                                 "printing of H(R)/S(R)/dH(R)/T(R) is not available for gamma only calculations");
+                                 "printing of H(R)/S(R)/dH(R)/T(R)/DM(R) is not available for gamma only calculations");
+    }
+    if(dm_to_rho && GlobalV::NPROC > 1)
+    {
+        ModuleBase::WARNING_QUIT("Input", "dm_to_rho is not available for parallel calculations");
+    }
+    if(out_hr_npz || out_dm_npz || dm_to_rho)
+    {
+#ifndef __USECNPY
+        ModuleBase::WARNING_QUIT("Input", "to write in npz format, please recompile with -DENABLE_CNPY=1");
+#endif
     }
     if (out_mat_dh && nspin == 4)
     {
@@ -3411,6 +3436,9 @@ void Input::Bcast()
     Parallel_Common::bcast_bool(out_mat_t);
     Parallel_Common::bcast_bool(out_mat_dh);
     Parallel_Common::bcast_bool(out_mat_xc);
+    Parallel_Common::bcast_bool(out_hr_npz);
+    Parallel_Common::bcast_bool(out_dm_npz);
+    Parallel_Common::bcast_bool(dm_to_rho);
     Parallel_Common::bcast_bool(out_mat_r); // jingan add 2019-8-14
     Parallel_Common::bcast_int(out_wfc_lcao);
     Parallel_Common::bcast_bool(out_alllog);
diff --git a/source/module_io/input.h b/source/module_io/input.h
index 857068d2da..a4f9f18895 100644
--- a/source/module_io/input.h
+++ b/source/module_io/input.h
@@ -271,6 +271,9 @@ class Input
     bool out_proj_band; // projected band structure calculation jiyy add 2022-05-11
     std::vector<int> out_mat_hs; // output H matrix and S matrix in local basis.
     bool out_mat_xc; // output exchange-correlation matrix in KS-orbital representation.
+    bool out_hr_npz;// output exchange-correlation matrix in KS-orbital representation.
+    bool out_dm_npz;
+    bool dm_to_rho;
     bool cal_syns; // calculate asynchronous S matrix to output
     double dmax; // maximum displacement of all atoms in one step (bohr)
     bool out_mat_hs2; // LiuXh add 2019-07-16, output H(R) matrix and S(R) matrix in local basis.
diff --git a/source/module_io/input_conv.cpp b/source/module_io/input_conv.cpp
index 229c517ef8..9a9dd1896b 100644
--- a/source/module_io/input_conv.cpp
+++ b/source/module_io/input_conv.cpp
@@ -684,6 +684,9 @@ void Input_Conv::Convert(void)
     hsolver::HSolverLCAO<std::complex<double>>::out_mat_t = INPUT.out_mat_t;
     hsolver::HSolverLCAO<std::complex<double>>::out_mat_dh = INPUT.out_mat_dh;
     GlobalV::out_mat_xc = INPUT.out_mat_xc;
+    GlobalV::out_hr_npz = INPUT.out_hr_npz;
+    GlobalV::out_dm_npz = INPUT.out_dm_npz;
+    GlobalV::dm_to_rho = INPUT.dm_to_rho;
     if (GlobalV::GAMMA_ONLY_LOCAL)
     {
         elecstate::ElecStateLCAO<double>::out_wfc_lcao = INPUT.out_wfc_lcao;
diff --git a/source/module_io/parameter_pool.cpp b/source/module_io/parameter_pool.cpp
index 9be1fe00ac..a3852ccd6a 100644
--- a/source/module_io/parameter_pool.cpp
+++ b/source/module_io/parameter_pool.cpp
@@ -926,6 +926,18 @@ void input_parameters_set(std::map<std::string, InputParameter> input_parameters
     {
         INPUT.out_mat_xc = *static_cast<bool*>(input_parameters["out_mat_xc"].get());
     }
+    else if (input_parameters.count("out_hr_npz") != 0)
+    {
+        INPUT.out_hr_npz = *static_cast<bool*>(input_parameters["out_hr_npz"].get());
+    }
+    else if (input_parameters.count("out_dm_npz") != 0)
+    {
+        INPUT.out_dm_npz = *static_cast<bool*>(input_parameters["out_dm_npz"].get());
+    }
+    else if (input_parameters.count("dm_to_rho") != 0)
+    {
+        INPUT.dm_to_rho = *static_cast<bool*>(input_parameters["dm_to_rho"].get());
+    }
     else if (input_parameters.count("cal_syns") != 0)
     {
         INPUT.cal_syns = *static_cast<bool*>(input_parameters["cal_syns"].get());
diff --git a/source/module_io/test/input_conv_test.cpp b/source/module_io/test/input_conv_test.cpp
index fac85bc51d..ea5d2f5266 100644
--- a/source/module_io/test/input_conv_test.cpp
+++ b/source/module_io/test/input_conv_test.cpp
@@ -154,6 +154,9 @@ TEST_F(InputConvTest, Conv)
     EXPECT_EQ(hsolver::HSolverLCAO<double>::out_mat_dh, INPUT.out_mat_dh);
     EXPECT_EQ(hsolver::HSolverLCAO<std::complex<double>>::out_mat_dh, INPUT.out_mat_dh);
     EXPECT_EQ(GlobalV::out_mat_xc, false);
+	EXPECT_EQ(GlobalV::out_hr_npz, false);
+	EXPECT_EQ(GlobalV::out_dm_npz, false);
+	EXPECT_EQ(GlobalV::dm_to_rho, false);
     EXPECT_EQ(GlobalV::out_interval, 1);
     EXPECT_EQ(elecstate::ElecStateLCAO<double>::out_wfc_lcao, false);
     EXPECT_EQ(berryphase::berry_phase_flag, false);
diff --git a/source/module_io/test/write_input_test.cpp b/source/module_io/test/write_input_test.cpp
index 8d3b8640df..8f9059a67e 100644
--- a/source/module_io/test/write_input_test.cpp
+++ b/source/module_io/test/write_input_test.cpp
@@ -324,6 +324,9 @@ TEST_F(write_input, LCAO5)
                 testing::HasSubstr("lcao_rmax                      30 #max R for 1D two-center integration table"));
     EXPECT_THAT(output, testing::HasSubstr("out_mat_hs                     0 #output H and S matrix"));
     EXPECT_THAT(output, testing::HasSubstr("out_mat_xc                     0 #output exchange-correlation matrix in KS-orbital representation"));
+    EXPECT_THAT(output, testing::HasSubstr("out_hr_npz                     0 #output hr(I0,JR) submatrices in npz format"));
+    EXPECT_THAT(output, testing::HasSubstr("out_dm_npz                     0 #output dmr(I0,JR) submatrices in npz format"));
+    EXPECT_THAT(output, testing::HasSubstr("dm_to_rho                      0 #reads dmr in npz format and calculates electron density"));
     EXPECT_THAT(output, testing::HasSubstr("out_mat_hs2                    0 #output H(R) and S(R) matrix"));
     EXPECT_THAT(output, testing::HasSubstr("out_mat_dh                     0 #output of derivative of H(R) matrix"));
     EXPECT_THAT(
diff --git a/source/module_io/write_input.cpp b/source/module_io/write_input.cpp
index cfb7174508..5035a618b5 100644
--- a/source/module_io/write_input.cpp
+++ b/source/module_io/write_input.cpp
@@ -229,6 +229,9 @@ ModuleBase::GlobalFunc::OUTP(ofs, "out_bandgap", out_bandgap, "if true, print ou
     ModuleBase::GlobalFunc::OUTP(ofs, "out_mat_hs2", out_mat_hs2, "output H(R) and S(R) matrix");
     ModuleBase::GlobalFunc::OUTP(ofs, "out_mat_dh", out_mat_dh, "output of derivative of H(R) matrix");
     ModuleBase::GlobalFunc::OUTP(ofs, "out_mat_xc", out_mat_xc, "output exchange-correlation matrix in KS-orbital representation");
+    ModuleBase::GlobalFunc::OUTP(ofs, "out_hr_npz", out_hr_npz, "output hr(I0,JR) submatrices in npz format");
+    ModuleBase::GlobalFunc::OUTP(ofs, "out_dm_npz", out_dm_npz, "output dmr(I0,JR) submatrices in npz format");
+    ModuleBase::GlobalFunc::OUTP(ofs, "dm_to_rho", dm_to_rho, "reads dmr in npz format and calculates electron density");
     ModuleBase::GlobalFunc::OUTP(ofs, "out_interval", out_interval, "interval for printing H(R) and S(R) matrix during MD");
     ModuleBase::GlobalFunc::OUTP(ofs, "out_app_flag", out_app_flag, "whether output r(R), H(R), S(R), T(R), and dH(R) matrices in an append manner during MD");
     ModuleBase::GlobalFunc::OUTP(ofs, "out_mat_t", out_mat_t, "output T(R) matrix");

From 579048d8b0b553657502fa43c1d4f1aaf4e2e387 Mon Sep 17 00:00:00 2001
From: Chun Cai <amoycaic@gmail.com>
Date: Tue, 30 Apr 2024 15:06:55 +0800
Subject: [PATCH 11/18] pin MPI version (#4065)

---
 Dockerfile.intel | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/Dockerfile.intel b/Dockerfile.intel
index 39ca4c1431..269713a16c 100644
--- a/Dockerfile.intel
+++ b/Dockerfile.intel
@@ -16,18 +16,16 @@ RUN apt-get update && \
         intel-oneapi-compiler-dpcpp-cpp \
         intel-oneapi-compiler-fortran \
         intel-oneapi-mkl-devel \
-        intel-oneapi-mpi-devel \
+        intel-oneapi-mpi-devel="2021.11.*" \
         intel-oneapi-vtune
-
-
-ENV I_MPI_ROOT='/opt/intel/oneapi/mpi/latest' \
+ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest \
     LIBRARY_PATH=/opt/intel/oneapi/tbb/latest/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/ippcp/latest/lib/:/opt/intel/oneapi/ipp/latest/lib:/opt/intel/oneapi/dpl/latest/lib:/opt/intel/oneapi/dnnl/latest/lib:/opt/intel/oneapi/dal/latest/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/ccl/latest/lib/ \
     LD_LIBRARY_PATH=/opt/intel/oneapi/tbb/latest/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib:/opt/intel/oneapi/itac/latest/slib:/opt/intel/oneapi/ippcp/latest/lib/:/opt/intel/oneapi/ipp/latest/lib:/opt/intel/oneapi/dpl/latest/lib:/opt/intel/oneapi/dnnl/latest/lib:/opt/intel/oneapi/debugger/latest/opt/debugger/lib:/opt/intel/oneapi/dal/latest/lib:/opt/intel/oneapi/compiler/latest/opt/oclfpga/host/linux64/lib:/opt/intel/oneapi/compiler/latest/opt/compiler/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/ccl/latest/lib/ \
     PATH=/opt/intel/oneapi/vtune/latest/bin64:/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/bin:/opt/intel/oneapi/mpi/latest/bin:/opt/intel/oneapi/mkl/latest/bin/:/opt/intel/oneapi/itac/latest/bin:/opt/intel/oneapi/inspector/latest/bin64:/opt/intel/oneapi/dpcpp-ct/latest/bin:/opt/intel/oneapi/dev-utilities/latest/bin:/opt/intel/oneapi/debugger/latest/opt/debugger/bin:/opt/intel/oneapi/compiler/latest/opt/oclfpga/bin:/opt/intel/oneapi/compiler/latest/bin:/opt/intel/oneapi/advisor/latest/bin64:/opt/mamba/bin:/opt/mamba/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin \
-    MKLROOT='/opt/intel/oneapi/mkl/latest' \
-    FI_PROVIDER_PATH='/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric' \
-    CMAKE_PREFIX_PATH='/opt/intel/oneapi/tbb/latest/env/..:/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/ipp/latest/lib/cmake/ipp:/opt/intel/oneapi/dpl/latest/lib/cmake/oneDPL:/opt/intel/oneapi/dnnl/latest/lib/cmake:/opt/intel/oneapi/dal/latest:/opt/intel/oneapi/compiler/latest' \
-    CMPLR_ROOT='/opt/intel/oneapi/compiler/latest'
+    MKLROOT=/opt/intel/oneapi/mkl/latest \
+    FI_PROVIDER_PATH=/opt/intel/oneapi/mpi/latest/opt/mpi/libfabric/lib/prov:/usr/lib/x86_64-linux-gnu/libfabric \
+    CMAKE_PREFIX_PATH=/opt/intel/oneapi/tbb/latest/env/..:/opt/intel/oneapi/mkl/latest/lib/cmake:/opt/intel/oneapi/ipp/latest/lib/cmake/ipp:/opt/intel/oneapi/dpl/latest/lib/cmake/oneDPL:/opt/intel/oneapi/dnnl/latest/lib/cmake:/opt/intel/oneapi/dal/latest:/opt/intel/oneapi/compiler/latest \
+    CMPLR_ROOT=/opt/intel/oneapi/compiler/latest
 
 SHELL ["/bin/bash", "-c"]
 ENV CC=mpiicx CXX=mpiicpx FC=mpiifx

From a2a0e19082284742f54171cde77c2384ee689c63 Mon Sep 17 00:00:00 2001
From: wqzhou <33364058+WHUweiqingzhou@users.noreply.github.com>
Date: Tue, 30 Apr 2024 16:10:14 +0800
Subject: [PATCH 12/18] add some docs (#4073)

---
 docs/advanced/input_files/input-main.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index f1345f04c4..993e250478 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -438,7 +438,7 @@ These variables are used to control general system parameters.
 
 - **Type**: Integer
 - **Description**: takes value 1, 0 or -1.
-  - -1: No symmetry will be considered.
+  - -1: No symmetry will be considered. It is recommended to set -1 for non-colinear + soc calculations, where time reversal symmetry is broken sometimes.
   - 0: Only time reversal symmetry would be considered in symmetry operations, which implied k point and -k point would be treated as a single k point with twice the weight.
   - 1: Symmetry analysis will be performed to determine the type of Bravais lattice and associated symmetry operations. (point groups, space groups, primitive cells, and irreducible k-points)
 - **Default**: 

From 1439426b631f1dccfbecb1806915e18b2c566722 Mon Sep 17 00:00:00 2001
From: wqzhou <33364058+WHUweiqingzhou@users.noreply.github.com>
Date: Tue, 30 Apr 2024 17:02:18 +0800
Subject: [PATCH 13/18] Fix: use std::abs() instead of abs() (#4076)

---
 source/module_elecstate/module_charge/charge_mixing.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/source/module_elecstate/module_charge/charge_mixing.cpp b/source/module_elecstate/module_charge/charge_mixing.cpp
index 57083301b2..e327904d0e 100755
--- a/source/module_elecstate/module_charge/charge_mixing.cpp
+++ b/source/module_elecstate/module_charge/charge_mixing.cpp
@@ -504,7 +504,7 @@ void Charge_Mixing::mix_rho_recip(Charge* chr)
         {
             chr->rhog[0][ig] = rhog_magabs[ig]; // rhog
             double norm = std::sqrt(chr->rho[1][ig] * chr->rho[1][ig] + chr->rho[2][ig] * chr->rho[2][ig] + chr->rho[3][ig] * chr->rho[3][ig]);
-            if (abs(norm) < 1e-10) continue;
+            if (std::abs(norm) < 1e-10) continue;
             double rescale_tmp = rho_magabs[npw + ig] / norm; 
             chr->rho[1][ig] *= rescale_tmp;
             chr->rho[2][ig] *= rescale_tmp;

From efb67a9725848f9f829b14af5c64b3cb1436be20 Mon Sep 17 00:00:00 2001
From: Taoni Bao <baotaoni@pku.edu.cn>
Date: Sat, 4 May 2024 17:01:24 +0800
Subject: [PATCH 14/18] Refactor istate charge to reduce dependence on other
 modules (#4086)

---
 source/module_elecstate/fp_energy.cpp         |  47 +++++--
 source/module_elecstate/fp_energy.h           |  11 +-
 .../module_esolver/esolver_ks_lcao_elec.cpp   | 132 +++++++++---------
 source/module_io/input.h                      |   9 ++
 source/module_io/istate_charge.cpp            | 120 +++++++++-------
 source/module_io/istate_charge.h              |  38 +++--
 6 files changed, 206 insertions(+), 151 deletions(-)

diff --git a/source/module_elecstate/fp_energy.cpp b/source/module_elecstate/fp_energy.cpp
index 47ed45f400..767c18cebe 100644
--- a/source/module_elecstate/fp_energy.cpp
+++ b/source/module_elecstate/fp_energy.cpp
@@ -1,36 +1,38 @@
 #include "fp_energy.h"
+
 #include "module_base/global_variable.h"
 #ifdef USE_PAW
 #include "module_cell/module_paw/paw_cell.h"
 #endif
 
+#include "module_base/tool_quit.h"
+
 #include <iomanip>
 #include <iostream>
 
-#include "module_base/tool_quit.h"
 namespace elecstate
 {
 
 /// @brief calculate etot
 double fenergy::calculate_etot()
 {
-    if(GlobalV::use_paw)
+    if (GlobalV::use_paw)
     {
-        etot = eband + deband + etxc + ewald_energy - hartree_energy + demet + descf + exx + efield + gatefield
-               + evdw + esol_el + esol_cav + edftu + edeepks_scf;
+        etot = eband + deband + etxc + ewald_energy - hartree_energy + demet + descf + exx + efield + gatefield + evdw
+               + esol_el + esol_cav + edftu + edeepks_scf;
     }
     else
     {
-        etot = eband + deband + (etxc - etxcc) + ewald_energy + hartree_energy + demet + descf + exx + efield + gatefield
-               + evdw + esol_el + esol_cav + edftu + edeepks_scf + escon;
+        etot = eband + deband + (etxc - etxcc) + ewald_energy + hartree_energy + demet + descf + exx + efield
+               + gatefield + evdw + esol_el + esol_cav + edftu + edeepks_scf + escon;
     }
 
 #ifdef USE_PAW
-    if(GlobalV::use_paw)
+    if (GlobalV::use_paw)
     {
         double ecore = GlobalC::paw_cell.calculate_ecore();
         double epawdc = GlobalC::paw_cell.get_epawdc();
-        etot += ( ecore + epawdc );
+        etot += (ecore + epawdc);
     }
 #endif
     return etot;
@@ -39,22 +41,22 @@ double fenergy::calculate_etot()
 /// @brief calculate etot_harris
 double fenergy::calculate_harris()
 {
-    if(GlobalV::use_paw)
+    if (GlobalV::use_paw)
     {
         etot_harris = eband + deband_harris + etxc + ewald_energy - hartree_energy + demet + descf + exx + efield
-                  + gatefield + evdw + esol_el + esol_cav + edftu + edeepks_scf;
+                      + gatefield + evdw + esol_el + esol_cav + edftu + edeepks_scf;
     }
     else
     {
-        etot_harris = eband + deband_harris + (etxc - etxcc) + ewald_energy + hartree_energy + demet + descf + exx + efield
-                  + gatefield + evdw + esol_el + esol_cav + edftu + edeepks_scf + escon;
+        etot_harris = eband + deband_harris + (etxc - etxcc) + ewald_energy + hartree_energy + demet + descf + exx
+                      + efield + gatefield + evdw + esol_el + esol_cav + edftu + edeepks_scf + escon;
     }
 #ifdef USE_PAW
-    if(GlobalV::use_paw)
+    if (GlobalV::use_paw)
     {
         double ecore = GlobalC::paw_cell.calculate_ecore();
         double epawdc = GlobalC::paw_cell.get_epawdc();
-        etot_harris += ( ecore + epawdc );
+        etot_harris += (ecore + epawdc);
     }
 #endif
     return etot_harris;
@@ -64,7 +66,8 @@ double fenergy::calculate_harris()
 void fenergy::clear_all()
 {
     etot = etot_old = eband = deband = etxc = etxcc = vtxc = ewald_energy = hartree_energy = demet = descf = exx
-        = efield = gatefield = evdw = etot_harris = deband_harris = esol_el = esol_cav = edftu = edeepks_scf = escon = 0.0;
+        = efield = gatefield = evdw = etot_harris = deband_harris = esol_el = esol_cav = edftu = edeepks_scf = escon
+        = 0.0;
 }
 
 /// @brief print all energies
@@ -140,4 +143,18 @@ double efermi::get_efval(const int& is) const
     }
 }
 
+/// @brief get all fermi energies for all spins
+/// @return all fermi energies for all spins
+std::vector<double> efermi::get_all_ef() const
+{
+    if (two_efermi)
+    {
+        return {ef_up, ef_dw};
+    }
+    else
+    {
+        return {ef, ef}; // For NSPIN=1, ef_up=ef_dw=ef
+    }
+}
+
 } // namespace elecstate
diff --git a/source/module_elecstate/fp_energy.h b/source/module_elecstate/fp_energy.h
index 4d3e8d4ec3..9c922a3276 100644
--- a/source/module_elecstate/fp_energy.h
+++ b/source/module_elecstate/fp_energy.h
@@ -1,3 +1,5 @@
+#include <vector>
+
 /**
  * @file fp_energy.h
  * @brief This file contains all energies about first-principle calculations
@@ -12,8 +14,8 @@ namespace elecstate
  */
 struct fenergy
 {
-    double etot = 0.0;     ///< the total free energy
-    double etot_old = 0.0; ///< old total free energy
+    double etot = 0.0;       ///< the total free energy
+    double etot_old = 0.0;   ///< old total free energy
     double etot_delta = 0.0; // the difference of total energy between two steps = etot - etot_old
 
     double eband = 0.0;          ///< the band energy
@@ -42,8 +44,8 @@ struct fenergy
 
     double escon = 0.0; ///< spin constraint energy
 
-    double ekinetic = 0.0;      /// kinetic energy, used in OFDFT
-    double eion_elec = 0.0;     /// ion-electron interaction energy, used in OFDFT
+    double ekinetic = 0.0;  /// kinetic energy, used in OFDFT
+    double eion_elec = 0.0; /// ion-electron interaction energy, used in OFDFT
 
     double calculate_etot();
     double calculate_harris();
@@ -63,6 +65,7 @@ struct efermi
     bool two_efermi = false; ///<
     double& get_ef(const int& is);
     double get_efval(const int& is) const;
+    std::vector<double> get_all_ef() const;
 };
 
 } // namespace elecstate
diff --git a/source/module_esolver/esolver_ks_lcao_elec.cpp b/source/module_esolver/esolver_ks_lcao_elec.cpp
index 3646858a49..aa8e53bd7c 100644
--- a/source/module_esolver/esolver_ks_lcao_elec.cpp
+++ b/source/module_esolver/esolver_ks_lcao_elec.cpp
@@ -19,11 +19,10 @@
 #include "module_elecstate/elecstate_lcao.h"
 #include "module_hamilt_general/module_ewald/H_Ewald_pw.h"
 #include "module_hamilt_general/module_vdw/vdw.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/op_exx_lcao.h"
-#include "module_io/dm_io.h"
-
 #include "module_hamilt_lcao/module_deltaspin/spin_constrain.h"
-#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
+#include "module_io/dm_io.h"
 
 namespace ModuleESolver
 {
@@ -102,8 +101,8 @@ void ESolver_KS_LCAO<TK, TR>::beforesolver(const int istep)
     // init psi
     if (this->psi == nullptr)
     {
-        int nsk=0;
-        int ncol=0;
+        int nsk = 0;
+        int ncol = 0;
         if (GlobalV::GAMMA_ONLY_LOCAL)
         {
             nsk = GlobalV::NSPIN;
@@ -127,12 +126,7 @@ void ESolver_KS_LCAO<TK, TR>::beforesolver(const int istep)
     }
 
     // prepare grid in Gint
-	LCAO_domain::grid_prepare(
-			this->GridT, 
-			this->GG,
-			this->GK,
-			*this->pw_rho, 
-			*this->pw_big);
+    LCAO_domain::grid_prepare(this->GridT, this->GG, this->GK, *this->pw_rho, *this->pw_big);
 
     // init Hamiltonian
     if (this->p_hamilt != nullptr)
@@ -143,7 +137,8 @@ void ESolver_KS_LCAO<TK, TR>::beforesolver(const int istep)
     if (this->p_hamilt == nullptr)
     {
         elecstate::DensityMatrix<TK, double>* DM = dynamic_cast<elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM();
-        this->p_hamilt = new hamilt::HamiltLCAO<TK, TR>(GlobalV::GAMMA_ONLY_LOCAL ? &(this->GG) : nullptr,
+        this->p_hamilt = new hamilt::HamiltLCAO<TK, TR>(
+            GlobalV::GAMMA_ONLY_LOCAL ? &(this->GG) : nullptr,
             GlobalV::GAMMA_ONLY_LOCAL ? nullptr : &(this->GK),
             &(this->gen_h),
             &(this->LM),
@@ -270,12 +265,12 @@ void ESolver_KS_LCAO<TK, TR>::beforesolver(const int istep)
     //=========================================================
     // cal_ux should be called before init_scf because
     // the direction of ux is used in noncoline_rho
-	//=========================================================
-	if(GlobalV::NSPIN == 4 && GlobalV::DOMAG) 
-	{
-		GlobalC::ucell.cal_ux();
-	}
-	ModuleBase::timer::tick("ESolver_KS_LCAO", "beforesolver");
+    //=========================================================
+    if (GlobalV::NSPIN == 4 && GlobalV::DOMAG)
+    {
+        GlobalC::ucell.cal_ux();
+    }
+    ModuleBase::timer::tick("ESolver_KS_LCAO", "beforesolver");
 }
 
 template <typename TK, typename TR>
@@ -311,15 +306,15 @@ void ESolver_KS_LCAO<TK, TR>::before_scf(int istep)
 
     this->beforesolver(istep);
     // Peize Lin add 2016-12-03
-#ifdef __EXX    // set xc type before the first cal of xc in pelec->init_scf
-	if (GlobalC::exx_info.info_ri.real_number)
-	{
-		this->exd->exx_beforescf(this->kv, *this->p_chgmix);
-	}
-	else
-	{
-		this->exc->exx_beforescf(this->kv, *this->p_chgmix);
-	}
+#ifdef __EXX // set xc type before the first cal of xc in pelec->init_scf
+    if (GlobalC::exx_info.info_ri.real_number)
+    {
+        this->exd->exx_beforescf(this->kv, *this->p_chgmix);
+    }
+    else
+    {
+        this->exc->exx_beforescf(this->kv, *this->p_chgmix);
+    }
 #endif // __EXX
 
     this->pelec->init_scf(istep, this->sf.strucFac);
@@ -329,22 +324,22 @@ void ESolver_KS_LCAO<TK, TR>::before_scf(int istep)
         ->get_DM()
         ->init_DMR(*(dynamic_cast<hamilt::HamiltLCAO<TK, TR>*>(this->p_hamilt)->getHR()));
 
-    if(GlobalV::dm_to_rho)
+    if (GlobalV::dm_to_rho)
     {
         std::string zipname = "output_DM0.npz";
         elecstate::DensityMatrix<TK, double>* dm
             = dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM();
-        this->read_mat_npz(zipname,*(dm->get_DMR_pointer(1)));
-        if(GlobalV::NSPIN == 2)
+        this->read_mat_npz(zipname, *(dm->get_DMR_pointer(1)));
+        if (GlobalV::NSPIN == 2)
         {
             zipname = "output_DM1.npz";
-            this->read_mat_npz(zipname,*(dm->get_DMR_pointer(2)));
+            this->read_mat_npz(zipname, *(dm->get_DMR_pointer(2)));
         }
 
         this->pelec->psiToRho(*this->psi);
 
         this->create_Output_Rho(0, istep).write();
-        if(GlobalV::NSPIN == 2)
+        if (GlobalV::NSPIN == 2)
         {
             this->create_Output_Rho(1, istep).write();
         }
@@ -358,7 +353,7 @@ void ESolver_KS_LCAO<TK, TR>::before_scf(int istep)
     for (int is = 0; is < GlobalV::NSPIN; is++)
     {
         srho.begin(is, *(this->pelec->charge), this->pw_rho, GlobalC::Pgrid, GlobalC::ucell.symm);
-    } 
+    }
 
     // 1. calculate ewald energy.
     // mohan update 2021-02-25
@@ -423,11 +418,20 @@ void ESolver_KS_LCAO<TK, TR>::others(const int istep)
     {
         IState_Charge ISC(this->psi, this->LOC);
         ISC.begin(this->GG,
-                  this->pelec,
-                  this->pw_rho,
-                  this->pw_big,
+                  this->pelec->charge->rho,
+                  this->pelec->wg,
+                  this->pelec->eferm.get_all_ef(),
+                  this->pw_rho->nrxx,
+                  this->pw_rho->nplane,
+                  this->pw_rho->startz_current,
+                  this->pw_rho->nx,
+                  this->pw_rho->ny,
+                  this->pw_rho->nz,
+                  this->pw_big->bz,
+                  this->pw_big->nbz,
                   GlobalV::GAMMA_ONLY_LOCAL,
                   GlobalV::NBANDS_ISTATE,
+                  INPUT.get_out_band_kb(),
                   GlobalV::NBANDS,
                   GlobalV::nelec,
                   GlobalV::NSPIN,
@@ -481,7 +485,6 @@ void ESolver_KS_LCAO<TK, TR>::others(const int istep)
     return;
 }
 
-
 template <>
 void ESolver_KS_LCAO<double, double>::get_S(void)
 {
@@ -489,7 +492,6 @@ void ESolver_KS_LCAO<double, double>::get_S(void)
     ModuleBase::WARNING_QUIT("ESolver_KS_LCAO<double,double>::get_S", "not implemented for");
 }
 
-
 template <>
 void ESolver_KS_LCAO<std::complex<double>, double>::get_S(void)
 {
@@ -523,7 +525,6 @@ void ESolver_KS_LCAO<std::complex<double>, double>::get_S(void)
     return;
 }
 
-
 template <>
 void ESolver_KS_LCAO<std::complex<double>, std::complex<double>>::get_S(void)
 {
@@ -546,8 +547,7 @@ void ESolver_KS_LCAO<std::complex<double>, std::complex<double>>::get_S(void)
     this->LM.ParaV = &this->orb_con.ParaV;
     if (this->p_hamilt == nullptr)
     {
-        this->p_hamilt
-            = new hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>(&this->LM, this->kv);
+        this->p_hamilt = new hamilt::HamiltLCAO<std::complex<double>, std::complex<double>>(&this->LM, this->kv);
         dynamic_cast<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>*>(this->p_hamilt->ops)
             ->contributeHR();
     }
@@ -557,7 +557,6 @@ void ESolver_KS_LCAO<std::complex<double>, std::complex<double>>::get_S(void)
     return;
 }
 
-
 template <typename TK, typename TR>
 void ESolver_KS_LCAO<TK, TR>::nscf(void)
 {
@@ -661,36 +660,31 @@ void ESolver_KS_LCAO<TK, TR>::nscf(void)
 #ifdef __LCAO
         if (INPUT.wannier_method == 1)
         {
-            toWannier90_LCAO_IN_PW myWannier(
-                INPUT.out_wannier_mmn,
-                INPUT.out_wannier_amn,
-                INPUT.out_wannier_unk, 
-                INPUT.out_wannier_eig,
-                INPUT.out_wannier_wvfn_formatted,
-                INPUT.nnkpfile,
-                INPUT.wannier_spin
-            );
-
-            myWannier.calculate(
-              this->pelec->ekb, 
-              this->pw_wfc, 
-              this->pw_big, 
-              this->sf, 
-              this->kv, 
-              this->psi, 
-              this->LOWF.ParaV);
+            toWannier90_LCAO_IN_PW myWannier(INPUT.out_wannier_mmn,
+                                             INPUT.out_wannier_amn,
+                                             INPUT.out_wannier_unk,
+                                             INPUT.out_wannier_eig,
+                                             INPUT.out_wannier_wvfn_formatted,
+                                             INPUT.nnkpfile,
+                                             INPUT.wannier_spin);
+
+            myWannier.calculate(this->pelec->ekb,
+                                this->pw_wfc,
+                                this->pw_big,
+                                this->sf,
+                                this->kv,
+                                this->psi,
+                                this->LOWF.ParaV);
         }
         else if (INPUT.wannier_method == 2)
         {
-            toWannier90_LCAO myWannier(
-                INPUT.out_wannier_mmn,
-                INPUT.out_wannier_amn,
-                INPUT.out_wannier_unk, 
-                INPUT.out_wannier_eig,
-                INPUT.out_wannier_wvfn_formatted,
-                INPUT.nnkpfile,
-                INPUT.wannier_spin
-            );
+            toWannier90_LCAO myWannier(INPUT.out_wannier_mmn,
+                                       INPUT.out_wannier_amn,
+                                       INPUT.out_wannier_unk,
+                                       INPUT.out_wannier_eig,
+                                       INPUT.out_wannier_wvfn_formatted,
+                                       INPUT.nnkpfile,
+                                       INPUT.wannier_spin);
 
             myWannier.calculate(this->pelec->ekb, this->kv, *(this->psi), this->LOWF.ParaV);
         }
diff --git a/source/module_io/input.h b/source/module_io/input.h
index a4f9f18895..4bfbf37be3 100644
--- a/source/module_io/input.h
+++ b/source/module_io/input.h
@@ -9,6 +9,7 @@
 
 #include "module_base/vector3.h"
 #include "module_md/md_para.h"
+#include "input_conv.h"
 
 class Input
 {
@@ -704,10 +705,18 @@ class Input
     void read_bool(std::ifstream &ifs, bool &var);
 
     // Return the const string pointer of private member bands_to_print_
+    // Not recommended to use this function directly, use get_out_band_kb() instead
     const std::string* get_bands_to_print() const
     {
         return &bands_to_print_;
     }
+    // Return parsed bands_to_print_ as a vector of integers
+    std::vector<int> get_out_band_kb() const
+    {
+        std::vector<int> out_band_kb;
+        Input_Conv::parse_expression(bands_to_print_, out_band_kb);
+        return out_band_kb;
+    }
 };
 
 extern Input INPUT;
diff --git a/source/module_io/istate_charge.cpp b/source/module_io/istate_charge.cpp
index 8fe4a170d5..f4bab942e4 100644
--- a/source/module_io/istate_charge.cpp
+++ b/source/module_io/istate_charge.cpp
@@ -6,7 +6,6 @@
 #include "module_base/parallel_common.h"
 #include "module_base/scalapack_connector.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
-#include "module_io/input_conv.h"
 #include "module_io/rho_io.h"
 
 IState_Charge::IState_Charge(psi::Psi<double>* psi_gamma_in, Local_Orbital_Charge& loc_in)
@@ -19,11 +18,20 @@ IState_Charge::~IState_Charge()
 }
 
 void IState_Charge::begin(Gint_Gamma& gg,
-                          elecstate::ElecState* pelec,
-                          const ModulePW::PW_Basis* rhopw,
-                          const ModulePW::PW_Basis_Big* bigpw,
+                          double** rho,
+                          const ModuleBase::matrix& wg,
+                          const std::vector<double>& ef_all_spin,
+                          const int rhopw_nrxx,
+                          const int rhopw_nplane,
+                          const int rhopw_startz_current,
+                          const int rhopw_nx,
+                          const int rhopw_ny,
+                          const int rhopw_nz,
+                          const int bigpw_bz,
+                          const int bigpw_nbz,
                           const bool gamma_only_local,
                           const int nbands_istate,
+                          const std::vector<int>& out_band_kb,
                           const int nbands,
                           const double nelec,
                           const int nspin,
@@ -41,17 +49,14 @@ void IState_Charge::begin(Gint_Gamma& gg,
         ModuleBase::WARNING_QUIT("IState_Charge::begin", "Only available for GAMMA_ONLY_LOCAL now.");
     }
 
-    // Get bands_to_print through public function of INPUT (returns a const pointer to string)
-    std::string bands_to_print = *INPUT.get_bands_to_print();
-
     int mode = 0;
-    if (nbands_istate > 0 && bands_to_print.empty())
+    if (nbands_istate > 0 && static_cast<int>(out_band_kb.size()) == 0)
     {
         mode = 1;
     }
-    else if (!bands_to_print.empty())
+    else if (static_cast<int>(out_band_kb.size()) > 0)
     {
-        // If bands_to_print is not empty, set mode to 2
+        // If out_band_kb (bands_to_print) is not empty, set mode to 2
         mode = 2;
         std::cout << " Notice: INPUT parameter `nbands_istate` overwritten by `bands_to_print`!" << std::endl;
     }
@@ -63,8 +68,6 @@ void IState_Charge::begin(Gint_Gamma& gg,
     int fermi_band = 0;
     int bands_below = 0;
     int bands_above = 0;
-    std::vector<double> out_band_kb;
-    Input_Conv::parse_expression(bands_to_print, out_band_kb);
 
     // (2) cicle:
     // (2.1) calculate the selected density matrix from wave functions.
@@ -124,14 +127,13 @@ void IState_Charge::begin(Gint_Gamma& gg,
                     "The elements of `bands_to_print` must be either 0 or 1. Invalid values found!");
             }
         }
-        // Fill bands_picked_ with values from out_band_kb, converting to int
+        // Fill bands_picked_ with values from out_band_kb
         // Remaining bands are already set to 0
         int length = std::min(static_cast<int>(out_band_kb.size()), nbands);
         for (int i = 0; i < length; ++i)
         {
             // out_band_kb rely on function parse_expression from input_conv.cpp
-            // Initially designed for ocp_set, which can be double
-            bands_picked_[i] = static_cast<int>(out_band_kb[i]);
+            bands_picked_[i] = out_band_kb[i];
         }
 
         std::cout << " Plot band decomposed charge density below the Fermi surface: band ";
@@ -197,44 +199,64 @@ void IState_Charge::begin(Gint_Gamma& gg,
             // band, whenever it is occupied or not.
 
 #ifdef __MPI
-            this->idmatrix(ib, pelec, nspin, nelec, nlocal);
+            this->idmatrix(ib, nspin, nelec, nlocal, wg);
 #endif
             // (2) zero out of charge density array.
             for (int is = 0; is < nspin; ++is)
             {
-                ModuleBase::GlobalFunc::ZEROS(pelec->charge->rho[is], rhopw->nrxx);
+                ModuleBase::GlobalFunc::ZEROS(rho[is], rhopw_nrxx);
             }
 
             // (3) calculate charge density for a particular
             // band.
-            Gint_inout inout(this->loc->DM, pelec->charge->rho, Gint_Tools::job_type::rho);
+            Gint_inout inout(this->loc->DM, rho, Gint_Tools::job_type::rho);
             gg.cal_gint(&inout);
-            pelec->charge->save_rho_before_sum_band(); // xiaohui add 2014-12-09
+
+            // A solution to replace the original implementation of the following code:
+            // pelec->charge->save_rho_before_sum_band();
+            double** rho_save = new double*[nspin]; // Initialize an array of pointers
+            for (int is = 0; is < nspin; is++)
+            {
+                rho_save[is] = new double[rhopw_nrxx]; // Allocate memory for each internal array
+                ModuleBase::GlobalFunc::DCOPY(rho[is], rho_save[is],
+                                              rhopw_nrxx); // Copy data after allocation
+            }
+
             std::stringstream ssc;
             ssc << global_out_dir << "BAND" << ib + 1;
             // 0 means definitely output charge density.
             for (int is = 0; is < nspin; ++is)
             {
                 ssc << "_SPIN" << is << "_CHG.cube";
-                const double ef_tmp = pelec->eferm.get_efval(is);
+
+                // Use a const vector to store efermi for all spins, replace the original implementation:
+                // const double ef_tmp = pelec->eferm.get_efval(is);
+                double ef_spin = ef_all_spin[is];
                 ModuleIO::write_rho(
 #ifdef __MPI
-                    bigpw->bz,
-                    bigpw->nbz,
-                    rhopw->nplane,
-                    rhopw->startz_current,
+                    bigpw_bz,
+                    bigpw_nbz,
+                    rhopw_nplane,
+                    rhopw_startz_current,
 #endif
-                    pelec->charge->rho_save[is],
+                    rho_save[is],
                     is,
                     nspin,
                     0,
                     ssc.str(),
-                    rhopw->nx,
-                    rhopw->ny,
-                    rhopw->nz,
-                    ef_tmp,
+                    rhopw_nx,
+                    rhopw_ny,
+                    rhopw_nz,
+                    ef_spin,
                     &(GlobalC::ucell));
             }
+
+            // Release memory of rho_save
+            for (int is = 0; is < nspin; is++)
+            {
+                delete[] rho_save[is]; // Release memory of each internal array
+            }
+            delete[] rho_save; // Release memory of the array of pointers
         }
     }
 
@@ -243,54 +265,48 @@ void IState_Charge::begin(Gint_Gamma& gg,
 
 #ifdef __MPI
 void IState_Charge::idmatrix(const int& ib,
-                             elecstate::ElecState* pelec,
                              const int nspin,
                              const double nelec,
-                             const int nlocal)
+                             const int nlocal,
+                             const ModuleBase::matrix& wg)
 {
     ModuleBase::TITLE("IState_Charge", "idmatrix");
+    assert(wg.nr == nspin);
 
-    assert(pelec->wg.nr == nspin);
-    for (int is = 0; is != nspin; ++is)
+    int fermi_band = static_cast<int>((nelec + 1) / 2 + 1.0e-8);
+
+    for (int is = 0; is < nspin; ++is)
     {
         std::vector<double> wg_local(this->loc->ParaV->ncol, 0.0);
         const int ib_local = this->loc->ParaV->global2local_col(ib);
 
-        int fermi_band = 0;
-        fermi_band = static_cast<int>((nelec + 1) / 2 + 1.0e-8);
-
         if (ib_local >= 0)
         {
-            if (ib < fermi_band)
-            {
-                wg_local[ib_local] = pelec->wg(is, ib);
-            }
-            else
-            {
-                wg_local[ib_local] = pelec->wg(is, fermi_band - 1);
-            } // unoccupied bands, use occupation of homo
+            // For unoccupied bands, use occupation of HOMO
+            wg_local[ib_local] = (ib < fermi_band) ? wg(is, ib) : wg(is, fermi_band - 1);
         }
 
-        // wg_wfc(ib,iw) = pelec->wg[ib] * wfc(ib,iw);
+        // wg_wfc(ib,iw) = wg[ib] * wfc(ib,iw);
         this->psi_gamma->fix_k(is);
-        psi::Psi<double> wg_wfc(this->psi_gamma[0], 1);
+        psi::Psi<double> wg_wfc(*this->psi_gamma, 1);
 
-        for (int ir = 0; ir != wg_wfc.get_nbands(); ++ir)
+        for (int ir = 0; ir < wg_wfc.get_nbands(); ++ir)
         {
             BlasConnector::scal(wg_wfc.get_nbasis(), wg_local[ir], wg_wfc.get_pointer() + ir * wg_wfc.get_nbasis(), 1);
         }
 
-        // C++: dm(iw1,iw2) = wfc(ib,iw1).T * wg_wfc(ib,iw2)
+        // dm(iw1,iw2) = wfc(ib,iw1).T * wg_wfc(ib,iw2)
         const double one_float = 1.0, zero_float = 0.0;
         const int one_int = 1;
         const char N_char = 'N', T_char = 'T';
+
         this->loc->dm_gamma.at(is).create(wg_wfc.get_nbands(), wg_wfc.get_nbasis());
 
         pdgemm_(&N_char,
                 &T_char,
                 &nlocal,
                 &nlocal,
-                &pelec->wg.nc,
+                &wg.nc,
                 &one_float,
                 wg_wfc.get_pointer(),
                 &one_int,
@@ -307,10 +323,8 @@ void IState_Charge::idmatrix(const int& ib,
                 this->loc->ParaV->desc);
     }
 
-    std::cout << " finished calc dm_2d : " << std::endl;
-
+    std::cout << " Finished calculating dm_2d." << std::endl;
     this->loc->cal_dk_gamma_from_2D_pub();
-
-    std::cout << " finished convert : " << std::endl;
+    std::cout << " Finished converting dm_2d to dk_gamma." << std::endl;
 }
 #endif
diff --git a/source/module_io/istate_charge.h b/source/module_io/istate_charge.h
index bb525aad8d..dfd0c7f41d 100644
--- a/source/module_io/istate_charge.h
+++ b/source/module_io/istate_charge.h
@@ -1,16 +1,15 @@
 #ifndef ISTATE_CHARGE_H
 #define ISTATE_CHARGE_H
-#include <module_base/complexmatrix.h>
-#include <module_base/matrix.h>
-
-#include <stdexcept>
-#include <vector>
-
 #include "module_basis/module_pw/pw_basis.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/local_orbital_charge.h"
 #include "module_hamilt_lcao/module_gint/gint_gamma.h"
 #include "module_psi/psi.h"
 
+#include <module_base/complexmatrix.h>
+#include <module_base/matrix.h>
+#include <stdexcept>
+#include <vector>
+
 /**
  * @brief Manages the computation of the charge density for different bands.
  *
@@ -31,11 +30,20 @@ class IState_Charge
     ~IState_Charge();
 
     void begin(Gint_Gamma& gg,
-               elecstate::ElecState* pelec,
-               const ModulePW::PW_Basis* rhopw,
-               const ModulePW::PW_Basis_Big* bigpw,
+               double** rho,
+               const ModuleBase::matrix& wg,
+               const std::vector<double>& ef_all_spin,
+               const int rhopw_nrxx,
+               const int rhopw_nplane,
+               const int rhopw_startz_current,
+               const int rhopw_nx,
+               const int rhopw_ny,
+               const int rhopw_nz,
+               const int bigpw_bz,
+               const int bigpw_nbz,
                const bool gamma_only_local,
                const int nbands_istate,
+               const std::vector<int>& out_band_kb,
                const int nbands,
                const double nelec,
                const int nspin,
@@ -50,8 +58,18 @@ class IState_Charge
 #ifdef __MPI
     /**
      * @brief Calculates the density matrix for a given band and spin.
+     *
+     * This method calculates the density matrix for a given band and spin using the wave function coefficients.
+     * It adjusts the coefficients based on the Fermi energy and performs a matrix multiplication to produce the density
+     * matrix.
+     *
+     * @param ib Band index.
+     * @param nspin Number of spin channels.
+     * @param nelec Total number of electrons.
+     * @param nlocal Number of local orbitals.
+     * @param wg Weight matrix for bands and spins.
      */
-    void idmatrix(const int& ib, elecstate::ElecState* pelec, const int nspin, const double nelec, const int nlocal);
+    void idmatrix(const int& ib, const int nspin, const double nelec, const int nlocal, const ModuleBase::matrix& wg);
 #endif
     psi::Psi<double>* psi_gamma;
     Local_Orbital_Charge* loc;

From 9fc5728fa179abf81a6941dd1984514db5e92e37 Mon Sep 17 00:00:00 2001
From: LUNASEA <33978601+maki49@users.noreply.github.com>
Date: Sat, 4 May 2024 17:55:36 +0800
Subject: [PATCH 15/18] Feature: write local-XC and EXX part of band(orbital)
 energy along with the XC matrix (#4013)

* write local-XC and EXX part of band(orbital) energy with XC matrix

* meet @mohanchen's requirements

* xc orbital energy: change to LibRPA format
---
 docs/advanced/input_files/input-main.md   |  2 +
 source/module_io/write_Vxc.hpp            | 94 +++++++++++++++++++----
 tests/integrate/207_NO_KP_OXC/INPUT       | 10 +++
 tests/integrate/207_NO_KP_OXC/KPT         |  2 +-
 tests/integrate/207_NO_KP_OXC/k-1-Vxc.ref | 12 +--
 tests/integrate/207_NO_KP_OXC/result.ref  |  6 +-
 tests/integrate/207_NO_KP_OXC/vxc_out.ref | 15 ++++
 tests/integrate/307_NO_GO_OXC/vxc_out.ref |  9 +++
 tests/integrate/tools/catch_properties.sh | 17 ++--
 9 files changed, 134 insertions(+), 33 deletions(-)
 create mode 100644 tests/integrate/207_NO_KP_OXC/vxc_out.ref
 create mode 100644 tests/integrate/307_NO_GO_OXC/vxc_out.ref

diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index 993e250478..167715485a 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -1605,6 +1605,8 @@ These variables are used to control the output of properties.
 - **Type**: Boolean
 - **Availability**: Numerical atomic orbital basis
 - **Description**: Whether to print the upper triangular part of the exchange-correlation matrices in **Kohn-Sham orbital representation** (unit: Ry): $\braket{\psi_i|V_\text{xc}^\text{(semi-)local}+V_\text{exx}+V_\text{DFTU}|\psi_j}$ for each k point into files in the directory `OUT.${suffix}`, which is useful for the subsequent GW calculation. (Note that currently DeePKS term is not included. ) The files are named `k-$k-Vxc`, the meaning of `$k`corresponding to k point and spin  is same as [hs_matrix.md](../elec_properties/hs_matrix.md#out_mat_hs).
+The band (KS orbital) energy for each (k-point, spin, band) will be printed in the file `OUT.${suffix}/vxc_out`. If EXX is calculated, the local and EXX part of band energy will also be printed in `OUT.${suffix}/vxc_local_out`and `OUT.${suffix}/vxc_exx_out`, respectively. All the `vxc*_out` files contains 3 integers (nk, nspin, nband) followed by nk\*nspin\*nband lines of energy Hartree and eV.
+```
 - **Default**: False
 
 ### out_hr_npz/out_dm_npz
diff --git a/source/module_io/write_Vxc.hpp b/source/module_io/write_Vxc.hpp
index b9f5c435b9..236ccafb77 100644
--- a/source/module_io/write_Vxc.hpp
+++ b/source/module_io/write_Vxc.hpp
@@ -149,6 +149,29 @@ namespace ModuleIO
         return e;
     }
 
+    template<typename T>
+    std::vector<double> orbital_energy(
+        const int ik,
+        const int nbands,
+        const std::vector<T>& mat_mo,
+        const Parallel_2D& p2d)
+    {
+#ifdef __DEBUG
+        assert(nbands >= 0);
+#endif
+        std::vector<double> e(nbands, 0.0);
+        for (int i = 0;i < nbands;++i)
+        {
+            if (p2d.in_this_processor(i, i))
+            {
+                const int index = p2d.global2local_col(i) * p2d.get_row_size() + p2d.global2local_row(i);
+                e[i] = get_real(mat_mo[index]);
+            }
+        }
+        Parallel_Reduce::reduce_all(e.data(), nbands);
+        return e;
+    }
+
     // mohan update 2024-04-01
     template <typename T>
 	void set_gint_pointer(
@@ -244,12 +267,13 @@ namespace ModuleIO
             GlobalV::CURRENT_SPIN = is; //caution: Veff::contributeHR depends on GlobalV::CURRENT_SPIN
             vxcs_op_ao[is]->contributeHR();
         }
+        std::vector<std::vector<double>> e_orb_locxc; // orbital energy (local XC)
+        std::vector<std::vector<double>> e_orb_tot; // orbital energy (total)
 #ifdef __EXX
         hamilt::OperatorEXX<hamilt::OperatorLCAO<TK, TR>> vexx_op_ao(&lm, nullptr, &vxc_k_ao, kv);
-        // ======test=======
-        // std::vector<TK> test_vexxonly_k_ao(pv->nloc);
-        // hamilt::OperatorEXX<hamilt::OperatorLCAO<TK, TR>> test_vexxonly_op_ao(&lm, nullptr, &test_vexxonly_k_ao, kv);
-        // ======test=======
+        std::vector<TK> vexxonly_k_ao(pv->nloc);
+        hamilt::OperatorEXX<hamilt::OperatorLCAO<TK, TR>> vexxonly_op_ao(&lm, nullptr, &vexxonly_k_ao, kv);
+        std::vector<std::vector<double>> e_orb_exx; // orbital energy (EXX)
 #endif
         hamilt::OperatorDFTU<hamilt::OperatorLCAO<TK, TR>> vdftu_op_ao(&lm, kv.kvec_d, nullptr, &vxc_k_ao, kv.isk);
 
@@ -266,29 +290,34 @@ namespace ModuleIO
             ModuleBase::GlobalFunc::ZEROS(vxc_k_ao.data(), pv->nloc);
             int is = GlobalV::CURRENT_SPIN = kv.isk[ik];
             dynamic_cast<hamilt::OperatorLCAO<TK, TR>*>(vxcs_op_ao[is])->contributeHk(ik);
+            const std::vector<TK>& vlocxc_k_mo = cVc(vxc_k_ao.data(), &psi(ik, 0, 0), nbasis, nbands, *pv, p2d);
+
 #ifdef __EXX
-			if (GlobalC::exx_info.info_global.cal_exx) 
-			{
-				vexx_op_ao.contributeHk(ik);
-			}
+            if (GlobalC::exx_info.info_global.cal_exx)
+            {
+                e_orb_locxc.emplace_back(orbital_energy(ik, nbands, vlocxc_k_mo, p2d));
+                ModuleBase::GlobalFunc::ZEROS(vexxonly_k_ao.data(), pv->nloc);
+                vexx_op_ao.contributeHk(ik);
+                vexxonly_op_ao.contributeHk(ik);
+                std::vector<TK> vexx_k_mo = cVc(vexxonly_k_ao.data(), &psi(ik, 0, 0), nbasis, nbands, *pv, p2d);
+                e_orb_exx.emplace_back(orbital_energy(ik, nbands, vexx_k_mo, p2d));
+            }
             // ======test=======
-            // ModuleBase::GlobalFunc::ZEROS(test_vexxonly_k_ao.data(), pv->nloc);
-            // if (GlobalC::exx_info.info_global.cal_exx) test_vexxonly_op_ao.contributeHk(ik);
-            // std::vector<TK> test_vexxonly_k_mo = cVc(test_vexxonly_k_ao.data(), &psi(ik, 0, 0), nbasis, nbands, *pv, p2d);
-            // exx_energy += all_band_energy(ik, test_vexxonly_k_mo, p2d, wg);
+            // exx_energy += all_band_energy(ik, vexx_k_mo, p2d, wg);
             // ======test=======
 #endif
 			if (GlobalV::dft_plus_u) 
 			{
 				vdftu_op_ao.contributeHk(ik);
 			}
-			std::vector<TK> vxc_k_mo = cVc(vxc_k_ao.data(), &psi(ik, 0, 0), nbasis, nbands, *pv, p2d);
+            const std::vector<TK>& vxc_tot_k_mo = cVc(vxc_k_ao.data(), &psi(ik, 0, 0), nbasis, nbands, *pv, p2d);
+            e_orb_tot.emplace_back(orbital_energy(ik, nbands, vxc_tot_k_mo, p2d));
             // write
-            ModuleIO::save_mat(-1, vxc_k_mo.data(), nbands,
+            ModuleIO::save_mat(-1, vxc_tot_k_mo.data(), nbands,
                 false/*binary*/, GlobalV::out_ndigits, true/*triangle*/, false/*append*/,
                 "Vxc", "k-" + std::to_string(ik), p2d, drank);
             // ======test=======
-            // total_energy += all_band_energy(ik, vxc_k_mo, p2d, wg);
+            // total_energy += all_band_energy(ik, vxc_tot_k_mo, p2d, wg);
             // ======test=======
         }
         // ======test=======
@@ -303,6 +332,39 @@ namespace ModuleIO
 		for (int is = 0;is < nspin0;++is) 
 		{
 			delete vxcs_op_ao[is];
-		}
+        }
+        // write the orbital energy for xc and exx in LibRPA format
+        auto write_orb_energy = [&kv, &nspin0, &nbands](const std::vector<std::vector<double>>& e_orb,
+            const std::string& label, const bool app = false)
+            {
+                assert(e_orb.size() == kv.nks);
+                const int nk = kv.nks / nspin0;
+                std::ofstream ofs;
+                ofs.open(GlobalV::global_out_dir + "vxc_" + (label == "" ? "out" : label + "_out"),
+                    app ? std::ios::app : std::ios::out);
+                ofs << nk << "\n" << nspin0 << "\n" << nbands << "\n";
+                ofs << std::scientific << std::setprecision(16);
+                for (int ik = 0;ik < nk;++ik)
+                {
+                    for (int is = 0;is < nspin0;++is)
+                    {
+                        for (auto e : e_orb[is * nk + ik])
+                        {   // Hartree and eV
+                            ofs << e / 2. << "\t" << e * ModuleBase::Ry_to_eV << "\n";
+                        }
+                    }
+                }
+            };
+        if (GlobalV::MY_RANK == 0)
+        {
+            write_orb_energy(e_orb_tot, "");
+#ifdef __EXX
+            if (GlobalC::exx_info.info_global.cal_exx)
+            {
+                write_orb_energy(e_orb_locxc, "local");
+                write_orb_energy(e_orb_exx, "exx");
+            }
+#endif
+        }
     }
 }
diff --git a/tests/integrate/207_NO_KP_OXC/INPUT b/tests/integrate/207_NO_KP_OXC/INPUT
index a97c26dd21..065e75cac6 100644
--- a/tests/integrate/207_NO_KP_OXC/INPUT
+++ b/tests/integrate/207_NO_KP_OXC/INPUT
@@ -28,3 +28,13 @@ mixing_beta		0.7
 out_mat_xc  1
 ks_solver       scalapack_gvx
 
+dft_functional hse
+exx_real_number                         1
+exx_pca_threshold                       1E-4
+exx_c_threshold                         1E-4
+exx_v_threshold                         1
+exx_dm_threshold                        1E-4
+exx_cauchy_threshold            1E-6
+exx_ccp_rmesh_times                     1
+exx_separate_loop                       1
+exx_hybrid_step                         3
\ No newline at end of file
diff --git a/tests/integrate/207_NO_KP_OXC/KPT b/tests/integrate/207_NO_KP_OXC/KPT
index f5f7f4ec34..e769af7638 100644
--- a/tests/integrate/207_NO_KP_OXC/KPT
+++ b/tests/integrate/207_NO_KP_OXC/KPT
@@ -1,4 +1,4 @@
 K_POINTS
 0
 Gamma
-2 2 2 0 0 0
+2 1 1 0 0 0
diff --git a/tests/integrate/207_NO_KP_OXC/k-1-Vxc.ref b/tests/integrate/207_NO_KP_OXC/k-1-Vxc.ref
index 6651d03e42..3b3b0dfe51 100644
--- a/tests/integrate/207_NO_KP_OXC/k-1-Vxc.ref
+++ b/tests/integrate/207_NO_KP_OXC/k-1-Vxc.ref
@@ -1,6 +1,6 @@
-6 (-0.805858,3.20475e-31) (-1.38778e-16,4.41729e-16) (-9.28077e-17,2.16937e-30) (1.03216e-16,1.97215e-31) (7.21645e-16,-6.64683e-16) (-1.04083e-17,-5.91646e-31)
- (-0.745176,-2.69938e-30) (-1.38778e-16,3.9443e-31) (2.77556e-17,-2.36658e-30) (-0.100899,-4.89558e-15) (6.93889e-17,7.09975e-30)
- (-0.827683,1.12166e-30) (-1.8735e-16,-2.21867e-30) (1.11022e-16,1.97215e-30) (4.996e-16,-1.19047e-16)
- (-0.827683,3.57453e-31) (3.1225e-17,-1.72563e-30) (-3.88578e-16,1.19825e-16)
- (-0.745849,3.20475e-31) (-5.38225e-17,7.39557e-31)
- (-0.739459,3.69779e-31)
+6 (-1.03906440e+00,-3.28984700e-16) (-4.79948026e-11,8.29474856e-16) (2.30389492e-13,1.00796921e-14) (7.41729594e-13,2.43532560e-17) (6.43712306e-11,5.70270843e-16) (3.52504822e-11,7.33979365e-13)
+ (-9.63737147e-01,1.26557062e-15) (6.37714326e-11,2.71427499e-12) (-8.30504138e-11,1.55361767e-14) (1.23037027e-01,5.96012135e-15) (2.41324322e-13,-2.41092682e-16)
+ (-1.06369885e+00,2.37908206e-16) (3.58341828e-14,-1.78915043e-15) (-1.26857316e-10,5.43607520e-12) (-1.09731307e-10,3.55419557e-12)
+ (-1.06369885e+00,2.36736345e-16) (1.56426261e-11,3.11153758e-14) (3.68115260e-11,-3.06158094e-13)
+ (-6.54572201e-01,1.11425211e-16) (5.19990845e-14,-3.84935899e-17)
+ (-6.50796154e-01,-5.40483680e-18)
diff --git a/tests/integrate/207_NO_KP_OXC/result.ref b/tests/integrate/207_NO_KP_OXC/result.ref
index 7d89999bce..505c1187ab 100644
--- a/tests/integrate/207_NO_KP_OXC/result.ref
+++ b/tests/integrate/207_NO_KP_OXC/result.ref
@@ -1,4 +1,4 @@
-etotref -211.4466153062836
-etotperatomref -105.7233076531
+etotref -212.5511698630333
+etotperatomref -106.2755849315
 CompareXC_pass 0
-totaltimeref 0.62879
+totaltimeref 19.88
\ No newline at end of file
diff --git a/tests/integrate/207_NO_KP_OXC/vxc_out.ref b/tests/integrate/207_NO_KP_OXC/vxc_out.ref
new file mode 100644
index 0000000000..4a4614ef7d
--- /dev/null
+++ b/tests/integrate/207_NO_KP_OXC/vxc_out.ref
@@ -0,0 +1,15 @@
+2
+1
+6
+-5.1240042976965061e-01	-1.3943131005032152e+01
+-5.4513974737959203e-01	-1.4834013541286041e+01
+-5.4513974737926330e-01	-1.4834013541277097e+01
+-5.2074993904080846e-01	-1.4170332808215299e+01
+-3.3346675058294384e-01	-9.0740958029457151e+00
+-3.3864852416264057e-01	-9.2150990958051811e+00
+-5.1953219929656025e-01	-1.4137196409809622e+01
+-4.8186857360940338e-01	-1.3112316576440625e+01
+-5.3184942295127013e-01	-1.4472365260298501e+01
+-5.3184942295091764e-01	-1.4472365260288909e+01
+-3.2728610048105333e-01	-8.9059116854857336e+00
+-3.2539807714783464e-01	-8.8545359349082791e+00
diff --git a/tests/integrate/307_NO_GO_OXC/vxc_out.ref b/tests/integrate/307_NO_GO_OXC/vxc_out.ref
new file mode 100644
index 0000000000..ed845fbec7
--- /dev/null
+++ b/tests/integrate/307_NO_GO_OXC/vxc_out.ref
@@ -0,0 +1,9 @@
+1
+1
+6
+-3.4234536094979806e-01	-9.3156951855678916e+00
+-3.5746496058679345e-01	-9.7271205986516289e+00
+-2.9661198231693892e-01	-8.0712261091712225e+00
+-2.9661198231693947e-01	-8.0712261091712385e+00
+-2.9661198231693903e-01	-8.0712261091712261e+00
+-3.0978260393545448e-01	-8.4296171095988104e+00
diff --git a/tests/integrate/tools/catch_properties.sh b/tests/integrate/tools/catch_properties.sh
index e6936c6b03..a348f822e8 100755
--- a/tests/integrate/tools/catch_properties.sh
+++ b/tests/integrate/tools/catch_properties.sh
@@ -215,13 +215,16 @@ fi
 
 if ! test -z "$has_xc"  && [  $has_xc == 1 ]; then
 	if ! test -z "$gamma_only"  && [ $gamma_only == 1 ]; then
-                xcref=k-0-Vxc.ref
-                xccal=OUT.autotest/k-0-Vxc
-        else
-                xcref=k-1-Vxc.ref
-                xccal=OUT.autotest/k-1-Vxc
-        fi
-        python3 ../tools/CompareFile.py $xcref $xccal 4
+			xcref=k-0-Vxc.ref
+			xccal=OUT.autotest/k-0-Vxc
+	else
+			xcref=k-1-Vxc.ref
+			xccal=OUT.autotest/k-1-Vxc
+	fi
+	oeref=vxc_out.ref
+	oecal=OUT.autotest/vxc_out
+	python3 ../tools/CompareFile.py $xcref $xccal 4
+	python3 ../tools/CompareFile.py $oeref $oecal 6
     echo "CompareXC_pass $?" >>$1
 fi
 

From a2a6165fca50a1ae0daa8611321576f4d04a7d96 Mon Sep 17 00:00:00 2001
From: Chun Cai <amoycaic@gmail.com>
Date: Sun, 5 May 2024 09:43:01 +0800
Subject: [PATCH 16/18] CI: update runner config (#4088)

* CI: update runner config

New runners with ARM arch is added into deepmodeling organization. We need to manually set the tests runs on x64 arch.

* let image builds on github-hosted runner

* Revert #4065
---
 .github/workflows/coverage.yml     | 4 ++--
 .github/workflows/devcontainer.yml | 2 +-
 .github/workflows/dynamic.yml      | 2 +-
 .github/workflows/image.yml        | 2 +-
 .github/workflows/test.yml         | 2 +-
 Dockerfile.intel                   | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index c552a0b9e4..678012eef3 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -8,8 +8,8 @@ on:
 jobs:
   test-coverage:
     name: Generate Coverage Report
-    runs-on: self-hosted
-    container: ghcr.io/deepmodeling/abacus-development-kit:gnu
+    runs-on: X64
+    container: ghcr.io/deepmodeling/abacus-gnu
     steps:
       - name: Checkout
         uses: actions/checkout@v4
diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml
index 597283930b..723b008214 100644
--- a/.github/workflows/devcontainer.yml
+++ b/.github/workflows/devcontainer.yml
@@ -7,7 +7,7 @@ on:
 
 jobs:
   build_container_and_push:
-    runs-on: self-hosted
+    runs-on: ubuntu-latest
     if: github.repository_owner == 'deepmodeling'
     strategy:
       matrix:
diff --git a/.github/workflows/dynamic.yml b/.github/workflows/dynamic.yml
index b0d271c4f9..85cb13482e 100644
--- a/.github/workflows/dynamic.yml
+++ b/.github/workflows/dynamic.yml
@@ -8,7 +8,7 @@ on:
 jobs:
   test:
     name: Dynamic analysis
-    runs-on: self-hosted
+    runs-on: X64
     if: github.repository_owner == 'deepmodeling'
     container: ghcr.io/deepmodeling/abacus-gnu
     steps:
diff --git a/.github/workflows/image.yml b/.github/workflows/image.yml
index d4836ddd06..9203a72f6c 100644
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -7,7 +7,7 @@ on:
       - 'v*'
 jobs:
   build_container_and_push:
-    runs-on: self-hosted
+    runs-on: ubuntu-latest
     if: github.repository_owner == 'deepmodeling'
     steps:
       - name: Checkout
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index d1a3e1ef0d..dff5f21aa8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -10,7 +10,7 @@ concurrency:
 jobs:
   test:
     name: Test
-    runs-on: self-hosted
+    runs-on: X64
     if: github.repository_owner == 'deepmodeling'
     container:
       image: ghcr.io/deepmodeling/abacus-gnu
diff --git a/Dockerfile.intel b/Dockerfile.intel
index 269713a16c..9628cc5b97 100644
--- a/Dockerfile.intel
+++ b/Dockerfile.intel
@@ -16,7 +16,7 @@ RUN apt-get update && \
         intel-oneapi-compiler-dpcpp-cpp \
         intel-oneapi-compiler-fortran \
         intel-oneapi-mkl-devel \
-        intel-oneapi-mpi-devel="2021.11.*" \
+        intel-oneapi-mpi-devel \
         intel-oneapi-vtune
 ENV I_MPI_ROOT=/opt/intel/oneapi/mpi/latest \
     LIBRARY_PATH=/opt/intel/oneapi/tbb/latest/env/../lib/intel64/gcc4.8:/opt/intel/oneapi/mpi/latest/lib:/opt/intel/oneapi/mkl/latest/lib/:/opt/intel/oneapi/ippcp/latest/lib/:/opt/intel/oneapi/ipp/latest/lib:/opt/intel/oneapi/dpl/latest/lib:/opt/intel/oneapi/dnnl/latest/lib:/opt/intel/oneapi/dal/latest/lib:/opt/intel/oneapi/compiler/latest/lib:/opt/intel/oneapi/ccl/latest/lib/ \

From 3f2e20a7c8f34a3db3e16410aaf93054579f5574 Mon Sep 17 00:00:00 2001
From: Chun Cai <amoycaic@gmail.com>
Date: Sun, 5 May 2024 16:13:17 +0800
Subject: [PATCH 17/18] CI: build images with tags on self-hosted runner
 (#4089)

* CI: build images with tags on self-hosted runner

Remove the outdated runner config. Now the abacus-* series images are built with a tag on release.

Use self-hosted runner to build them to avoid AVX512 availability problem.

* update outdated action

* add comments
---
 .github/workflows/devcontainer.yml | 23 +++++++++---
 .github/workflows/image.yml        | 50 --------------------------
 .github/workflows/performance.yml  | 56 ++----------------------------
 Dockerfile                         | 41 ----------------------
 Dockerfile.gnu                     | 18 ++++++++--
 5 files changed, 35 insertions(+), 153 deletions(-)
 delete mode 100644 .github/workflows/image.yml
 delete mode 100644 Dockerfile

diff --git a/.github/workflows/devcontainer.yml b/.github/workflows/devcontainer.yml
index 723b008214..6d2113efb6 100644
--- a/.github/workflows/devcontainer.yml
+++ b/.github/workflows/devcontainer.yml
@@ -4,10 +4,13 @@ on:
   push:
     branches:
       - develop
+    tags:
+      - 'v*'
+  workflow_dispatch:
 
 jobs:
   build_container_and_push:
-    runs-on: ubuntu-latest
+    runs-on: X64
     if: github.repository_owner == 'deepmodeling'
     strategy:
       matrix:
@@ -16,6 +19,17 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v4
 
+      - name: Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: |
+            ghcr.io/deepmodeling/abacus-${{ matrix.dockerfile }}
+            registry.dp.tech/deepmodeling/abacus-${{ matrix.dockerfile }}
+          tags: |
+            type=semver,pattern={{version}},enable=${{ github.ref_type == 'tag' }}
+            type=raw,value=latest
+
       - name: Setup Docker Buildx
         uses: docker/setup-buildx-action@v3
 
@@ -36,10 +50,9 @@ jobs:
       - name: Build and Push Container
         uses: docker/build-push-action@v5
         with:
-          tags: |
-            ghcr.io/deepmodeling/abacus-${{ matrix.dockerfile }}:latest
-            registry.dp.tech/deepmodeling/abacus-${{ matrix.dockerfile }}:latest
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
           file: Dockerfile.${{ matrix.dockerfile }}
+          push: true
           cache-from: type=registry,ref=ghcr.io/deepmodeling/abacus-${{ matrix.dockerfile }}:latest
           cache-to: type=inline
-          push: true
diff --git a/.github/workflows/image.yml b/.github/workflows/image.yml
deleted file mode 100644
index 9203a72f6c..0000000000
--- a/.github/workflows/image.yml
+++ /dev/null
@@ -1,50 +0,0 @@
-name: Build Image
-
-on:
-  workflow_dispatch:
-  push:
-    tags:
-      - 'v*'
-jobs:
-  build_container_and_push:
-    runs-on: ubuntu-latest
-    if: github.repository_owner == 'deepmodeling'
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Docker meta
-        id: meta
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            ghcr.io/deepmodeling/abacus
-            registry.dp.tech/deepmodeling/abacus
-          tags: |
-            type=semver,pattern={{version}}
-            type=raw,value=latest,enable=${{ github.event_name == 'workflow_dispatch' }}
-
-      - name: Setup Docker Buildx
-        uses: docker/setup-buildx-action@v3
-
-      - name: Login to GitHub Container Registry
-        uses: docker/login-action@v3
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Login to Aliyun Registry
-        uses: docker/login-action@v3
-        with:
-          registry: registry.dp.tech
-          username: ${{ secrets.DP_HARBOR_USERNAME }}
-          password: ${{ secrets.DP_HARBOR_PASSWORD }}
-
-      - name: Build and Push Container
-        uses: docker/build-push-action@v5
-        with:
-          tags: ${{ steps.meta.outputs.tags }}
-          file: Dockerfile
-          push: true
-          labels: ${{ steps.meta.outputs.labels }}
diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml
index 4acfd2991c..589a85e584 100644
--- a/.github/workflows/performance.yml
+++ b/.github/workflows/performance.yml
@@ -4,34 +4,9 @@ on:
   workflow_dispatch:
 
 jobs:
-  start-runner:
-    name: Start self-hosted EC2 runner
-    runs-on: ubuntu-latest
-    outputs:
-      label: ${{ steps.start-ec2-runner.outputs.label }}
-      ec2-instance-id: ${{ steps.start-ec2-runner.outputs.ec2-instance-id }}
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: us-east-2
-      - name: Start EC2 runner
-        id: start-ec2-runner
-        uses: machulav/ec2-github-runner@v2
-        with:
-          mode: start
-          github-token: ${{ secrets.PAT }}
-          ec2-image-id: ami-04cd9fec4a7a39019
-          ec2-instance-type: c5.9xlarge
-          subnet-id: subnet-72d3e53e
-          security-group-id: sg-06b0c93122c08aeab
-
   test:
     name: Performance test
-    needs: start-runner
-    runs-on: ${{ needs.start-runner.outputs.label }}
+    runs-on: X64
     strategy:
       matrix:
         tag: ["gnu", "intel"]
@@ -43,39 +18,12 @@ jobs:
       - name: Install Requirements
         run: |
           apt install -y time
-      - name: Build
-        run: |
-          cmake -B build -DENABLE_LIBXC=ON
-          cmake --build build -j`nproc`
-          cmake --install build
       - name: Test
         run: |
-          test -e /opt/intel/oneapi/setvars.sh && . /opt/intel/oneapi/setvars.sh
+          . /opt/intel/oneapi/setvars.sh || :
           cd tests/performance/
           bash run.sh
       - name: Show Result
         if: always()
         run: |
           cat tests/performance/sumall.dat
-
-  stop-runner:
-    name: Stop self-hosted EC2 runner
-    needs:
-      - start-runner # required to get output from the start-runner job
-      - test # required to wait when the main job is done
-    runs-on: ubuntu-latest
-    if: ${{ always() }} # required to stop the runner even if the error happened in the previous jobs
-    steps:
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
-          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
-          aws-region: us-east-2
-      - name: Stop EC2 runner
-        uses: machulav/ec2-github-runner@v2
-        with:
-          mode: stop
-          github-token: ${{ secrets.PAT }}
-          label: ${{ needs.start-runner.outputs.label }}
-          ec2-instance-id: ${{ needs.start-runner.outputs.ec2-instance-id }}
diff --git a/Dockerfile b/Dockerfile
deleted file mode 100644
index 68c47048ac..0000000000
--- a/Dockerfile
+++ /dev/null
@@ -1,41 +0,0 @@
-# To build this Dockerfile, run `docker build -t abacus - < Dockerfile`.
-# Build without cloning the repo by `docker build https://github.com/deepmodeling/abacus-develop.git#develop`,
-#   and optionally choose the Dockerfile in use by appending e.g. `-f Dockerfile.gnu`.
-# Alternatively, pull the image with `docker pull ghcr.io/deepmodeling/abacus:latest`.
-# Also available at `docker pull registry.dp.tech/deepmodeling/abacus:latest`.
-
-# Docker images are aimed for evaluating ABACUS.
-# For production use, please compile ABACUS from source code and run in bare-metal for a better performace.
-
-FROM ubuntu:22.04
-RUN apt update && apt install -y --no-install-recommends \
-    libopenblas-openmp-dev liblapack-dev libscalapack-mpi-dev libelpa-dev libfftw3-dev libcereal-dev libxc-dev \
-    g++ make cmake bc time sudo vim git
-# If you wish to use the LLVM compiler, replace 'g++' above with 'clang libomp-dev'.
-
-ENV GIT_SSL_NO_VERIFY=true TERM=xterm-256color \
-    OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 OMPI_MCA_btl_vader_single_copy_mechanism=none
-
-# This will fetch the latest commit info, and store in docker building cache.
-# If there are newer commits, docker build will ignore the cache and build latest codes.
-ADD https://api.github.com/repos/deepmodeling/abacus-develop/git/refs/heads/develop /dev/null
-
-RUN git clone https://github.com/deepmodeling/abacus-develop.git --depth 1 && \
-    cd abacus-develop && \
-    cmake -B build && \
-    cmake --build build -j`nproc` && \
-    cmake --install build && \
-    rm -rf build && \
-    cd .. 
-    #&& rm -rf abacus-develop
-# If you have trouble cloning repo, replace "github.com" with "gitee.com".
-CMD mpirun --use-hwthread-cpus abacus
-
-# To run ABACUS built by this image with all available threads, execute `docker run -v <host>:<wd> -w <wd/input> abacus:latest`.
-# Replace '<host>' with the path to all files(including pseudopotential files), '<wd>' with a path to working directory, and '<wd/input>' with the path to input folder(containing 'INPUT', 'STRU', etc.).
-# e.g. after cloning the repo to `$HOME` and pulling image, execute `docker run -v ~/abacus-develop/tests/integrate:/workspace -w /workspace/101_PW_15_f_pseudopots abacus:latest`.
-# To run ABACUS with a given MPI process number, execute `docker run -v <host>:<wd> -w <wd/input> -it --entrypoint mpirun abacus:latest -np <processes> abacus`.
-# Note: It would be better using all available CPUs. Docker uses CFS to share the CPU resources, which will result in bad CPU affinity.
-
-# To use this image as developing environment, execute `docker run -it --entrypoint /bin/bash abacus`.
-# Please refer to https://docs.docker.com/engine/reference/commandline/run/ for more details.
diff --git a/Dockerfile.gnu b/Dockerfile.gnu
index c8316479c9..23b44f1e4c 100644
--- a/Dockerfile.gnu
+++ b/Dockerfile.gnu
@@ -1,11 +1,22 @@
+# To build this Dockerfile, run `docker build -t abacus - < Dockerfile.gnu`.
+# Build without cloning the repo by `docker build https://github.com/deepmodeling/abacus-develop.git#develop -f Dockerfile.intel`,
+# Alternatively, pull the image with `docker pull ghcr.io/deepmodeling/abacus-intel:latest`.
+# Also available at `docker pull registry.dp.tech/deepmodeling/abacus-intel:latest`.
+# Available image names: abacus-gnu, abacus-intel, abacus-cuda
+
+# Docker images are aimed for evaluating ABACUS.
+# For production use, please compile ABACUS from source code and run in bare-metal for a better performace.
+
 FROM ubuntu:22.04
 RUN apt update && apt install -y --no-install-recommends \
     libopenblas-openmp-dev liblapack-dev libscalapack-mpi-dev libelpa-dev libfftw3-dev libcereal-dev \
     libxc-dev libgtest-dev libgmock-dev libbenchmark-dev python3-numpy \
     bc cmake git g++ make bc time sudo unzip vim wget gfortran
+    # If you wish to use the LLVM compiler, replace 'g++' above with 'clang libomp-dev'.
 
 ENV GIT_SSL_NO_VERIFY=true TERM=xterm-256color \
     OMPI_ALLOW_RUN_AS_ROOT=1 OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1 OMPI_MCA_btl_vader_single_copy_mechanism=none
+    # The above environment variables are for using OpenMPI in Docker.
 
 RUN git clone https://github.com/llohse/libnpy.git && \
     cp libnpy/include/npy.hpp /usr/local/include && \
@@ -13,17 +24,18 @@ RUN git clone https://github.com/llohse/libnpy.git && \
 
 RUN wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.0.0%2Bcpu.zip \
         --no-check-certificate --quiet -O libtorch.zip && \
-    unzip -q libtorch.zip -d /opt  && rm libtorch.zip
+    unzip -q libtorch.zip -d /opt && rm libtorch.zip
 
 ENV CMAKE_PREFIX_PATH=/opt/libtorch/share/cmake
 
 ADD https://api.github.com/repos/deepmodeling/abacus-develop/git/refs/heads/develop /dev/null
+    # This will fetch the latest commit info, and store in docker building cache.
+    # If there are newer commits, docker build will ignore the cache and build latest codes.
 
 RUN git clone https://github.com/deepmodeling/abacus-develop.git --depth 1 && \
     cd abacus-develop && \
     cmake -B build -DENABLE_DEEPKS=ON -DENABLE_LIBXC=ON -DENABLE_LIBRI=ON -DENABLE_RAPIDJSON=ON && \
     cmake --build build -j`nproc` && \
     cmake --install build && \
-    rm -rf build && \
-    cd .. 
+    rm -rf build
     #&& rm -rf abacus-develop

From 96e8c59597f4e397b33c6be3c9d0ac8b3f285b44 Mon Sep 17 00:00:00 2001
From: Taoni Bao <baotaoni@pku.edu.cn>
Date: Sun, 5 May 2024 16:14:34 +0800
Subject: [PATCH 18/18] Add a warning quit when parameters do not match in
 `read_wfc_nao.cpp` (#4087)

* Add a warning quit when param do not match in reading wfc nao

* Do nearly nothing, just to restart integration test

* Added Doxygen-style comments to the CTOT2q function and made some other relevant changes

* Added const qualifiers where applicable and initialized variables at declaration

---------

Co-authored-by: Mohan Chen <mohan.chen.chen.mohan@gmail.com>
---
 source/module_io/read_wfc_nao.cpp | 633 +++++++++++++++++-------------
 1 file changed, 351 insertions(+), 282 deletions(-)

diff --git a/source/module_io/read_wfc_nao.cpp b/source/module_io/read_wfc_nao.cpp
index c55e14044d..4e940874d4 100644
--- a/source/module_io/read_wfc_nao.cpp
+++ b/source/module_io/read_wfc_nao.cpp
@@ -1,85 +1,126 @@
 #include "read_wfc_nao.h"
+
 #include "module_base/parallel_common.h"
 #include "module_base/timer.h"
 
-inline int CTOT2q(
-	int myid,
-	int naroc[2],
-	int nb,
-	int dim0,
-	int dim1,
-	int iprow,
-	int ipcol,
-    const int nbands,
-	double* work,
-	double** CTOT)
+/**
+ * @brief Extracts a submatrix from a global orbital coefficient matrix and stores it in a linear array.
+ *        This function is designed for real-valued matrices.
+ *
+ * @param myid The rank of the current MPI process.
+ * @param naroc Array with two elements: number of rows and columns this process is responsible for.
+ * @param nb Block size used in the global index calculation.
+ * @param dim0 Global number of rows in the matrix.
+ * @param dim1 Global number of columns in the matrix.
+ * @param iprow The row index in the process grid.
+ * @param ipcol The column index in the process grid.
+ * @param nbands Total number of bands (relevant matrix columns) involved in the computation.
+ * @param work Output array where the submatrix will be stored linearly.
+ * @param CTOT Global matrix from which the submatrix is extracted.
+ * @return int Always returns 0 as a success indicator.
+ */
+inline int CTOT2q(const int myid,
+                  const int naroc[2],
+                  const int nb,
+                  const int dim0,
+                  const int dim1,
+                  const int iprow,
+                  const int ipcol,
+                  const int nbands,
+                  double* work,
+                  double** const CTOT)
 {
-    for(int j=0; j<naroc[1]; ++j)
+    for (int j = 0; j < naroc[1]; ++j)
     {
-        int igcol=Local_Orbital_wfc::globalIndex(j, nb, dim1, ipcol);
-        if(igcol>=nbands) continue;
-        for(int i=0; i<naroc[0]; ++i)
+        int igcol = Local_Orbital_wfc::globalIndex(j, nb, dim1, ipcol);
+        if (igcol >= nbands)
+        {
+            continue;
+        }
+        for (int i = 0; i < naroc[0]; ++i)
         {
-            int igrow=Local_Orbital_wfc::globalIndex(i, nb, dim0, iprow);
-			//GlobalV::ofs_running << "i,j,igcol,igrow" << i<<" "<<j<<" "<<igcol<<" "<<igrow<<std::endl;
-            if(myid==0) work[j*naroc[0]+i]=CTOT[igcol][igrow];
+            int igrow = Local_Orbital_wfc::globalIndex(i, nb, dim0, iprow);
+            // GlobalV::ofs_running << "i,j,igcol,igrow" << i << " " << j << " " << igcol << " " << igrow << std::endl;
+            if (myid == 0)
+            {
+                work[j * naroc[0] + i] = CTOT[igcol][igrow];
+            }
         }
     }
     return 0;
 }
 
-inline int CTOT2q_c(
-	int myid,
-	int naroc[2],
-	int nb,
-	int dim0,
-	int dim1,
-	int iprow,
-	int ipcol,
-    const int nbands,
-	std::complex<double>* work,
-	std::complex<double>** CTOT)
+/**
+ * @brief Extracts a submatrix from a global orbital coefficient matrix and stores it in a linear array.
+ *        This function is designed for complex-valued matrices.
+ *
+ * @param myid The rank of the current MPI process.
+ * @param naroc Array with two elements: number of rows and columns this process is responsible for.
+ * @param nb Block size used in the global index calculation.
+ * @param dim0 Global number of rows in the matrix.
+ * @param dim1 Global number of columns in the matrix.
+ * @param iprow The row index in the process grid.
+ * @param ipcol The column index in the process grid.
+ * @param nbands Total number of bands (relevant matrix columns) involved in the computation.
+ * @param work Output array where the submatrix will be stored linearly.
+ * @param CTOT Global matrix from which the submatrix is extracted.
+ * @return int Always returns 0 as a success indicator.
+ */
+inline int CTOT2q_c(const int myid,
+                    const int naroc[2],
+                    const int nb,
+                    const int dim0,
+                    const int dim1,
+                    const int iprow,
+                    const int ipcol,
+                    const int nbands,
+                    std::complex<double>* work,
+                    std::complex<double>** const CTOT)
 {
-    for(int j=0; j<naroc[1]; ++j)
+    for (int j = 0; j < naroc[1]; ++j)
     {
-        int igcol=Local_Orbital_wfc::globalIndex(j, nb, dim1, ipcol);
-        if(igcol>=nbands) continue;
-        for(int i=0; i<naroc[0]; ++i)
+        int igcol = Local_Orbital_wfc::globalIndex(j, nb, dim1, ipcol);
+        if (igcol >= nbands)
         {
-            int igrow=Local_Orbital_wfc::globalIndex(i, nb, dim0, iprow);
-			//ofs_running << "i,j,igcol,igrow" << i<<" "<<j<<" "<<igcol<<" "<<igrow<<std::endl;
-            if(myid==0) work[j*naroc[0]+i]=CTOT[igcol][igrow];
+            continue;
+        }
+        for (int i = 0; i < naroc[0]; ++i)
+        {
+            int igrow = Local_Orbital_wfc::globalIndex(i, nb, dim0, iprow);
+            // ofs_running << "i,j,igcol,igrow" << i << " " << j << " " << igcol << " " << igrow << std::endl;
+            if (myid == 0)
+            {
+                work[j * naroc[0] + i] = CTOT[igcol][igrow];
+            }
         }
     }
     return 0;
 }
 
 // be called in local_orbital_wfc::allocate_k
-int ModuleIO::read_wfc_nao_complex(
-    std::complex<double>** ctot, 
-    const int& ik,
-    const int& nb2d,
-    const int& nbands_g,
-    const int& nlocal_g,
-    const std::string& global_readin_dir,
-    const ModuleBase::Vector3<double> kvec_c,
-    const Parallel_Orbitals* ParaV, 
-    psi::Psi<std::complex<double>>* psi, 
-    elecstate::ElecState* pelec)
+int ModuleIO::read_wfc_nao_complex(std::complex<double>** ctot,
+                                   const int& ik,
+                                   const int& nb2d,
+                                   const int& nbands_g,
+                                   const int& nlocal_g,
+                                   const std::string& global_readin_dir,
+                                   const ModuleBase::Vector3<double> kvec_c,
+                                   const Parallel_Orbitals* ParaV,
+                                   psi::Psi<std::complex<double>>* psi,
+                                   elecstate::ElecState* pelec)
 {
-    ModuleBase::TITLE("ModuleIO","read_wfc_nao_complex");
-    ModuleBase::timer::tick("ModuleIO","read_wfc_nao_complex");
+    ModuleBase::TITLE("ModuleIO", "read_wfc_nao_complex");
+    ModuleBase::timer::tick("ModuleIO", "read_wfc_nao_complex");
 
     std::stringstream ss;
-	// read wave functions
-    ss << global_readin_dir << "LOWF_K_" << ik+1 <<".txt";
-//	std::cout << " name is = " << ss.str() << std::endl;
+    // read wave functions
+    ss << global_readin_dir << "LOWF_K_" << ik + 1 << ".txt";
+    //	std::cout << " name is = " << ss.str() << std::endl;
 
     std::ifstream ifs;
-
     int error = 0;
 
-    if (GlobalV::DRANK==0)
+    if (GlobalV::DRANK == 0)
     {
         ifs.open(ss.str().c_str());
         if (!ifs)
@@ -93,39 +134,39 @@ int ModuleIO::read_wfc_nao_complex(
     Parallel_Common::bcast_int(error);
 #endif
 
-    if (error==1) return 1;
+    if (error == 1)
+    {
+        return 1;
+    }
 
     // otherwise, find the file.
 
-    if (GlobalV::MY_RANK==0)
+    if (GlobalV::MY_RANK == 0)
     {
-		int ikr;
-		double kx,ky,kz;
-        int nbands, nlocal;
-		ModuleBase::GlobalFunc::READ_VALUE(ifs, ikr);
-		ifs >> kx >> ky >> kz;
+        int ikr = 0;
+        double kx = 0.0, ky = 0.0, kz = 0.0;
+        int nbands = 0, nlocal = 0;
+        ModuleBase::GlobalFunc::READ_VALUE(ifs, ikr);
+        ifs >> kx >> ky >> kz;
         ModuleBase::GlobalFunc::READ_VALUE(ifs, nbands);
         ModuleBase::GlobalFunc::READ_VALUE(ifs, nlocal);
 
-		if(ikr!=ik+1)
-		{
-			GlobalV::ofs_warning << " ikr=" << ikr << " ik=" << ik << std::endl;
-			GlobalV::ofs_warning << " k index is not correct" << std::endl;
-			error = 4;
-		}
-		else if ( 
-			std::abs(kx-kvec_c.x)>1.0e-5 ||
-			std::abs(ky-kvec_c.y)>1.0e-5 ||
-			std::abs(kz-kvec_c.z)>1.0e-5 )
-		{	
-			GlobalV::ofs_warning << " k std::vector is not correct" << std::endl;
-			GlobalV::ofs_warning << " Read in kx=" << kx << " ky = " << ky << " kz = " << kz << std::endl;
-			GlobalV::ofs_warning << " In fact, kx=" << kvec_c.x 
-			 << " ky=" << kvec_c.y
-			 << " kz=" << kvec_c.z << std::endl;
-			 error = 4; 
-		}
-        else if (nbands!=nbands_g)
+        if (ikr != ik + 1)
+        {
+            GlobalV::ofs_warning << " ikr=" << ikr << " ik=" << ik << std::endl;
+            GlobalV::ofs_warning << " k index is not correct" << std::endl;
+            error = 4;
+        }
+        else if (std::abs(kx - kvec_c.x) > 1.0e-5 || std::abs(ky - kvec_c.y) > 1.0e-5
+                 || std::abs(kz - kvec_c.z) > 1.0e-5)
+        {
+            GlobalV::ofs_warning << " k std::vector is not correct" << std::endl;
+            GlobalV::ofs_warning << " Read in kx=" << kx << " ky=" << ky << " kz=" << kz << std::endl;
+            GlobalV::ofs_warning << " In fact, kx=" << kvec_c.x << " ky=" << kvec_c.y << " kz=" << kvec_c.z
+                                 << std::endl;
+            error = 4;
+        }
+        else if (nbands != nbands_g)
         {
             GlobalV::ofs_warning << " read in nbands=" << nbands;
             GlobalV::ofs_warning << " NBANDS=" << nbands_g << std::endl;
@@ -139,29 +180,29 @@ int ModuleIO::read_wfc_nao_complex(
         }
 
         ctot = new std::complex<double>*[nbands_g];
-        for (int i=0; i<nbands_g; i++)
+        for (int i = 0; i < nbands_g; i++)
         {
             ctot[i] = new std::complex<double>[nlocal_g];
         }
 
-        for (int i=0; i<nbands_g; ++i)
+        for (int i = 0; i < nbands_g; ++i)
         {
-            int ib;
+            int ib = 0;
             ModuleBase::GlobalFunc::READ_VALUE(ifs, ib);
-			ib -= 1; // because in C++, ib should start from 0
-			//------------------------------------------------
-			// read the eigenvalues!
-			// very important to determine the occupations.
-			//------------------------------------------------
-			ModuleBase::GlobalFunc::READ_VALUE(ifs, pelec->ekb(ik, ib));
-			ModuleBase::GlobalFunc::READ_VALUE(ifs, pelec->wg(ik,ib));
-            assert( i==ib );
-			double a, b;
-            for (int j=0; j<nlocal_g; ++j)
+            ib -= 1; // because in C++, ib should start from 0
+            //------------------------------------------------
+            // read the eigenvalues!
+            // very important to determine the occupations.
+            //------------------------------------------------
+            ModuleBase::GlobalFunc::READ_VALUE(ifs, pelec->ekb(ik, ib));
+            ModuleBase::GlobalFunc::READ_VALUE(ifs, pelec->wg(ik, ib));
+            assert(i == ib);
+            double a = 0.0, b = 0.0;
+            for (int j = 0; j < nlocal_g; ++j)
             {
                 ifs >> a >> b;
-				ctot[i][j]=std::complex<double>(a,b);
-				//std::cout << ctot[i][j] << " " << std::endl;
+                ctot[i][j] = std::complex<double>(a, b);
+                // std::cout << ctot[i][j] << " " << std::endl;
             }
         }
     }
@@ -170,78 +211,86 @@ int ModuleIO::read_wfc_nao_complex(
     Parallel_Common::bcast_int(error);
     Parallel_Common::bcast_double(&pelec->wg.c[ik * pelec->wg.nc], pelec->wg.nc);
 #endif
-	if(error==2) return 2;
-	if(error==3) return 3;
-	if(error==4) return 4;
-	
-	ModuleIO::distri_wfc_nao_complex(ctot, ik, nb2d, nbands_g, ParaV, psi);
-	
-	// mohan add 2012-02-15,
-	// still have bugs, but can solve it later.
-	// distribute the wave functions again.
-
-    if (GlobalV::DRANK==0)
+
+    if (error == 2)
+    {
+        return 2;
+    }
+    if (error == 3)
+    {
+        return 3;
+    }
+    if (error == 4)
+    {
+        return 4;
+    }
+
+    ModuleIO::distri_wfc_nao_complex(ctot, ik, nb2d, nbands_g, ParaV, psi);
+
+    // mohan add 2012-02-15,
+    // still have bugs, but can solve it later.
+    // distribute the wave functions again.
+
+    if (GlobalV::DRANK == 0)
     {
         // delte the ctot
-        for (int i=0; i<nbands_g; i++)
+        for (int i = 0; i < nbands_g; i++)
         {
             delete[] ctot[i];
         }
         delete[] ctot;
     }
 
-	//---------
-	// TEST
-	//---------
-	/*
-	for(int i=0; i<nbands_g; ++i)
-	{
-		std::cout << " c band i=" << i+1 << std::endl;
-		for(int j=0; j<nlocal_g; ++j)
-		{
-			std::cout << " " << c[i][j];
-		}
-		std::cout << std::endl;
-	}
-	*/
-
-
-    ModuleBase::timer::tick("ModuleIO","read_wfc_nao_complex");
-	return 0;
+    //---------
+    // TEST
+    //---------
+    /*
+    for(int i=0; i<nbands_g; ++i)
+    {
+        std::cout << " c band i=" << i+1 << std::endl;
+        for(int j=0; j<nlocal_g; ++j)
+        {
+            std::cout << " " << c[i][j];
+        }
+        std::cout << std::endl;
+    }
+    */
+
+    ModuleBase::timer::tick("ModuleIO", "read_wfc_nao_complex");
+    return 0;
 }
 
-int ModuleIO::read_wfc_nao(
-    double** ctot, 
-    const int& is,
-    const bool& gamma_only_local,
-    const int& nb2d,
-    const int& nbands_g,
-    const int& nlocal_g,
-    const std::string& global_readin_dir,
-    const Parallel_Orbitals* ParaV, 
-    psi::Psi<double>* psid, 
-    elecstate::ElecState* pelec)
+int ModuleIO::read_wfc_nao(double** ctot,
+                           const int& is,
+                           const bool& gamma_only_local,
+                           const int& nb2d,
+                           const int& nbands_g,
+                           const int& nlocal_g,
+                           const std::string& global_readin_dir,
+                           const Parallel_Orbitals* ParaV,
+                           psi::Psi<double>* psid,
+                           elecstate::ElecState* pelec)
 {
-    ModuleBase::TITLE("ModuleIO","read_wfc_nao");
+    ModuleBase::TITLE("ModuleIO", "read_wfc_nao");
     ModuleBase::timer::tick("ModuleIO", "read_wfc_nao");
-    
+
     std::stringstream ss;
-	if(GlobalV::GAMMA_ONLY_LOCAL)
-	{
-		// read wave functions
-    	ss << global_readin_dir << "LOWF_GAMMA_S" << is+1 <<".txt";
-		std::cout << " name is = " << ss.str() << std::endl;
-	}
-	else
-	{
-		ss << global_readin_dir << "LOWF_K.txt";
-	}
+    if (GlobalV::GAMMA_ONLY_LOCAL)
+    {
+        // read wave functions
+        ss << global_readin_dir << "LOWF_GAMMA_S" << is + 1 << ".txt";
+        std::cout << " name is = " << ss.str() << std::endl;
+    }
+    else
+    {
+        ss << global_readin_dir << "LOWF_K.txt";
+    }
 
     std::ifstream ifs;
 
     int error = 0;
 
-    if (GlobalV::DRANK==0)
+    if (GlobalV::DRANK == 0)
     {
         ifs.open(ss.str().c_str());
         if (!ifs)
@@ -255,217 +304,237 @@ int ModuleIO::read_wfc_nao(
     Parallel_Common::bcast_int(error);
 #endif
 
-    if (error==1) return 1;
+    if (error == 1)
+    {
+        return 1;
+    }
 
     // otherwise, find the file.
 
-    if (GlobalV::MY_RANK==0)
+    if (GlobalV::MY_RANK == 0)
     {
-        int nbands, nlocal;
+        int nbands = 0, nlocal = 0;
         ModuleBase::GlobalFunc::READ_VALUE(ifs, nbands);
         ModuleBase::GlobalFunc::READ_VALUE(ifs, nlocal);
 
-        if (nbands!=nbands_g)
+        if (nbands != nbands_g)
         {
             GlobalV::ofs_warning << " read in nbands=" << nbands;
             GlobalV::ofs_warning << " NBANDS=" << nbands_g << std::endl;
             error = 2;
+            ModuleBase::WARNING_QUIT("ModuleIO::read_wfc_nao",
+                                     "The nbands read in from file do not match the global parameter NBANDS!");
         }
         else if (nlocal != nlocal_g)
         {
             GlobalV::ofs_warning << " read in nlocal=" << nlocal;
             GlobalV::ofs_warning << " NLOCAL=" << nlocal_g << std::endl;
             error = 3;
+            ModuleBase::WARNING_QUIT("ModuleIO::read_wfc_nao",
+                                     "The nlocal read in from file do not match the global parameter NLOCAL!");
         }
 
         ctot = new double*[nbands_g];
-        for (int i=0; i<nbands_g; i++)
+        for (int i = 0; i < nbands_g; i++)
         {
             ctot[i] = new double[nlocal_g];
         }
 
-		//std::cout << "nbands" << nbands_g << std::endl;
-        for (int i=0; i<nbands_g; i++)
+        for (int i = 0; i < nbands_g; i++)
         {
-            int ib;
+            int ib = 0;
             ModuleBase::GlobalFunc::READ_VALUE(ifs, ib);
             ModuleBase::GlobalFunc::READ_VALUE(ifs, pelec->ekb(is, i));
             ModuleBase::GlobalFunc::READ_VALUE(ifs, pelec->wg(is, i));
-            assert( (i+1)==ib);
-			//std::cout << " ib=" << ib << std::endl;
-            for (int j=0; j<nlocal_g; j++)
+            assert((i + 1) == ib);
+            // std::cout << " ib=" << ib << std::endl;
+            for (int j = 0; j < nlocal_g; j++)
             {
                 ifs >> ctot[i][j];
-				//std::cout << ctot[i][j] << " ";
+                // std::cout << ctot[i][j] << " ";
             }
-			//std::cout << std::endl;
+            // std::cout << std::endl;
         }
     }
 
-
 #ifdef __MPI
     Parallel_Common::bcast_int(error);
-	Parallel_Common::bcast_double( &(pelec->ekb(is, 0)), nbands_g);
-	Parallel_Common::bcast_double( &(pelec->wg(is, 0)), nbands_g);
+    Parallel_Common::bcast_double(&(pelec->ekb(is, 0)), nbands_g);
+    Parallel_Common::bcast_double(&(pelec->wg(is, 0)), nbands_g);
 #endif
-	if(error==2) return 2;
-	if(error==3) return 3;
+    if (error == 2)
+    {
+        return 2;
+    }
+    if (error == 3)
+    {
+        return 3;
+    }
+
+    ModuleIO::distri_wfc_nao(ctot, is, nb2d, nbands_g, nlocal_g, ParaV, psid);
 
-	ModuleIO::distri_wfc_nao(ctot, is, nb2d, nbands_g, nlocal_g,ParaV, psid);
-	
-	// mohan add 2012-02-15,
-	// still have bugs, but can solve it later.
-	// distribute the wave functions again.
-	// mohan comment out 2021-02-09
+    // mohan add 2012-02-15,
+    // still have bugs, but can solve it later.
+    // distribute the wave functions again.
+    // mohan comment out 2021-02-09
 
-    if (GlobalV::MY_RANK==0)
+    if (GlobalV::MY_RANK == 0)
     {
         // delte the ctot
-        for (int i=0; i<nbands_g; i++)
+        for (int i = 0; i < nbands_g; i++)
         {
             delete[] ctot[i];
         }
         delete[] ctot;
     }
 
-    ModuleBase::timer::tick("ModuleIO","read_wfc_nao");
+    ModuleBase::timer::tick("ModuleIO", "read_wfc_nao");
     return 0;
 }
 
-void ModuleIO::distri_wfc_nao(double** ctot, const int& is, const int& nb2d, const int& nbands_g,
-                              const int& nlocal_g, const Parallel_Orbitals* ParaV, psi::Psi<double>* psid)
+void ModuleIO::distri_wfc_nao(double** ctot,
+                              const int& is,
+                              const int& nb2d,
+                              const int& nbands_g,
+                              const int& nlocal_g,
+                              const Parallel_Orbitals* ParaV,
+                              psi::Psi<double>* psid)
 {
-    ModuleBase::TITLE("ModuleIO","distri_wfc_nao");
+    ModuleBase::TITLE("ModuleIO", "distri_wfc_nao");
 #ifdef __MPI
 
-//1. alloc work array; set some parameters
+    // 1. alloc work array; set some parameters
 
-	long maxnloc; // maximum number of elements in local matrix
+    long maxnloc = 0; // maximum number of elements in local matrix
     MPI_Reduce(&ParaV->nloc_wfc, &maxnloc, 1, MPI_LONG, MPI_MAX, 0, ParaV->comm_2D);
-	MPI_Bcast(&maxnloc, 1, MPI_LONG, 0, ParaV->comm_2D);
-	//reduce and bcast could be replaced by allreduce
-	
-    int nprocs, myid;
+    MPI_Bcast(&maxnloc, 1, MPI_LONG, 0, ParaV->comm_2D);
+    // reduce and bcast could be replaced by allreduce
+
+    int nprocs = 0, myid = 0;
     MPI_Comm_size(ParaV->comm_2D, &nprocs);
     MPI_Comm_rank(ParaV->comm_2D, &myid);
 
-	double *work=new double[maxnloc]; // work/buffer matrix
+    double* work = new double[maxnloc]; // work/buffer matrix
 #ifdef __DEBUG
-assert(nb2d > 0);
-#endif	
-	int nb = nb2d;
-	int info;
-	int naroc[2]; // maximum number of row or column
-	
-//2. copy from ctot to psi
-	for(int iprow=0; iprow<ParaV->dim0; ++iprow)
-	{
-		for(int ipcol=0; ipcol<ParaV->dim1; ++ipcol)
-		{
-//2.1 get and bcast local 2d matrix info
-			const int coord[2]={iprow, ipcol};
-			int src_rank;
-			MPI_Cart_rank(ParaV->comm_2D, coord, &src_rank);
-			if(myid==src_rank)
-			{
-				naroc[0]=ParaV->nrow;
-				naroc[1]=ParaV->ncol;
-			}
-			info=MPI_Bcast(naroc, 2, MPI_INT, src_rank, ParaV->comm_2D);
-
-//2.2 copy from ctot to work, then bcast work
-			info=CTOT2q(myid, naroc, nb, ParaV->dim0, ParaV->dim1, iprow, ipcol, nbands_g, work, ctot);
-			info=MPI_Bcast(work, maxnloc, MPI_DOUBLE, 0, ParaV->comm_2D);
-			//GlobalV::ofs_running << "iprow, ipcow : " << iprow << ipcol << std::endl;
-			//for (int i=0; i<maxnloc; ++i)
-			//{
-				//GlobalV::ofs_running << *(work+i)<<" ";
-			//}
-			//GlobalV::ofs_running << std::endl;
-//2.3 copy from work to psi
-			const int inc=1;
-			if(myid==src_rank)
-			{
-                BlasConnector::copy(ParaV->nloc_wfc, work, inc, &(psid[0](is, 0, 0)), inc);
-			}
-		}//loop ipcol
-	}//loop	iprow
+    assert(nb2d > 0);
+#endif
+    const int nb = nb2d;
+    int info = 0;
+    int naroc[2]; // maximum number of row or column
 
-	delete[] work;
-#else
-        for (int i=0; i<nbands_g; i++)
+    // 2. copy from ctot to psi
+    for (int iprow = 0; iprow < ParaV->dim0; ++iprow)
+    {
+        for (int ipcol = 0; ipcol < ParaV->dim1; ++ipcol)
         {
-            for (int j=0; j<nlocal_g; j++)
+            // 2.1 get and bcast local 2d matrix info
+            const int coord[2] = {iprow, ipcol};
+            int src_rank = 0;
+            MPI_Cart_rank(ParaV->comm_2D, coord, &src_rank);
+            if (myid == src_rank)
             {
-               psid[0](is, i, j) = ctot[i][j];
+                naroc[0] = ParaV->nrow;
+                naroc[1] = ParaV->ncol;
             }
+            info = MPI_Bcast(naroc, 2, MPI_INT, src_rank, ParaV->comm_2D);
+
+            // 2.2 copy from nctot to work, then bcast work
+            info = CTOT2q(myid, naroc, nb, ParaV->dim0, ParaV->dim1, iprow, ipcol, nbands_g, work, ctot);
+            info = MPI_Bcast(work, maxnloc, MPI_DOUBLE, 0, ParaV->comm_2D);
+            // GlobalV::ofs_running << "iprow, ipcow : " << iprow << ipcol << std::endl;
+            // for (int i=0; i<maxnloc; ++i)
+            //{
+            // GlobalV::ofs_running << *(work+i)<<" ";
+            //}
+            // GlobalV::ofs_running << std::endl;
+            // 2.3 copy from work to psi
+            const int inc = 1;
+            if (myid == src_rank)
+            {
+                BlasConnector::copy(ParaV->nloc_wfc, work, inc, &(psid[0](is, 0, 0)), inc);
+            }
+        } // loop ipcol
+    }     // loop	iprow
+
+    delete[] work;
+#else
+    for (int i = 0; i < nbands_g; i++)
+    {
+        for (int j = 0; j < nlocal_g; j++)
+        {
+            psid[0](is, i, j) = ctot[i][j];
         }
+    }
 #endif
     return;
 }
 
-void ModuleIO::distri_wfc_nao_complex(std::complex<double>** ctot, const int& ik, const int& nb2d,
-         const int& nbands_g, const Parallel_Orbitals* ParaV, psi::Psi<std::complex<double>>* psi)
+void ModuleIO::distri_wfc_nao_complex(std::complex<double>** ctot,
+                                      const int& ik,
+                                      const int& nb2d,
+                                      const int& nbands_g,
+                                      const Parallel_Orbitals* ParaV,
+                                      psi::Psi<std::complex<double>>* psi)
 {
-    ModuleBase::TITLE("ModuleIO","distri_wfc_nao_complex");
+    ModuleBase::TITLE("ModuleIO", "distri_wfc_nao_complex");
 #ifdef __MPI
 
-//1. alloc work array; set some parameters
+    // 1. alloc work array; set some parameters
+
+    long maxnloc = 0; // maximum number of elements in local matrix
+    MPI_Reduce(&ParaV->nloc_wfc, &maxnloc, 1, MPI_LONG, MPI_MAX, 0, ParaV->comm_2D);
+    MPI_Bcast(&maxnloc, 1, MPI_LONG, 0, ParaV->comm_2D);
+    // reduce and bcast could be replaced by allreduce
 
-	long maxnloc; // maximum number of elements in local matrix
-	MPI_Reduce(&ParaV->nloc_wfc, &maxnloc, 1, MPI_LONG, MPI_MAX, 0, ParaV->comm_2D);
-	MPI_Bcast(&maxnloc, 1, MPI_LONG, 0, ParaV->comm_2D);
-	//reduce and bcast could be replaced by allreduce
-	
-    int nprocs, myid;
+    int nprocs = 0, myid = 0;
     MPI_Comm_size(ParaV->comm_2D, &nprocs);
     MPI_Comm_rank(ParaV->comm_2D, &myid);
 
-	std::complex<double> *work=new std::complex<double>[maxnloc]; // work/buffer matrix
+    std::complex<double>* work = new std::complex<double>[maxnloc]; // work/buffer matrix
 #ifdef __DEBUG
-assert(nb2d > 0);
-#endif	
-	int nb = nb2d;
-	int info;
-	int naroc[2]; // maximum number of row or column
-	
-//2. copy from ctot to psi
-	for(int iprow=0; iprow<ParaV->dim0; ++iprow)
-	{
-		for(int ipcol=0; ipcol<ParaV->dim1; ++ipcol)
-		{
-//2.1 get and bcast local 2d matrix info
-			const int coord[2]={iprow, ipcol};
-			int src_rank;
-			MPI_Cart_rank(ParaV->comm_2D, coord, &src_rank);
-			if(myid==src_rank)
-			{
-				naroc[0]=ParaV->nrow;
-				naroc[1]=ParaV->ncol_bands;
-			}
-			info=MPI_Bcast(naroc, 2, MPI_INT, src_rank, ParaV->comm_2D);
-
-//2.2 copy from ctot to work, then bcast work
-			info=CTOT2q_c(myid, naroc, nb, ParaV->dim0, ParaV->dim1, iprow, ipcol, nbands_g, work, ctot);
-			info=MPI_Bcast(work, maxnloc, MPI_DOUBLE_COMPLEX, 0, ParaV->comm_2D);
-			//ofs_running << "iprow, ipcow : " << iprow << ipcol << std::endl;
-			//for (int i=0; i<maxnloc; ++i)
-			//{
-				//ofs_running << *(work+i)<<" ";
-			//}
-			//ofs_running << std::endl;
-//2.3 copy from work to psi_k
+    assert(nb2d > 0);
+#endif
+    const int nb = nb2d;
+    int info = 0;
+    int naroc[2]; // maximum number of row or column
+
+    // 2. copy from ctot to psi
+    for (int iprow = 0; iprow < ParaV->dim0; ++iprow)
+    {
+        for (int ipcol = 0; ipcol < ParaV->dim1; ++ipcol)
+        {
+            // 2.1 get and bcast local 2d matrix info
+            const int coord[2] = {iprow, ipcol};
+            int src_rank = 0;
+            MPI_Cart_rank(ParaV->comm_2D, coord, &src_rank);
+            if (myid == src_rank)
+            {
+                naroc[0] = ParaV->nrow;
+                naroc[1] = ParaV->ncol_bands;
+            }
+            info = MPI_Bcast(naroc, 2, MPI_INT, src_rank, ParaV->comm_2D);
+
+            // 2.2 copy from ctot to work, then bcast work
+            info = CTOT2q_c(myid, naroc, nb, ParaV->dim0, ParaV->dim1, iprow, ipcol, nbands_g, work, ctot);
+            info = MPI_Bcast(work, maxnloc, MPI_DOUBLE_COMPLEX, 0, ParaV->comm_2D);
+            // ofs_running << "iprow, ipcow : " << iprow << ipcol << std::endl;
+            // for (int i=0; i<maxnloc; ++i)
+            //{
+            // ofs_running << *(work+i)<<" ";
+            //}
+            // ofs_running << std::endl;
+            // 2.3 copy from work to psi_k
             const int inc = 1;
-			if(myid==src_rank)
-			{
-				BlasConnector::copy(ParaV->nloc_wfc, work, inc, &(psi[0](ik, 0, 0)), inc);
-			}
-		}//loop ipcol
-	}//loop	iprow
-
-	delete[] work;
+            if (myid == src_rank)
+            {
+                BlasConnector::copy(ParaV->nloc_wfc, work, inc, &(psi[0](ik, 0, 0)), inc);
+            }
+        } // loop ipcol
+    }     // loop	iprow
+
+    delete[] work;
 #else
-	ModuleBase::WARNING_QUIT("ModuleIO::distri_wfc_nao","check the code without MPI.");
+    ModuleBase::WARNING_QUIT("ModuleIO::distri_wfc_nao", "check the code without MPI.");
 #endif
     return;
 }