diff --git a/source/source_estate/module_dm/density_matrix.cpp b/source/source_estate/module_dm/density_matrix.cpp
index 7414e00fd3b..f18d068db73 100644
--- a/source/source_estate/module_dm/density_matrix.cpp
+++ b/source/source_estate/module_dm/density_matrix.cpp
@@ -655,7 +655,7 @@ void DensityMatrix_Tools::func_xyz_to_updown<double>(const std::complex<double>
 {
     target_DMR_mat[icol + step_trace[0]] = tmp[0].real() + tmp[3].real();  // rho_0 = (rho_upup + rho_downdown).real()
     target_DMR_mat[icol + step_trace[1]] = tmp[1].real() + tmp[2].real();  // rho_x = (rho_updown + rho_downup).real()
-    target_DMR_mat[icol + step_trace[2]] = -tmp[1].imag() + tmp[2].imag(); // rho_y = (i * (rho_updown - rho_downup)).real()
+    target_DMR_mat[icol + step_trace[2]] = tmp[1].imag() - tmp[2].imag(); // rho_y = Im(rho_updown - rho_downup)
     target_DMR_mat[icol + step_trace[3]] = tmp[0].real() - tmp[3].real();  // rho_z = (rho_upup - rho_downdown).real()
 }
 
@@ -664,7 +664,7 @@ void DensityMatrix_Tools::func_xyz_to_updown<std::complex<double>>(const std::co
 {
     target_DMR_mat[icol + step_trace[0]] = tmp[0] + tmp[3];                                         // rho_0 = (rho_upup + rho_downdown)
     target_DMR_mat[icol + step_trace[1]] = tmp[1] + tmp[2];                                         // rho_x = (rho_updown + rho_downup)
-    target_DMR_mat[icol + step_trace[2]] = ModuleBase::IMAG_UNIT * (tmp[1].imag() - tmp[2].imag()); // rho_y = (i * (rho_updown - rho_downup))
+    target_DMR_mat[icol + step_trace[2]] = -ModuleBase::IMAG_UNIT * (tmp[1] - tmp[2]); // rho_y = -i*(rho_updown - rho_downup)
     target_DMR_mat[icol + step_trace[3]] = tmp[0] - tmp[3];                                         // rho_z = (rho_upup - rho_downdown)
 }
 
diff --git a/source/source_hsolver/module_genelpa/elpa_new.cpp b/source/source_hsolver/module_genelpa/elpa_new.cpp
index d0454821907..dc590c71320 100644
--- a/source/source_hsolver/module_genelpa/elpa_new.cpp
+++ b/source/source_hsolver/module_genelpa/elpa_new.cpp
@@ -94,6 +94,10 @@ ELPA_Solver::ELPA_Solver(const bool isReal,
     elpa_set_integer(NEW_ELPA_HANDLE_POOL[handle_id], "mpi_comm_parent", MPI_Comm_c2f(comm), &error);
     elpa_set_integer(NEW_ELPA_HANDLE_POOL[handle_id], "process_row", myprow, &error);
     elpa_set_integer(NEW_ELPA_HANDLE_POOL[handle_id], "process_col", mypcol, &error);
+    // blacs_context is required by ELPA for internal MPI operations
+    // (e.g. MPI_Bcast in complex Cholesky/invert_triangular);
+    // previously missing in this constructor but present in the otherParameter one
+    elpa_set_integer(NEW_ELPA_HANDLE_POOL[handle_id], "blacs_context", cblacs_ctxt, &error);
 
     error = elpa_setup(NEW_ELPA_HANDLE_POOL[handle_id]);
     // cout<<"elpa handle is setup\n";
diff --git a/source/source_lcao/module_gint/gint_common.cpp b/source/source_lcao/module_gint/gint_common.cpp
index 987fcbe5a0e..1de8969fd0d 100644
--- a/source/source_lcao/module_gint/gint_common.cpp
+++ b/source/source_lcao/module_gint/gint_common.cpp
@@ -168,8 +168,15 @@ void merge_hr_part_to_hR(const std::vector<hamilt::HContainer<double>>& hr_gint_
     std::vector<int> row_set = {0, 0, 1, 1};
     std::vector<int> col_set = {0, 1, 0, 1};
     //construct complex matrix
+    // Pauli-to-spinor conversion: H = V_0*I + B_x*sigma_x + B_y*sigma_y + B_z*sigma_z
+    // sigma_y = [[0,-i],[i,0]], so H_{up,down} = B_x - i*B_y, H_{down,up} = B_x + i*B_y
+    // coefficient = clx_i + i*clx_j for each Pauli channel:
+    //   is=0 (up,up):   V_0 + B_z   => coeff on B_z = +1  => clx_i=1,  clx_j=0
+    //   is=1 (up,down): B_x - i*B_y => coeff on B_y = -i  => clx_i=0,  clx_j=-1
+    //   is=2 (down,up): B_x + i*B_y => coeff on B_y = +i  => clx_i=0,  clx_j=+1
+    //   is=3 (down,down): -(V_0 - B_z) => coeff on V_0 = -1 => clx_i=-1, clx_j=0
     std::vector<int> clx_i = {1, 0, 0, -1};
-    std::vector<int> clx_j = {0, 1, -1, 0};
+    std::vector<int> clx_j = {0, -1, 1, 0};
     for (int is = 0; is < 4; is++){
         if(!PARAM.globalv.domag && (is==1 || is==2)) continue;
         hR_tmp->set_zero();
@@ -203,9 +210,10 @@ void merge_hr_part_to_hR(const std::vector<hamilt::HContainer<double>>& hr_gint_
                             + std::complex<double>(clx_i[is], clx_j[is]) * mat_nspin2->get_value(irow, icol);
                         }
                     }
-                    //fill the lower triangle matrix
-                    //When is=0 or 3, the real part does not need conjugation; 
-                    //when is=1 or 2, the small matrix is not Hermitian, so conjugation is not needed
+                    //fill the lower triangle matrix at -R by conjugate transpose of upper at R
+                    // This ensures H(-R) = H(R)^dagger, required for Hermiticity of H(k).
+                    // For real matrices (is=0,3), conj has no effect.
+                    // For complex matrices (is=1,2), conj is essential.
                     if (iat1 < iat2)
                     {
                         auto lower_mat = lower_ap->find_matrix(-R_index);
@@ -213,7 +221,7 @@ void merge_hr_part_to_hR(const std::vector<hamilt::HContainer<double>>& hr_gint_
                         {
                             for (int icol = 0; icol < upper_mat->get_col_size(); ++icol)
                             {
-                                lower_mat->get_value(icol, irow) = upper_mat->get_value(irow, icol);
+                                lower_mat->get_value(icol, irow) = std::conj(upper_mat->get_value(irow, icol));
                             }
                         }
                     }
diff --git a/source/source_lcao/module_operator_lcao/dftu_force_stress.hpp b/source/source_lcao/module_operator_lcao/dftu_force_stress.hpp
index 38c96025fa5..e9a546c2b97 100644
--- a/source/source_lcao/module_operator_lcao/dftu_force_stress.hpp
+++ b/source/source_lcao/module_operator_lcao/dftu_force_stress.hpp
@@ -146,13 +146,6 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
         std::vector<double> VU(occ.size());
         double eu_tmp = 0;
         this->cal_v_of_u(occ, tlp1, u_value, &VU[0], eu_tmp);
-        if(this->nspin == 4) 
-        {
-            for (int i = 0; i < VU.size(); i++)
-            {
-                VU[i] /= 2.0;
-            }
-        }
 
         // second iteration to calculate force and stress
         // calculate Force for atom J
@@ -242,12 +235,14 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
     if (cal_force)
     {
 #ifdef __MPI
-        // sum up the occupation matrix
         Parallel_Reduce::reduce_all(force.c, force.nr * force.nc);
 #endif
-        for (int i = 0; i < force.nr * force.nc; i++)
+        if (this->nspin != 4)
         {
-            force.c[i] *= 2.0;
+            for (int i = 0; i < force.nr * force.nc; i++)
+            {
+                force.c[i] *= 2.0;
+            }
         }
     }
 
diff --git a/source/source_lcao/module_operator_lcao/dftu_lcao.cpp b/source/source_lcao/module_operator_lcao/dftu_lcao.cpp
index 2b8e4f3506f..92fac0ffabd 100644
--- a/source/source_lcao/module_operator_lcao/dftu_lcao.cpp
+++ b/source/source_lcao/module_operator_lcao/dftu_lcao.cpp
@@ -612,7 +612,14 @@ void hamilt::DFTU<hamilt::OperatorLCAO<std::complex<double>, std::complex<double
     assert(vu.size() == vu_tmp.size());
 #endif
 
-    // TR == std::complex<double> transfer from double to std::complex<double>
+    // Pauli-to-spinor conversion for DFT+U potential:
+    // V = V_0*I + V_x*sigma_x + V_y*sigma_y + V_z*sigma_z
+    // sigma_y = [[0,-i],[i,0]], so:
+    //   V_{up,up}   = 0.5*(V_0 + V_z)
+    //   V_{down,down} = 0.5*(V_0 - V_z)
+    //   V_{up,down}  = 0.5*(V_x - i*V_y)  <- note: minus sign from sigma_y
+    //   V_{down,up}  = 0.5*(V_x + i*V_y)  <- note: plus sign from sigma_y
+    // This is consistent with the convention in gint_common.cpp merge_hr_part_to_hR().
     const int m_size = int(sqrt(vu.size()) / 2);
     const int m_size2 = m_size * m_size;
     vu.resize(vu_tmp.size());
@@ -627,9 +634,8 @@ void hamilt::DFTU<hamilt::OperatorLCAO<std::complex<double>, std::complex<double
             index[3] = m2 * m_size + m1 + m_size2 * 3;
             vu[index[0]] = 0.5 * (vu_tmp[index[0]] + vu_tmp[index[3]]);
             vu[index[3]] = 0.5 * (vu_tmp[index[0]] - vu_tmp[index[3]]);
-            // vu should be std::complex<double> type, but here we use double type for test
-            vu[index[1]] = 0.5 * (vu_tmp[index[1]] + std::complex<double>(0.0, 1.0) * vu_tmp[index[2]]);
-            vu[index[2]] = 0.5 * (vu_tmp[index[1]] - std::complex<double>(0.0, 1.0) * vu_tmp[index[2]]);
+            vu[index[1]] = 0.5 * (vu_tmp[index[1]] - std::complex<double>(0.0, 1.0) * vu_tmp[index[2]]);
+            vu[index[2]] = 0.5 * (vu_tmp[index[1]] + std::complex<double>(0.0, 1.0) * vu_tmp[index[2]]);
         }
     }
 }
diff --git a/source/source_lcao/module_operator_lcao/dspin_force_stress.hpp b/source/source_lcao/module_operator_lcao/dspin_force_stress.hpp
index 955c6b4d267..1fe8812e9b9 100644
--- a/source/source_lcao/module_operator_lcao/dspin_force_stress.hpp
+++ b/source/source_lcao/module_operator_lcao/dspin_force_stress.hpp
@@ -212,12 +212,14 @@ void DeltaSpin<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
     if (cal_force)
     {
 #ifdef __MPI
-        // sum up the occupation matrix
         Parallel_Reduce::reduce_all(force.c, force.nr * force.nc);
 #endif
-        for (int i = 0; i < force.nr * force.nc; i++)
+        if (this->nspin != 4)
         {
-            force.c[i] *= 2.0;
+            for (int i = 0; i < force.nr * force.nc; i++)
+            {
+                force.c[i] *= 2.0;
+            }
         }
     }
 
diff --git a/tests/03_NAO_multik/scf_angle_spin4/result.ref b/tests/03_NAO_multik/scf_angle_spin4/result.ref
index 1a6409ad032..e1656f8f8ce 100644
--- a/tests/03_NAO_multik/scf_angle_spin4/result.ref
+++ b/tests/03_NAO_multik/scf_angle_spin4/result.ref
@@ -1,5 +1,5 @@
-etotref -6267.4651888505040915
-etotperatomref -3133.7325944253
+etotref -6267.4651896196382950
+etotperatomref -3133.7325948098
 totalforceref 0.000000
-totalstressref 3912.920415
-totaltimeref 1.24
+totalstressref 3912.920437
+totaltimeref 15.08
diff --git a/tests/03_NAO_multik/scf_angle_spin4/threshold b/tests/03_NAO_multik/scf_angle_spin4/threshold
new file mode 100644
index 00000000000..33e27fb6c5f
--- /dev/null
+++ b/tests/03_NAO_multik/scf_angle_spin4/threshold
@@ -0,0 +1 @@
+threshold 0.00001
diff --git a/tests/03_NAO_multik/scf_out_dos_spin4/result.ref b/tests/03_NAO_multik/scf_out_dos_spin4/result.ref
index d126df468a6..e54b51d3064 100644
--- a/tests/03_NAO_multik/scf_out_dos_spin4/result.ref
+++ b/tests/03_NAO_multik/scf_out_dos_spin4/result.ref
@@ -1,6 +1,6 @@
-etotref -1964.0663947982975515
+etotref -1964.0663947982770878
 etotperatomref -982.0331973991
-totalforceref 0.162298
-totalstressref 1877.059021
+totalforceref 0.162158
+totalstressref 1877.059089
 totaldosref 38
-totaltimeref 7.05
+totaltimeref 16.23
diff --git a/tests/03_NAO_multik/scf_u_spin4/result.ref b/tests/03_NAO_multik/scf_u_spin4/result.ref
index 74a7a7e4d72..bdf978fbbe7 100644
--- a/tests/03_NAO_multik/scf_u_spin4/result.ref
+++ b/tests/03_NAO_multik/scf_u_spin4/result.ref
@@ -1,5 +1,5 @@
-etotref -6789.2817503491569369
-etotperatomref -3394.6408751746
-totalforceref 11.335196
-totalstressref 4892.274915
-totaltimeref 4.70
+etotref -6789.2816406266510967
+etotperatomref -3394.6408203133
+totalforceref 11.331534
+totalstressref 4697.832232
+totaltimeref 9.71