diff --git a/Test/openmp_nep_basic_benchmark.cpp b/Test/openmp_nep_basic_benchmark.cpp
new file mode 100644
index 00000000000..16c2ff4e72b
--- /dev/null
+++ b/Test/openmp_nep_basic_benchmark.cpp
@@ -0,0 +1,611 @@
+#include <algorithm>
+#include <array>
+#include <chrono>
+#include <cmath>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <numeric>
+#include <string>
+#include <vector>
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+namespace
+{
+
+struct Vec3
+{
+    double x;
+    double y;
+    double z;
+};
+
+struct Matrix3
+{
+    double m[3][3];
+};
+
+struct BenchmarkData
+{
+    int natom = 2000000;
+    int ntype = 4;
+    double md_dt = 0.5;
+    double lat0 = 10.0;
+    double fact_f = 0.0433641153087705;
+    Matrix3 gt{};
+
+    std::vector<Vec3> pos;
+    std::vector<Vec3> vel;
+    std::vector<Vec3> base_vel;
+    std::vector<Vec3> force;
+    std::vector<double> mass;
+    std::vector<std::array<int, 3>> ionmbl;
+    std::vector<double> force_matrix;
+
+    std::vector<int> atom_type_index;
+    std::vector<int> atom_local_index;
+    std::vector<int> type_offset;
+    std::vector<double> tau_x;
+    std::vector<double> tau_y;
+    std::vector<double> tau_z;
+    std::vector<double> nep_coord;
+    std::vector<double> nep_e;
+    std::vector<double> nep_f;
+    std::vector<double> nep_v;
+    std::vector<double> nep_force;
+};
+
+struct Measurement
+{
+    std::string kernel;
+    int threads = 1;
+    double serial_ms = 0.0;
+    double omp_ms = 0.0;
+    double max_abs_diff = 0.0;
+    double checksum = 0.0;
+};
+
+using Clock = std::chrono::steady_clock;
+volatile double global_sink = 0.0;
+
+double elapsed_ms(const Clock::time_point& start, const Clock::time_point& end)
+{
+    return std::chrono::duration<double, std::milli>(end - start).count();
+}
+
+int parse_int_arg(int argc, char** argv, const std::string& name, int fallback)
+{
+    for (int i = 1; i + 1 < argc; ++i)
+    {
+        if (argv[i] == name)
+        {
+            return std::atoi(argv[i + 1]);
+        }
+    }
+    return fallback;
+}
+
+double value_for_index(const int i, const double scale)
+{
+    return scale * (1.0 + static_cast<double>((i * 17) % 97) / 97.0);
+}
+
+BenchmarkData make_data(const int natom)
+{
+    BenchmarkData data;
+    data.natom = natom;
+    data.gt = {{{1.02, 0.01, 0.03}, {0.02, 0.98, 0.04}, {0.01, 0.02, 1.01}}};
+
+    data.pos.resize(natom);
+    data.vel.resize(natom);
+    data.base_vel.resize(natom);
+    data.force.resize(natom);
+    data.mass.resize(natom);
+    data.ionmbl.resize(natom);
+    data.force_matrix.resize(static_cast<std::size_t>(natom) * 3);
+    data.atom_type_index.resize(natom);
+    data.atom_local_index.resize(natom);
+    data.nep_coord.resize(static_cast<std::size_t>(natom) * 3);
+    data.nep_e.resize(natom);
+    data.nep_f.resize(static_cast<std::size_t>(natom) * 3);
+    data.nep_v.resize(static_cast<std::size_t>(natom) * 9);
+    data.nep_force.resize(static_cast<std::size_t>(natom) * 3);
+
+    data.type_offset.assign(data.ntype + 1, 0);
+    for (int it = 0; it < data.ntype; ++it)
+    {
+        data.type_offset[it + 1] = data.type_offset[it] + natom / data.ntype + (it < natom % data.ntype ? 1 : 0);
+    }
+    data.tau_x.resize(natom);
+    data.tau_y.resize(natom);
+    data.tau_z.resize(natom);
+
+    for (int i = 0; i < natom; ++i)
+    {
+        data.base_vel[i] = {value_for_index(i, 0.001), value_for_index(i + 11, 0.0012), value_for_index(i + 23, 0.0009)};
+        data.vel[i] = data.base_vel[i];
+        data.force[i] = {value_for_index(i + 3, 0.004), value_for_index(i + 5, 0.003), value_for_index(i + 7, 0.005)};
+        data.mass[i] = 10.0 + static_cast<double>(i % 13);
+        data.ionmbl[i] = {1, (i % 11) == 0 ? 0 : 1, (i % 17) == 0 ? 0 : 1};
+        data.force_matrix[3 * static_cast<std::size_t>(i)] = data.force[i].x;
+        data.force_matrix[3 * static_cast<std::size_t>(i) + 1] = data.force[i].y;
+        data.force_matrix[3 * static_cast<std::size_t>(i) + 2] = data.force[i].z;
+        data.nep_e[i] = value_for_index(i + 13, 0.02);
+        data.nep_f[i] = value_for_index(i + 29, 0.03);
+        data.nep_f[static_cast<std::size_t>(i) + natom] = value_for_index(i + 31, 0.02);
+        data.nep_f[static_cast<std::size_t>(i) + 2 * static_cast<std::size_t>(natom)] = value_for_index(i + 37, 0.04);
+    }
+
+    for (int it = 0; it < data.ntype; ++it)
+    {
+        for (int local = 0; local < data.type_offset[it + 1] - data.type_offset[it]; ++local)
+        {
+            const int global = data.type_offset[it] + local;
+            data.atom_type_index[global] = it;
+            data.atom_local_index[global] = local;
+            data.tau_x[global] = value_for_index(global + 41, 0.2);
+            data.tau_y[global] = value_for_index(global + 43, 0.3);
+            data.tau_z[global] = value_for_index(global + 47, 0.4);
+        }
+    }
+
+    for (int j = 0; j < 9; ++j)
+    {
+        for (int i = 0; i < natom; ++i)
+        {
+            data.nep_v[static_cast<std::size_t>(j) * natom + i] = value_for_index(i + j * 19, 0.0003 * (j + 1));
+        }
+    }
+
+    return data;
+}
+
+template <typename F>
+double time_repeated(const int repeat, F&& f)
+{
+    const auto start = Clock::now();
+    for (int r = 0; r < repeat; ++r)
+    {
+        f();
+    }
+    return elapsed_ms(start, Clock::now()) / repeat;
+}
+
+void update_pos_serial(const BenchmarkData& data, std::vector<Vec3>& pos)
+{
+    for (int i = 0; i < data.natom; ++i)
+    {
+        double px = data.ionmbl[i][0] ? data.vel[i].x * data.md_dt / data.lat0 : 0.0;
+        double py = data.ionmbl[i][1] ? data.vel[i].y * data.md_dt / data.lat0 : 0.0;
+        double pz = data.ionmbl[i][2] ? data.vel[i].z * data.md_dt / data.lat0 : 0.0;
+        pos[i].x = px * data.gt.m[0][0] + py * data.gt.m[1][0] + pz * data.gt.m[2][0];
+        pos[i].y = px * data.gt.m[0][1] + py * data.gt.m[1][1] + pz * data.gt.m[2][1];
+        pos[i].z = px * data.gt.m[0][2] + py * data.gt.m[1][2] + pz * data.gt.m[2][2];
+    }
+}
+
+void update_pos_omp(const BenchmarkData& data, std::vector<Vec3>& pos)
+{
+#pragma omp parallel for schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        double px = data.ionmbl[i][0] ? data.vel[i].x * data.md_dt / data.lat0 : 0.0;
+        double py = data.ionmbl[i][1] ? data.vel[i].y * data.md_dt / data.lat0 : 0.0;
+        double pz = data.ionmbl[i][2] ? data.vel[i].z * data.md_dt / data.lat0 : 0.0;
+        pos[i].x = px * data.gt.m[0][0] + py * data.gt.m[1][0] + pz * data.gt.m[2][0];
+        pos[i].y = px * data.gt.m[0][1] + py * data.gt.m[1][1] + pz * data.gt.m[2][1];
+        pos[i].z = px * data.gt.m[0][2] + py * data.gt.m[1][2] + pz * data.gt.m[2][2];
+    }
+}
+
+void update_vel_serial(const BenchmarkData& data, std::vector<Vec3>& vel)
+{
+    for (int i = 0; i < data.natom; ++i)
+    {
+        if (data.ionmbl[i][0])
+        {
+            vel[i].x += 0.5 * data.force[i].x * data.md_dt / data.mass[i];
+        }
+        if (data.ionmbl[i][1])
+        {
+            vel[i].y += 0.5 * data.force[i].y * data.md_dt / data.mass[i];
+        }
+        if (data.ionmbl[i][2])
+        {
+            vel[i].z += 0.5 * data.force[i].z * data.md_dt / data.mass[i];
+        }
+    }
+}
+
+void update_vel_omp(const BenchmarkData& data, std::vector<Vec3>& vel)
+{
+#pragma omp parallel for schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        if (data.ionmbl[i][0])
+        {
+            vel[i].x += 0.5 * data.force[i].x * data.md_dt / data.mass[i];
+        }
+        if (data.ionmbl[i][1])
+        {
+            vel[i].y += 0.5 * data.force[i].y * data.md_dt / data.mass[i];
+        }
+        if (data.ionmbl[i][2])
+        {
+            vel[i].z += 0.5 * data.force[i].z * data.md_dt / data.mass[i];
+        }
+    }
+}
+
+double kinetic_serial(const BenchmarkData& data)
+{
+    double ke = 0.0;
+    for (int i = 0; i < data.natom; ++i)
+    {
+        ke += 0.5 * data.mass[i]
+              * (data.vel[i].x * data.vel[i].x + data.vel[i].y * data.vel[i].y + data.vel[i].z * data.vel[i].z);
+    }
+    return ke;
+}
+
+double kinetic_omp(const BenchmarkData& data)
+{
+    double ke = 0.0;
+#pragma omp parallel for reduction(+:ke) schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        ke += 0.5 * data.mass[i]
+              * (data.vel[i].x * data.vel[i].x + data.vel[i].y * data.vel[i].y + data.vel[i].z * data.vel[i].z);
+    }
+    return ke;
+}
+
+std::array<double, 9> temp_vector_serial(const BenchmarkData& data)
+{
+    std::array<double, 9> t{};
+    for (int i = 0; i < data.natom; ++i)
+    {
+        const double m = data.mass[i];
+        const double vx = data.vel[i].x;
+        const double vy = data.vel[i].y;
+        const double vz = data.vel[i].z;
+        t[0] += m * vx * vx;
+        t[1] += m * vx * vy;
+        t[2] += m * vx * vz;
+        t[3] += m * vy * vx;
+        t[4] += m * vy * vy;
+        t[5] += m * vy * vz;
+        t[6] += m * vz * vx;
+        t[7] += m * vz * vy;
+        t[8] += m * vz * vz;
+    }
+    return t;
+}
+
+std::array<double, 9> temp_vector_omp(const BenchmarkData& data)
+{
+    double t00 = 0.0;
+    double t01 = 0.0;
+    double t02 = 0.0;
+    double t10 = 0.0;
+    double t11 = 0.0;
+    double t12 = 0.0;
+    double t20 = 0.0;
+    double t21 = 0.0;
+    double t22 = 0.0;
+
+#pragma omp parallel for reduction(+:t00, t01, t02, t10, t11, t12, t20, t21, t22) schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        const double m = data.mass[i];
+        const double vx = data.vel[i].x;
+        const double vy = data.vel[i].y;
+        const double vz = data.vel[i].z;
+        t00 += m * vx * vx;
+        t01 += m * vx * vy;
+        t02 += m * vx * vz;
+        t10 += m * vy * vx;
+        t11 += m * vy * vy;
+        t12 += m * vy * vz;
+        t20 += m * vz * vx;
+        t21 += m * vz * vy;
+        t22 += m * vz * vz;
+    }
+    return {t00, t01, t02, t10, t11, t12, t20, t21, t22};
+}
+
+void force_copy_serial(const BenchmarkData& data, std::vector<Vec3>& out)
+{
+    for (int i = 0; i < data.natom; ++i)
+    {
+        out[i].x = data.force_matrix[3 * static_cast<std::size_t>(i)];
+        out[i].y = data.force_matrix[3 * static_cast<std::size_t>(i) + 1];
+        out[i].z = data.force_matrix[3 * static_cast<std::size_t>(i) + 2];
+    }
+}
+
+void force_copy_omp(const BenchmarkData& data, std::vector<Vec3>& out)
+{
+#pragma omp parallel for schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        out[i].x = data.force_matrix[3 * static_cast<std::size_t>(i)];
+        out[i].y = data.force_matrix[3 * static_cast<std::size_t>(i) + 1];
+        out[i].z = data.force_matrix[3 * static_cast<std::size_t>(i) + 2];
+    }
+}
+
+void nep_coord_serial(const BenchmarkData& data, std::vector<double>& coord)
+{
+    const int nat = data.natom;
+    for (int iat = 0; iat < nat; ++iat)
+    {
+        const int it = data.atom_type_index[iat];
+        const int ia = data.atom_local_index[iat];
+        const int index = data.type_offset[it] + ia;
+        coord[iat] = data.tau_x[index] * data.lat0;
+        coord[static_cast<std::size_t>(iat) + nat] = data.tau_y[index] * data.lat0;
+        coord[static_cast<std::size_t>(iat) + 2 * static_cast<std::size_t>(nat)] = data.tau_z[index] * data.lat0;
+    }
+}
+
+void nep_coord_omp(const BenchmarkData& data, std::vector<double>& coord)
+{
+    const int nat = data.natom;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int iat = 0; iat < nat; ++iat)
+    {
+        const int it = data.atom_type_index[iat];
+        const int ia = data.atom_local_index[iat];
+        const int index = data.type_offset[it] + ia;
+        coord[iat] = data.tau_x[index] * data.lat0;
+        coord[static_cast<std::size_t>(iat) + nat] = data.tau_y[index] * data.lat0;
+        coord[static_cast<std::size_t>(iat) + 2 * static_cast<std::size_t>(nat)] = data.tau_z[index] * data.lat0;
+    }
+}
+
+double nep_energy_serial(const BenchmarkData& data)
+{
+    return std::accumulate(data.nep_e.begin(), data.nep_e.end(), 0.0);
+}
+
+double nep_energy_omp(const BenchmarkData& data)
+{
+    double energy = 0.0;
+#pragma omp parallel for reduction(+:energy) schedule(static) if (data.natom >= 256)
+    for (int i = 0; i < data.natom; ++i)
+    {
+        energy += data.nep_e[i];
+    }
+    return energy;
+}
+
+void nep_force_serial(const BenchmarkData& data, std::vector<double>& out)
+{
+    const int nat = data.natom;
+    for (int i = 0; i < nat; ++i)
+    {
+        out[3 * static_cast<std::size_t>(i)] = data.nep_f[i] * data.fact_f;
+        out[3 * static_cast<std::size_t>(i) + 1] = data.nep_f[static_cast<std::size_t>(i) + nat] * data.fact_f;
+        out[3 * static_cast<std::size_t>(i) + 2] = data.nep_f[static_cast<std::size_t>(i) + 2 * static_cast<std::size_t>(nat)] * data.fact_f;
+    }
+}
+
+void nep_force_omp(const BenchmarkData& data, std::vector<double>& out)
+{
+    const int nat = data.natom;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
+    {
+        out[3 * static_cast<std::size_t>(i)] = data.nep_f[i] * data.fact_f;
+        out[3 * static_cast<std::size_t>(i) + 1] = data.nep_f[static_cast<std::size_t>(i) + nat] * data.fact_f;
+        out[3 * static_cast<std::size_t>(i) + 2] = data.nep_f[static_cast<std::size_t>(i) + 2 * static_cast<std::size_t>(nat)] * data.fact_f;
+    }
+}
+
+std::array<double, 9> nep_virial_serial(const BenchmarkData& data)
+{
+    std::array<double, 9> sum{};
+    const int nat = data.natom;
+    for (int j = 0; j < 9; ++j)
+    {
+        for (int i = 0; i < nat; ++i)
+        {
+            sum[j] += data.nep_v[static_cast<std::size_t>(j) * nat + i];
+        }
+    }
+    return sum;
+}
+
+std::array<double, 9> nep_virial_omp(const BenchmarkData& data)
+{
+    const int nat = data.natom;
+    double v0 = 0.0;
+    double v1 = 0.0;
+    double v2 = 0.0;
+    double v3 = 0.0;
+    double v4 = 0.0;
+    double v5 = 0.0;
+    double v6 = 0.0;
+    double v7 = 0.0;
+    double v8 = 0.0;
+#pragma omp parallel for reduction(+:v0, v1, v2, v3, v4, v5, v6, v7, v8) schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
+    {
+        v0 += data.nep_v[i];
+        v1 += data.nep_v[static_cast<std::size_t>(nat) + i];
+        v2 += data.nep_v[2 * static_cast<std::size_t>(nat) + i];
+        v3 += data.nep_v[3 * static_cast<std::size_t>(nat) + i];
+        v4 += data.nep_v[4 * static_cast<std::size_t>(nat) + i];
+        v5 += data.nep_v[5 * static_cast<std::size_t>(nat) + i];
+        v6 += data.nep_v[6 * static_cast<std::size_t>(nat) + i];
+        v7 += data.nep_v[7 * static_cast<std::size_t>(nat) + i];
+        v8 += data.nep_v[8 * static_cast<std::size_t>(nat) + i];
+    }
+    return {v0, v1, v2, v3, v4, v5, v6, v7, v8};
+}
+
+double max_abs_diff(const std::vector<Vec3>& a, const std::vector<Vec3>& b)
+{
+    double diff = 0.0;
+    for (std::size_t i = 0; i < a.size(); ++i)
+    {
+        diff = std::max(diff, std::abs(a[i].x - b[i].x));
+        diff = std::max(diff, std::abs(a[i].y - b[i].y));
+        diff = std::max(diff, std::abs(a[i].z - b[i].z));
+    }
+    return diff;
+}
+
+double max_abs_diff(const std::vector<double>& a, const std::vector<double>& b)
+{
+    double diff = 0.0;
+    for (std::size_t i = 0; i < a.size(); ++i)
+    {
+        diff = std::max(diff, std::abs(a[i] - b[i]));
+    }
+    return diff;
+}
+
+double max_abs_diff(const std::array<double, 9>& a, const std::array<double, 9>& b)
+{
+    double diff = 0.0;
+    for (std::size_t i = 0; i < a.size(); ++i)
+    {
+        diff = std::max(diff, std::abs(a[i] - b[i]));
+    }
+    return diff;
+}
+
+double checksum(const std::vector<Vec3>& a)
+{
+    double sum = 0.0;
+    for (std::size_t i = 0; i < a.size(); i += 4096)
+    {
+        sum += a[i].x + 0.5 * a[i].y + 0.25 * a[i].z;
+    }
+    return sum;
+}
+
+double checksum(const std::vector<double>& a)
+{
+    double sum = 0.0;
+    for (std::size_t i = 0; i < a.size(); i += 4096)
+    {
+        sum += a[i];
+    }
+    return sum;
+}
+
+double checksum(const std::array<double, 9>& a)
+{
+    return std::accumulate(a.begin(), a.end(), 0.0);
+}
+
+void print_measurement(const Measurement& m)
+{
+    const double speedup = m.omp_ms > 0.0 ? m.serial_ms / m.omp_ms : 0.0;
+    const double efficiency = m.threads > 0 ? speedup / m.threads : 0.0;
+    std::cout << m.kernel << ',' << m.threads << ','
+              << std::fixed << std::setprecision(6)
+              << m.serial_ms << ',' << m.omp_ms << ',' << speedup << ',' << efficiency << ','
+              << std::scientific << m.max_abs_diff << ',' << m.checksum << '\n';
+}
+
+} // namespace
+
+int main(int argc, char** argv)
+{
+    const int threads = parse_int_arg(argc, argv, "--threads", 1);
+    const int natom = parse_int_arg(argc, argv, "--natoms", 2000000);
+    const int repeat = parse_int_arg(argc, argv, "--repeat", 5);
+
+#ifdef _OPENMP
+    omp_set_dynamic(0);
+    omp_set_num_threads(threads);
+#endif
+
+    BenchmarkData data = make_data(natom);
+    std::vector<Vec3> serial_vec(natom);
+    std::vector<Vec3> omp_vec(natom);
+    std::vector<double> serial_doubles(static_cast<std::size_t>(natom) * 3);
+    std::vector<double> omp_doubles(static_cast<std::size_t>(natom) * 3);
+
+    std::cout << "kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum\n";
+
+    {
+        update_pos_serial(data, serial_vec);
+        update_pos_omp(data, omp_vec);
+        const double serial_ms = time_repeated(repeat, [&] { update_pos_serial(data, serial_vec); });
+        const double omp_ms = time_repeated(repeat, [&] { update_pos_omp(data, omp_vec); });
+        print_measurement({"md_update_pos", threads, serial_ms, omp_ms, max_abs_diff(serial_vec, omp_vec), checksum(omp_vec)});
+    }
+    {
+        std::vector<Vec3> serial_vel = data.base_vel;
+        std::vector<Vec3> omp_vel = data.base_vel;
+        update_vel_serial(data, serial_vel);
+        update_vel_omp(data, omp_vel);
+        const double serial_ms = time_repeated(repeat, [&] { update_vel_serial(data, serial_vel); });
+        const double omp_ms = time_repeated(repeat, [&] { update_vel_omp(data, omp_vel); });
+        print_measurement({"md_update_vel", threads, serial_ms, omp_ms, max_abs_diff(serial_vel, omp_vel), checksum(omp_vel)});
+    }
+    {
+        const double serial_value = kinetic_serial(data);
+        const double omp_value = kinetic_omp(data);
+        const double serial_ms = time_repeated(repeat, [&] { global_sink += kinetic_serial(data); });
+        const double omp_ms = time_repeated(repeat, [&] { global_sink += kinetic_omp(data); });
+        print_measurement({"md_kinetic_energy", threads, serial_ms, omp_ms, std::abs(serial_value - omp_value), omp_value});
+    }
+    {
+        const auto serial_value = temp_vector_serial(data);
+        const auto omp_value = temp_vector_omp(data);
+        const double serial_ms = time_repeated(repeat, [&] { global_sink += checksum(temp_vector_serial(data)); });
+        const double omp_ms = time_repeated(repeat, [&] { global_sink += checksum(temp_vector_omp(data)); });
+        print_measurement({"md_temp_vector", threads, serial_ms, omp_ms, max_abs_diff(serial_value, omp_value), checksum(omp_value)});
+    }
+    {
+        force_copy_serial(data, serial_vec);
+        force_copy_omp(data, omp_vec);
+        const double serial_ms = time_repeated(repeat, [&] { force_copy_serial(data, serial_vec); });
+        const double omp_ms = time_repeated(repeat, [&] { force_copy_omp(data, omp_vec); });
+        print_measurement({"md_force_copy", threads, serial_ms, omp_ms, max_abs_diff(serial_vec, omp_vec), checksum(omp_vec)});
+    }
+    {
+        nep_coord_serial(data, serial_doubles);
+        nep_coord_omp(data, omp_doubles);
+        const double serial_ms = time_repeated(repeat, [&] { nep_coord_serial(data, serial_doubles); });
+        const double omp_ms = time_repeated(repeat, [&] { nep_coord_omp(data, omp_doubles); });
+        print_measurement({"nep_coord_fill", threads, serial_ms, omp_ms, max_abs_diff(serial_doubles, omp_doubles), checksum(omp_doubles)});
+    }
+    {
+        const double serial_value = nep_energy_serial(data);
+        const double omp_value = nep_energy_omp(data);
+        const double serial_ms = time_repeated(repeat, [&] { global_sink += nep_energy_serial(data); });
+        const double omp_ms = time_repeated(repeat, [&] { global_sink += nep_energy_omp(data); });
+        print_measurement({"nep_energy_sum", threads, serial_ms, omp_ms, std::abs(serial_value - omp_value), omp_value});
+    }
+    {
+        nep_force_serial(data, serial_doubles);
+        nep_force_omp(data, omp_doubles);
+        const double serial_ms = time_repeated(repeat, [&] { nep_force_serial(data, serial_doubles); });
+        const double omp_ms = time_repeated(repeat, [&] { nep_force_omp(data, omp_doubles); });
+        print_measurement({"nep_force_fill", threads, serial_ms, omp_ms, max_abs_diff(serial_doubles, omp_doubles), checksum(omp_doubles)});
+    }
+    {
+        const auto serial_value = nep_virial_serial(data);
+        const auto omp_value = nep_virial_omp(data);
+        const double serial_ms = time_repeated(repeat, [&] { global_sink += checksum(nep_virial_serial(data)); });
+        const double omp_ms = time_repeated(repeat, [&] { global_sink += checksum(nep_virial_omp(data)); });
+        print_measurement({"nep_virial_sum", threads, serial_ms, omp_ms, max_abs_diff(serial_value, omp_value), checksum(omp_value)});
+    }
+
+    if (global_sink == -1.0)
+    {
+        std::cerr << "unreachable sink: " << global_sink << '\n';
+    }
+
+    return 0;
+}
diff --git a/Test/results/openmp_nep_basic_benchmark.csv b/Test/results/openmp_nep_basic_benchmark.csv
new file mode 100644
index 00000000000..0e3136e7095
--- /dev/null
+++ b/Test/results/openmp_nep_basic_benchmark.csv
@@ -0,0 +1,37 @@
+kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum
+md_update_pos,1,13.007151,13.052355,0.996537,0.996537,0.000000e+00,6.693102e-02
+md_update_vel,1,14.545481,14.496280,1.003394,1.003394,0.000000e+00,1.811520e+00
+md_kinetic_energy,1,6.925990,6.857271,1.010021,1.010021,0.000000e+00,1.205301e+02
+md_temp_vector,1,8.115525,8.224651,0.986732,0.986732,0.000000e+00,7.061666e+02
+md_force_copy,1,9.308984,9.119837,1.020740,1.020740,0.000000e+00,4.932209e+00
+nep_coord_fill,1,10.322396,10.129846,1.019008,1.019008,0.000000e+00,6.562763e+03
+nep_energy_sum,1,3.037125,3.027364,1.003224,1.003224,0.000000e+00,5.979382e+04
+nep_force_fill,1,8.900827,8.746296,1.017668,1.017668,0.000000e+00,2.851195e+00
+nep_virial_sum,1,28.520833,12.451097,2.290628,2.290628,0.000000e+00,4.036082e+04
+md_update_pos,2,13.098582,6.573173,1.992733,0.996367,0.000000e+00,6.693102e-02
+md_update_vel,2,14.820314,7.287271,2.033726,1.016863,0.000000e+00,1.811520e+00
+md_kinetic_energy,2,7.014427,3.325272,2.109430,1.054715,2.036415e-10,1.205301e+02
+md_temp_vector,2,8.208209,4.038653,2.032413,1.016206,1.600000e-10,7.061666e+02
+md_force_copy,2,9.362898,4.795636,1.952379,0.976190,0.000000e+00,4.932209e+00
+nep_coord_fill,2,10.446854,5.126669,2.037747,1.018874,0.000000e+00,6.562763e+03
+nep_energy_sum,2,3.059902,1.525828,2.005405,1.002702,8.811185e-09,5.979382e+04
+nep_force_fill,2,8.925261,4.459652,2.001336,1.000668,0.000000e+00,2.851195e+00
+nep_virial_sum,2,28.700600,6.623542,4.333120,2.166560,9.997166e-09,4.036082e+04
+md_update_pos,4,13.473068,3.496484,3.853319,0.963330,0.000000e+00,6.693102e-02
+md_update_vel,4,14.817903,3.852381,3.846427,0.961607,0.000000e+00,1.811520e+00
+md_kinetic_energy,4,7.070885,1.874085,3.772980,0.943245,2.242899e-10,1.205301e+02
+md_temp_vector,4,8.508072,2.182533,3.898256,0.974564,1.489155e-10,7.061666e+02
+md_force_copy,4,9.537895,2.563152,3.721158,0.930290,0.000000e+00,4.932209e+00
+nep_coord_fill,4,10.574139,2.837437,3.726652,0.931663,0.000000e+00,6.562763e+03
+nep_energy_sum,4,3.115476,0.785601,3.965723,0.991431,2.277375e-08,5.979382e+04
+nep_force_fill,4,9.159472,2.378972,3.850181,0.962545,0.000000e+00,2.851195e+00
+nep_virial_sum,4,28.671869,3.517298,8.151675,2.037919,6.323717e-09,4.036082e+04
+md_update_pos,8,13.427898,1.823981,7.361864,0.920233,0.000000e+00,6.693102e-02
+md_update_vel,8,14.830449,2.062811,7.189436,0.898679,0.000000e+00,1.811520e+00
+md_kinetic_energy,8,7.156275,1.025259,6.979966,0.872496,1.775788e-10,1.205301e+02
+md_temp_vector,8,8.650072,1.172442,7.377825,0.922228,1.680860e-10,7.061666e+02
+md_force_copy,8,9.885551,1.377531,7.176284,0.897036,0.000000e+00,4.932209e+00
+nep_coord_fill,8,11.209954,1.566864,7.154390,0.894299,0.000000e+00,6.562763e+03
+nep_energy_sum,8,3.211079,0.399830,8.031107,1.003888,1.861918e-08,5.979382e+04
+nep_force_fill,8,9.512588,1.360189,6.993579,0.874197,0.000000e+00,2.851195e+00
+nep_virial_sum,8,28.884186,2.028955,14.235989,1.779499,8.529241e-09,4.036082e+04
diff --git a/Test/results/run_1.csv b/Test/results/run_1.csv
new file mode 100644
index 00000000000..f988811bcee
--- /dev/null
+++ b/Test/results/run_1.csv
@@ -0,0 +1,10 @@
+kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum
+md_update_pos,1,13.007151,13.052355,0.996537,0.996537,0.000000e+00,6.693102e-02
+md_update_vel,1,14.545481,14.496280,1.003394,1.003394,0.000000e+00,1.811520e+00
+md_kinetic_energy,1,6.925990,6.857271,1.010021,1.010021,0.000000e+00,1.205301e+02
+md_temp_vector,1,8.115525,8.224651,0.986732,0.986732,0.000000e+00,7.061666e+02
+md_force_copy,1,9.308984,9.119837,1.020740,1.020740,0.000000e+00,4.932209e+00
+nep_coord_fill,1,10.322396,10.129846,1.019008,1.019008,0.000000e+00,6.562763e+03
+nep_energy_sum,1,3.037125,3.027364,1.003224,1.003224,0.000000e+00,5.979382e+04
+nep_force_fill,1,8.900827,8.746296,1.017668,1.017668,0.000000e+00,2.851195e+00
+nep_virial_sum,1,28.520833,12.451097,2.290628,2.290628,0.000000e+00,4.036082e+04
diff --git a/Test/results/run_2.csv b/Test/results/run_2.csv
new file mode 100644
index 00000000000..c56f0793298
--- /dev/null
+++ b/Test/results/run_2.csv
@@ -0,0 +1,10 @@
+kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum
+md_update_pos,2,13.098582,6.573173,1.992733,0.996367,0.000000e+00,6.693102e-02
+md_update_vel,2,14.820314,7.287271,2.033726,1.016863,0.000000e+00,1.811520e+00
+md_kinetic_energy,2,7.014427,3.325272,2.109430,1.054715,2.036415e-10,1.205301e+02
+md_temp_vector,2,8.208209,4.038653,2.032413,1.016206,1.600000e-10,7.061666e+02
+md_force_copy,2,9.362898,4.795636,1.952379,0.976190,0.000000e+00,4.932209e+00
+nep_coord_fill,2,10.446854,5.126669,2.037747,1.018874,0.000000e+00,6.562763e+03
+nep_energy_sum,2,3.059902,1.525828,2.005405,1.002702,8.811185e-09,5.979382e+04
+nep_force_fill,2,8.925261,4.459652,2.001336,1.000668,0.000000e+00,2.851195e+00
+nep_virial_sum,2,28.700600,6.623542,4.333120,2.166560,9.997166e-09,4.036082e+04
diff --git a/Test/results/run_4.csv b/Test/results/run_4.csv
new file mode 100644
index 00000000000..2f118db3395
--- /dev/null
+++ b/Test/results/run_4.csv
@@ -0,0 +1,10 @@
+kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum
+md_update_pos,4,13.473068,3.496484,3.853319,0.963330,0.000000e+00,6.693102e-02
+md_update_vel,4,14.817903,3.852381,3.846427,0.961607,0.000000e+00,1.811520e+00
+md_kinetic_energy,4,7.070885,1.874085,3.772980,0.943245,2.242899e-10,1.205301e+02
+md_temp_vector,4,8.508072,2.182533,3.898256,0.974564,1.489155e-10,7.061666e+02
+md_force_copy,4,9.537895,2.563152,3.721158,0.930290,0.000000e+00,4.932209e+00
+nep_coord_fill,4,10.574139,2.837437,3.726652,0.931663,0.000000e+00,6.562763e+03
+nep_energy_sum,4,3.115476,0.785601,3.965723,0.991431,2.277375e-08,5.979382e+04
+nep_force_fill,4,9.159472,2.378972,3.850181,0.962545,0.000000e+00,2.851195e+00
+nep_virial_sum,4,28.671869,3.517298,8.151675,2.037919,6.323717e-09,4.036082e+04
diff --git a/Test/results/run_8.csv b/Test/results/run_8.csv
new file mode 100644
index 00000000000..be310c03f17
--- /dev/null
+++ b/Test/results/run_8.csv
@@ -0,0 +1,10 @@
+kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum
+md_update_pos,8,13.427898,1.823981,7.361864,0.920233,0.000000e+00,6.693102e-02
+md_update_vel,8,14.830449,2.062811,7.189436,0.898679,0.000000e+00,1.811520e+00
+md_kinetic_energy,8,7.156275,1.025259,6.979966,0.872496,1.775788e-10,1.205301e+02
+md_temp_vector,8,8.650072,1.172442,7.377825,0.922228,1.680860e-10,7.061666e+02
+md_force_copy,8,9.885551,1.377531,7.176284,0.897036,0.000000e+00,4.932209e+00
+nep_coord_fill,8,11.209954,1.566864,7.154390,0.894299,0.000000e+00,6.562763e+03
+nep_energy_sum,8,3.211079,0.399830,8.031107,1.003888,1.861918e-08,5.979382e+04
+nep_force_fill,8,9.512588,1.360189,6.993579,0.874197,0.000000e+00,2.851195e+00
+nep_virial_sum,8,28.884186,2.028955,14.235989,1.779499,8.529241e-09,4.036082e+04
diff --git a/Test/run_openmp_nep_basic_benchmark.sh b/Test/run_openmp_nep_basic_benchmark.sh
new file mode 100755
index 00000000000..78d206fbe16
--- /dev/null
+++ b/Test/run_openmp_nep_basic_benchmark.sh
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+BUILD_DIR="${SCRIPT_DIR}/build"
+RESULT_DIR="${SCRIPT_DIR}/results"
+BIN="${BUILD_DIR}/openmp_nep_basic_benchmark"
+CSV="${RESULT_DIR}/openmp_nep_basic_benchmark.csv"
+LOG="${RESULT_DIR}/openmp_nep_basic_benchmark.log"
+
+NATOMS="${NATOMS:-2000000}"
+REPEAT="${REPEAT:-5}"
+CXX="${CXX:-g++}"
+
+mkdir -p "${BUILD_DIR}" "${RESULT_DIR}"
+
+{
+  echo "Compiler: $(${CXX} --version | head -n 1)"
+  echo "NATOMS=${NATOMS}"
+  echo "REPEAT=${REPEAT}"
+  echo "Build: ${CXX} -O3 -std=c++17 -fopenmp"
+} > "${LOG}"
+
+"${CXX}" -O3 -std=c++17 -fopenmp "${SCRIPT_DIR}/openmp_nep_basic_benchmark.cpp" -o "${BIN}" 2>&1 | tee -a "${LOG}"
+
+: > "${CSV}"
+for threads in 1 2 4 8; do
+  echo "Running with OMP_NUM_THREADS=${threads}" | tee -a "${LOG}"
+  export OMP_NUM_THREADS="${threads}"
+  export OMP_PROC_BIND="${OMP_PROC_BIND:-close}"
+  export OMP_PLACES="${OMP_PLACES:-cores}"
+  tmp_csv="${RESULT_DIR}/run_${threads}.csv"
+  "${BIN}" --threads "${threads}" --natoms "${NATOMS}" --repeat "${REPEAT}" > "${tmp_csv}"
+  if [[ "${threads}" == "1" ]]; then
+    cat "${tmp_csv}" >> "${CSV}"
+  else
+    tail -n +2 "${tmp_csv}" >> "${CSV}"
+  fi
+done
+
+echo "CSV: ${CSV}" | tee -a "${LOG}"
diff --git a/source/source_esolver/esolver_dp.cpp b/source/source_esolver/esolver_dp.cpp
index 879193e668b..b06b4cf2fa1 100644
--- a/source/source_esolver/esolver_dp.cpp
+++ b/source/source_esolver/esolver_dp.cpp
@@ -36,6 +36,10 @@ void ESolver_DP::before_all_runners(UnitCell& ucell, const Input_para& inp)
     dp_potential = 0;
     dp_force.create(ucell.nat, 3);
     dp_virial.create(3, 3);
+    dp_cell.resize(9);
+    dp_coord.resize(3 * ucell.nat);
+    dp_model_force.clear();
+    dp_model_virial.clear();
 
     ModuleIO::CifParser::write(PARAM.globalv.global_out_dir + "STRU.cif", 
                                ucell, 
@@ -44,6 +48,20 @@ void ESolver_DP::before_all_runners(UnitCell& ucell, const Input_para& inp)
 
     atype.resize(ucell.nat);
 
+    // Build flat atom index for OpenMP coordinate fill in runner()
+    atom_type_index.resize(ucell.nat);
+    atom_local_index.resize(ucell.nat);
+    int iat = 0;
+    for (int it = 0; it < ucell.ntype; ++it)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
+        {
+            atom_type_index[iat] = it;
+            atom_local_index[iat] = ia;
+            iat++;
+        }
+    }
+
     rescaling = inp.mdp.dp_rescaling;
     fparam = inp.mdp.dp_fparam;
     aparam = inp.mdp.dp_aparam;
@@ -59,38 +77,36 @@ void ESolver_DP::runner(UnitCell& ucell, const int istep)
     ModuleBase::TITLE("ESolver_DP", "runner");
     ModuleBase::timer::start("ESolver_DP", "runner");
 
-    std::vector<double> cell(9, 0.0);
-    cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom;
-    cell[1] = ucell.latvec.e12 * ucell.lat0_angstrom;
-    cell[2] = ucell.latvec.e13 * ucell.lat0_angstrom;
-    cell[3] = ucell.latvec.e21 * ucell.lat0_angstrom;
-    cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom;
-    cell[5] = ucell.latvec.e23 * ucell.lat0_angstrom;
-    cell[6] = ucell.latvec.e31 * ucell.lat0_angstrom;
-    cell[7] = ucell.latvec.e32 * ucell.lat0_angstrom;
-    cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom;
-
-    std::vector<double> coord(3 * ucell.nat, 0.0);
-    int iat = 0;
-    for (int it = 0; it < ucell.ntype; ++it)
+    dp_cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom;
+    dp_cell[1] = ucell.latvec.e12 * ucell.lat0_angstrom;
+    dp_cell[2] = ucell.latvec.e13 * ucell.lat0_angstrom;
+    dp_cell[3] = ucell.latvec.e21 * ucell.lat0_angstrom;
+    dp_cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom;
+    dp_cell[5] = ucell.latvec.e23 * ucell.lat0_angstrom;
+    dp_cell[6] = ucell.latvec.e31 * ucell.lat0_angstrom;
+    dp_cell[7] = ucell.latvec.e32 * ucell.lat0_angstrom;
+    dp_cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom;
+
+    dp_coord.resize(3 * ucell.nat);
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int iat = 0; iat < nat; ++iat)
     {
-        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
-        {
-            coord[3 * iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
-            coord[3 * iat + 1] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
-            coord[3 * iat + 2] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
-            iat++;
-        }
+        const int it = atom_type_index[iat];
+        const int ia = atom_local_index[iat];
+        dp_coord[3 * iat]     = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
+        dp_coord[3 * iat + 1] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
+        dp_coord[3 * iat + 2] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
     }
-    assert(ucell.nat == iat);
 
 #ifdef __DPMD
-    std::vector<double> f, v;
     dp_potential = 0;
     dp_force.zero_out();
     dp_virial.zero_out();
+    dp_model_force.clear();
+    dp_model_virial.clear();
 
-    dp.compute(dp_potential, f, v, coord, atype, cell, fparam, aparam);
+    dp.compute(dp_potential, dp_model_force, dp_model_virial, dp_coord, atype, dp_cell, fparam, aparam);
 
     // rescale the energy, force, and stress
     const double fact_e = rescaling / ModuleBase::Ry_to_eV;
@@ -101,18 +117,20 @@ void ESolver_DP::runner(UnitCell& ucell, const int istep)
     GlobalV::ofs_running << " #TOTAL ENERGY# " << std::setprecision(11) << dp_potential * ModuleBase::Ry_to_eV << " eV"
                          << std::endl;
 
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat_f = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat_f >= 256)
+    for (int i = 0; i < nat_f; ++i)
     {
-        dp_force(i, 0) = f[3 * i] * fact_f;
-        dp_force(i, 1) = f[3 * i + 1] * fact_f;
-        dp_force(i, 2) = f[3 * i + 2] * fact_f;
+        dp_force(i, 0) = dp_model_force[3 * i] * fact_f;
+        dp_force(i, 1) = dp_model_force[3 * i + 1] * fact_f;
+        dp_force(i, 2) = dp_model_force[3 * i + 2] * fact_f;
     }
 
     for (int i = 0; i < 3; ++i)
     {
         for (int j = 0; j < 3; ++j)
         {
-            dp_virial(i, j) = v[3 * i + j] * fact_v;
+            dp_virial(i, j) = dp_model_virial[3 * i + j] * fact_v;
         }
     }
 #else
diff --git a/source/source_esolver/esolver_dp.h b/source/source_esolver/esolver_dp.h
index 405bae44461..72e4b5ff550 100644
--- a/source/source_esolver/esolver_dp.h
+++ b/source/source_esolver/esolver_dp.h
@@ -109,12 +109,18 @@ class ESolver_DP : public ESolver
 
     std::string dp_file;             ///< directory of DP model file
     std::vector<int> atype = {};     ///< atom type corresponding to DP model
+    std::vector<int> atom_type_index;  ///< type index (it) for each global atom iat
+    std::vector<int> atom_local_index; ///< local index (ia) within type for each global atom iat
     std::vector<double> fparam = {}; ///< frame parameter for dp potential: dim_fparam
     std::vector<double> aparam = {}; ///< atomic parameter for dp potential: natoms x dim_aparam
     double rescaling = 1.0;          ///< rescaling factor for DP model
     double dp_potential = 0.0;       ///< computed potential energy
     ModuleBase::matrix dp_force;     ///< computed atomic forces
     ModuleBase::matrix dp_virial;    ///< computed lattice virials
+    std::vector<double> dp_cell;     ///< DP cell buffer in Angstrom
+    std::vector<double> dp_coord;    ///< DP coordinate buffer in Angstrom
+    std::vector<double> dp_model_force;  ///< raw force buffer returned by DP
+    std::vector<double> dp_model_virial; ///< raw virial buffer returned by DP
 };
 
 } // namespace ModuleESolver
diff --git a/source/source_esolver/esolver_lj.cpp b/source/source_esolver/esolver_lj.cpp
index 3ddba86adc2..17ecb760d37 100644
--- a/source/source_esolver/esolver_lj.cpp
+++ b/source/source_esolver/esolver_lj.cpp
@@ -11,25 +11,27 @@
 namespace ModuleESolver
 {
 
-    UnitCellLite ESolver_LJ::change_from_ucell_to_ucell_lite(const UnitCell& ucell)
+    UnitCellPlus ESolver_LJ::change_from_ucell_to_ucell_plus(const UnitCell& ucell)
     {
-        UnitCellLite ucell_lite;
-        
-        // Set lattice parameters
-        ucell_lite.set_lattice(ucell.lat0, ucell.omega, ucell.latvec);
-        
-        // Build atom information
-        std::vector<int> na;
-        std::vector<ModuleBase::Vector3<double>> tau;
-        for (int i = 0; i < ucell.ntype; i++) {
-            na.push_back(ucell.atoms[i].na);
-            for (int j = 0; j < ucell.atoms[i].na; j++) {
-                tau.push_back(ucell.atoms[i].tau[j]);
+        UnitCellPlus ucell_plus;
+        ucell_plus.lat0 = ucell.lat0;
+        ucell_plus.omega = ucell.omega;
+        ucell_plus.nat = ucell.nat;
+        for(int i=0;i<ucell.ntype;i++)
+        {
+            ucell_plus.na.push_back(ucell.atoms[i].na);
+        }
+        ucell_plus.ntype = ucell.ntype;
+        ucell_plus.latvec = ucell.latvec;
+        for(int i=0;i<ucell.ntype;i++)
+        {
+            for(int j=0;j<ucell.atoms[i].na;j++)
+            {
+                ucell_plus.tau.push_back(ucell.atoms[i].tau[j]);
             }
         }
-        ucell_lite.set_atoms(ucell.ntype, na, tau);
-        
-        return ucell_lite;
+        ucell_plus.compute_naa();
+        return ucell_plus;
     }
 
 void ESolver_LJ::before_all_runners(UnitCell& ucell, const Input_para& inp)
@@ -51,53 +53,104 @@ void ESolver_LJ::before_all_runners(UnitCell& ucell, const Input_para& inp)
 
     // calculate the energy shift so that LJ energy is zero at rcut
     cal_en_shift(ucell.ntype, inp.mdp.lj_eshift);
+
+    // build flat atom index cache for OpenMP coordinate access in runner()
+    atom_type_index.resize(ucell.nat);
+    atom_local_index.resize(ucell.nat);
+    int iat = 0;
+    for (int it = 0; it < ucell.ntype; ++it)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
+        {
+            atom_type_index[iat] = it;
+            atom_local_index[iat] = ia;
+            ++iat;
+        }
+    }
+    assert(ucell.nat == iat);
 }
 
 void ESolver_LJ::runner(UnitCell& ucell, const int istep)
 {
-    UnitCellLite ucell_lite = change_from_ucell_to_ucell_lite(ucell);
+    UnitCellPlus ucell_plus = change_from_ucell_to_ucell_plus(ucell);
     NeighborSearch neighbor_search;
-    neighbor_search.init(ucell_lite, search_radius, 0);
+    neighbor_search.init(ucell_plus, search_radius, 0);
     neighbor_search.build_neighbors();
 
-    double distance = 0.0;
-    int index = 0;
-
     // Important! potential, force, virial must be zero per step
     lj_potential = 0;
     lj_force.zero_out();
     lj_virial.zero_out();
 
-    ModuleBase::Vector3<double> tau1, tau2, dtau;
-    const NeighborList& neighbor_list = neighbor_search.get_neighbor_list();
-    const std::vector<NeighborAtom>& all_atoms = neighbor_search.get_all_atoms();
-    for (int it = 0; it < ucell.ntype; ++it)
+    const int nat = ucell.nat;
+
+#pragma omp parallel if (nat >= 256)
     {
-        Atom* atom1 = &ucell.atoms[it];
-        for (int ia = 0; ia < atom1->na; ++ia)
+        double vl[9] = {0};
+        double pot_local = 0.0;
+
+#pragma omp for schedule(dynamic, 32)
+        for (int iat = 0; iat < nat; ++iat)
         {
-            tau1 = atom1->tau[ia];
-            for (int ad = 0; ad < neighbor_list.get_numneigh(index); ++ad)
+            const int it = atom_type_index[iat];
+            const int ia = atom_local_index[iat];
+            ModuleBase::Vector3<double> tau1 = ucell.atoms[it].tau[ia];
+
+            for (int ad = 0; ad < neighbor_search.neighbor_list.numneigh[iat]; ++ad)
             {
-                const NeighborAtom& neighbor_atom = all_atoms[neighbor_list.get_firstneigh(index)[ad]];
-                tau2.x = neighbor_atom.position_x;
-                tau2.y = neighbor_atom.position_y;
-                tau2.z = neighbor_atom.position_z;
-                int it2 = neighbor_atom.atom_type;
-                dtau = (tau1 - tau2) * ucell.lat0;
-                distance = dtau.norm();
+                const int neigh_idx = neighbor_search.neighbor_list.firstneigh[iat][ad];
+                ModuleBase::Vector3<double> tau2;
+                tau2.x = neighbor_search.all_atoms[neigh_idx].position_x;
+                tau2.y = neighbor_search.all_atoms[neigh_idx].position_y;
+                tau2.z = neighbor_search.all_atoms[neigh_idx].position_z;
+                int it2 = neighbor_search.all_atoms[neigh_idx].atom_type;
+
+                ModuleBase::Vector3<double> dtau = (tau1 - tau2) * ucell.lat0;
+                double distance = dtau.norm();
+
                 if (distance < lj_rcut(it, it2))
                 {
-                    lj_potential += LJ_energy(distance, it, it2) - en_shift(it, it2);
+                    pot_local += LJ_energy(distance, it, it2) - en_shift(it, it2);
                     ModuleBase::Vector3<double> f_ij = LJ_force(dtau, it, it2);
-                    lj_force(index, 0) += f_ij.x;
-                    lj_force(index, 1) += f_ij.y;
-                    lj_force(index, 2) += f_ij.z;
-                    LJ_virial(f_ij, dtau);
+                    lj_force(iat, 0) += f_ij.x;
+                    lj_force(iat, 1) += f_ij.y;
+                    lj_force(iat, 2) += f_ij.z;
+
+                    // per-thread virial accumulation
+                    vl[0] += dtau.x * f_ij.x;
+                    vl[1] += dtau.x * f_ij.y;
+                    vl[2] += dtau.x * f_ij.z;
+                    vl[3] += dtau.y * f_ij.x;
+                    vl[4] += dtau.y * f_ij.y;
+                    vl[5] += dtau.y * f_ij.z;
+                    vl[6] += dtau.z * f_ij.x;
+                    vl[7] += dtau.z * f_ij.y;
+                    vl[8] += dtau.z * f_ij.z;
                 }
             }
-            index++;
         }
+
+#pragma omp atomic
+        lj_potential += pot_local;
+
+#pragma omp atomic
+        lj_virial(0, 0) += vl[0];
+#pragma omp atomic
+        lj_virial(0, 1) += vl[1];
+#pragma omp atomic
+        lj_virial(0, 2) += vl[2];
+#pragma omp atomic
+        lj_virial(1, 0) += vl[3];
+#pragma omp atomic
+        lj_virial(1, 1) += vl[4];
+#pragma omp atomic
+        lj_virial(1, 2) += vl[5];
+#pragma omp atomic
+        lj_virial(2, 0) += vl[6];
+#pragma omp atomic
+        lj_virial(2, 1) += vl[7];
+#pragma omp atomic
+        lj_virial(2, 2) += vl[8];
     }
 
 
diff --git a/source/source_esolver/esolver_lj.h b/source/source_esolver/esolver_lj.h
index 1a96510eaa4..473c9bef20f 100644
--- a/source/source_esolver/esolver_lj.h
+++ b/source/source_esolver/esolver_lj.h
@@ -2,7 +2,8 @@
 #define ESOLVER_LJ_H
 
 #include "esolver.h"
-#include "source_cell/module_neighlist/unitcell_lite.h"
+#include "source_cell/module_neighlist/unitcell_plus.h"
+#include <vector>
 
 namespace ModuleESolver
 {
@@ -15,7 +16,7 @@ namespace ModuleESolver
             classname = "ESolver_LJ";
         }
 
-        UnitCellLite change_from_ucell_to_ucell_lite(const UnitCell& ucell);
+        UnitCellPlus change_from_ucell_to_ucell_plus(const UnitCell& ucell);
 
         void before_all_runners(UnitCell& ucell, const Input_para& inp) override;
 
@@ -55,6 +56,8 @@ namespace ModuleESolver
         double lj_potential=0.0;
         ModuleBase::matrix lj_force;
         ModuleBase::matrix lj_virial;
+        std::vector<int> atom_type_index;   ///< global atom index to UnitCell atom type
+        std::vector<int> atom_local_index;  ///< global atom index to local index inside atom type
         //---------------------------------------------------
     };
 }
diff --git a/source/source_esolver/esolver_nep.cpp b/source/source_esolver/esolver_nep.cpp
index 8944776aaa6..b586983a647 100644
--- a/source/source_esolver/esolver_nep.cpp
+++ b/source/source_esolver/esolver_nep.cpp
@@ -23,7 +23,7 @@
 #include "source_io/module_output/output_log.h"
 #include "source_io/module_output/cif_io.h"
 
-#include <numeric>
+#include <algorithm>
 #include <unordered_map>
 
 using namespace ModuleESolver;
@@ -34,9 +34,26 @@ void ESolver_NEP::before_all_runners(UnitCell& ucell, const Input_para& inp)
     nep_force.create(ucell.nat, 3);
     nep_virial.create(3, 3);
     atype.resize(ucell.nat);
+    nep_cell.resize(9);
+    nep_coord.resize(3 * ucell.nat);
+    nep_virial_sum.resize(9);
     _e.resize(ucell.nat);
     _f.resize(3 * ucell.nat);
     _v.resize(9 * ucell.nat);
+    atom_type_index.resize(ucell.nat);
+    atom_local_index.resize(ucell.nat);
+
+    int iat = 0;
+    for (int it = 0; it < ucell.ntype; ++it)
+    {
+        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
+        {
+            atom_type_index[iat] = it;
+            atom_local_index[iat] = ia;
+            ++iat;
+        }
+    }
+    assert(ucell.nat == iat);
 
     ModuleIO::CifParser::write(PARAM.globalv.global_out_dir + "STRU.cif", 
                                ucell, 
@@ -56,39 +73,35 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep)
 
     // note that NEP are column major, thus a transpose is needed
     // cell
-    std::vector<double> cell(9, 0.0);
-    cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom;
-    cell[1] = ucell.latvec.e21 * ucell.lat0_angstrom;
-    cell[2] = ucell.latvec.e31 * ucell.lat0_angstrom;
-    cell[3] = ucell.latvec.e12 * ucell.lat0_angstrom;
-    cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom;
-    cell[5] = ucell.latvec.e32 * ucell.lat0_angstrom;
-    cell[6] = ucell.latvec.e13 * ucell.lat0_angstrom;
-    cell[7] = ucell.latvec.e23 * ucell.lat0_angstrom;
-    cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom;
+    nep_cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom;
+    nep_cell[1] = ucell.latvec.e21 * ucell.lat0_angstrom;
+    nep_cell[2] = ucell.latvec.e31 * ucell.lat0_angstrom;
+    nep_cell[3] = ucell.latvec.e12 * ucell.lat0_angstrom;
+    nep_cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom;
+    nep_cell[5] = ucell.latvec.e32 * ucell.lat0_angstrom;
+    nep_cell[6] = ucell.latvec.e13 * ucell.lat0_angstrom;
+    nep_cell[7] = ucell.latvec.e23 * ucell.lat0_angstrom;
+    nep_cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom;
 
     // coord
-    std::vector<double> coord(3 * ucell.nat, 0.0);
-    int iat = 0;
+    nep_coord.resize(3 * ucell.nat);
     const int nat = ucell.nat;
-    for (int it = 0; it < ucell.ntype; ++it)
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int iat = 0; iat < nat; ++iat)
     {
-        for (int ia = 0; ia < ucell.atoms[it].na; ++ia)
-        {
-            coord[iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
-            coord[iat + nat] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
-            coord[iat + 2 * nat] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
-            iat++;
-        }
+        const int it = atom_type_index[iat];
+        const int ia = atom_local_index[iat];
+        nep_coord[iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom;
+        nep_coord[iat + nat] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom;
+        nep_coord[iat + 2 * nat] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom;
     }
-    assert(ucell.nat == iat);
 
 #ifdef __NEP
     nep_potential = 0.0;
     nep_force.zero_out();
     nep_virial.zero_out();
 
-    nep.compute(atype, cell, coord, _e, _f, _v);
+    nep.compute(atype, nep_cell, nep_coord, _e, _f, _v);
 
     // unit conversion
     const double fact_e = 1.0 / ModuleBase::Ry_to_eV;
@@ -97,11 +110,18 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep)
 
 
     // potential energy
-    nep_potential = fact_e * std::accumulate(_e.begin(), _e.end(), 0.0) ;
+    double energy_sum = 0.0;
+#pragma omp parallel for reduction(+:energy_sum) schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
+    {
+        energy_sum += _e[i];
+    }
+    nep_potential = fact_e * energy_sum;
     GlobalV::ofs_running << " #TOTAL ENERGY# " << std::setprecision(11) << nep_potential * ModuleBase::Ry_to_eV << " eV"
                          << std::endl;
     
     // forces
+#pragma omp parallel for schedule(static) if (nat >= 256)
     for (int i = 0; i < nat; ++i)
     {
         nep_force(i, 0) = _f[i] * fact_f;
@@ -110,22 +130,44 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep)
     }
 
     // virial
-    std::vector<double> v_sum(9, 0.0);
-    for (int j = 0; j < 9; ++j)
+    double v0 = 0.0;
+    double v1 = 0.0;
+    double v2 = 0.0;
+    double v3 = 0.0;
+    double v4 = 0.0;
+    double v5 = 0.0;
+    double v6 = 0.0;
+    double v7 = 0.0;
+    double v8 = 0.0;
+#pragma omp parallel for reduction(+:v0, v1, v2, v3, v4, v5, v6, v7, v8) schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
-        for (int i = 0; i < nat; ++i)
-        {
-            int index = j * nat + i;
-            v_sum[j] += _v[index];
-        }
+        v0 += _v[i];
+        v1 += _v[nat + i];
+        v2 += _v[2 * nat + i];
+        v3 += _v[3 * nat + i];
+        v4 += _v[4 * nat + i];
+        v5 += _v[5 * nat + i];
+        v6 += _v[6 * nat + i];
+        v7 += _v[7 * nat + i];
+        v8 += _v[8 * nat + i];
     }
+    nep_virial_sum[0] = v0;
+    nep_virial_sum[1] = v1;
+    nep_virial_sum[2] = v2;
+    nep_virial_sum[3] = v3;
+    nep_virial_sum[4] = v4;
+    nep_virial_sum[5] = v5;
+    nep_virial_sum[6] = v6;
+    nep_virial_sum[7] = v7;
+    nep_virial_sum[8] = v8;
 
     // virial -> stress
     for (int i = 0; i < 3; ++i)
     {
         for (int j = 0; j < 3; ++j)
         {
-            nep_virial(i, j) = v_sum[3 * i + j] * fact_v;
+            nep_virial(i, j) = nep_virial_sum[3 * i + j] * fact_v;
         }
     }
 #else
diff --git a/source/source_esolver/esolver_nep.h b/source/source_esolver/esolver_nep.h
index dfec17a83c2..bcbd658c319 100644
--- a/source/source_esolver/esolver_nep.h
+++ b/source/source_esolver/esolver_nep.h
@@ -95,9 +95,14 @@ class ESolver_NEP : public ESolver
 
     std::string nep_file;                ///< directory of NEP model file
     std::vector<int> atype = {};         ///< atom type mapping for NEP model
+    std::vector<int> atom_type_index;     ///< global atom index to UnitCell atom type
+    std::vector<int> atom_local_index;    ///< global atom index to local index inside atom type
     double nep_potential;                ///< computed potential energy
     ModuleBase::matrix nep_force;        ///< computed atomic forces
     ModuleBase::matrix nep_virial;       ///< computed lattice virials
+    std::vector<double> nep_cell;        ///< NEP cell buffer in Angstrom, column-major
+    std::vector<double> nep_coord;       ///< NEP coordinate buffer in Angstrom, column-major
+    std::vector<double> nep_virial_sum;  ///< summed per-atom virial components
     std::vector<double> _e;              ///< temporary storage for energy computation
     std::vector<double> _f;              ///< temporary storage for force computation
     std::vector<double> _v;              ///< temporary storage for virial computation
@@ -105,4 +110,4 @@ class ESolver_NEP : public ESolver
 
 } // namespace ModuleESolver
 
-#endif
\ No newline at end of file
+#endif
diff --git a/source/source_md/fire.cpp b/source/source_md/fire.cpp
index fa575b508d7..1260906e6e4 100644
--- a/source/source_md/fire.cpp
+++ b/source/source_md/fire.cpp
@@ -19,11 +19,7 @@ FIRE::FIRE(const Parameter& param_in, UnitCell& unit_in) : MD_base(param_in, uni
     n_min = 4;
     negative_count = 0;
     max = 0.0;
-
-    // BUGFIX:
-    // Do not override the force convergence threshold read from INPUT.
-    // force_thr is stored in internal force unit, Hartree/Bohr.
-    // force_thr = 1e-3;
+    force_thr = 1e-3;
 }
 
 FIRE::~FIRE()
@@ -156,44 +152,22 @@ void FIRE::restart(const std::string& global_readin_dir)
     return;
 }
 
+
 void FIRE::check_force(void)
 {
     max = 0.0;
 
-    int movable_dof = 0;
-
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+#pragma omp parallel for default(none) shared(force, nat) reduction(max:max) \
+    schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         for (int j = 0; j < 3; ++j)
         {
-            // Only movable degrees of freedom should be used
-            // in the FIRE convergence criterion.
-            //
-            // For example:
-            // m 1 1 1 -> x/y/z are included.
-            // m 1 0 1 -> y is excluded.
-            // m 0 0 0 -> this atom contributes no DOF to convergence.
-            if (!ionmbl[i][j])
-            {
-                continue;
-            }
-
-            ++movable_dof;
-
-            if (max < std::abs(force[i][j]))
-            {
-                max = std::abs(force[i][j]);
-            }
+            max = std::max(max, std::abs(force[i][j]));
         }
     }
 
-    // If there are no movable degrees of freedom, there is nothing to optimize.
-    if (movable_dof == 0)
-    {
-        stop = true;
-        return;
-    }
-
     if (2.0 * max < force_thr)
     {
         stop = true;
@@ -215,56 +189,25 @@ void FIRE::check_fire(void)
         dt_max = 2.5 * md_dt;
     }
 
-    int movable_dof = 0;
-
-    // Compute P, |F| and |v| only on movable degrees of freedom.
-    // Fixed atoms/directions may have non-zero raw forces, but they should not
-    // affect the FIRE velocity projection or adaptive time-step control.
-    for (int i = 0; i < ucell.nat; ++i)
-    {
-        for (int j = 0; j < 3; ++j)
-        {
-            if (!ionmbl[i][j])
-            {
-                // Keep frozen components clean.
-                vel[i][j] = 0.0;
-                continue;
-            }
-
-            ++movable_dof;
-
-            P += vel[i][j] * force[i][j];
-            sumforce += force[i][j] * force[i][j];
-            normvel += vel[i][j] * vel[i][j];
-        }
-    }
+    const int nat = ucell.nat;
 
-    // No movable degrees of freedom: nothing to update.
-    if (movable_dof == 0)
+#pragma omp parallel for reduction(+:P, sumforce, normvel) schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
-        return;
+        P += vel[i].x * force[i].x + vel[i].y * force[i].y + vel[i].z * force[i].z;
+        sumforce += force[i].norm2();
+        normvel += vel[i].norm2();
     }
 
-    sumforce = std::sqrt(sumforce);
-    normvel = std::sqrt(normvel);
+    sumforce = sqrt(sumforce);
+    normvel = sqrt(normvel);
 
-    // If force or velocity norm is zero, the velocity projection is undefined.
-    // Avoid 0/0. In a truly converged case check_force() should stop the run.
-    if (sumforce > 0.0 && normvel > 0.0)
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
-        for (int i = 0; i < ucell.nat; ++i)
+        for (int j = 0; j < 3; ++j)
         {
-            for (int j = 0; j < 3; ++j)
-            {
-                if (!ionmbl[i][j])
-                {
-                    vel[i][j] = 0.0;
-                    continue;
-                }
-
-                vel[i][j] = (1.0 - alpha) * vel[i][j]
-                          + alpha * force[i][j] / sumforce * normvel;
-            }
+            vel[i][j] = (1.0 - alpha) * vel[i][j] + alpha * force[i][j] / sumforce * normvel;
         }
     }
 
@@ -282,7 +225,8 @@ void FIRE::check_fire(void)
         md_dt *= fdec;
         negative_count = 0;
 
-        for (int i = 0; i < ucell.nat; ++i)
+#pragma omp parallel for schedule(static) if (nat >= 256)
+        for (int i = 0; i < nat; ++i)
         {
             for (int j = 0; j < 3; ++j)
             {
@@ -292,6 +236,6 @@ void FIRE::check_fire(void)
 
         alpha = alpha_start;
     }
-
+    
     return;
 }
diff --git a/source/source_md/langevin.cpp b/source/source_md/langevin.cpp
index 31996810aa0..a6b1ed12a28 100644
--- a/source/source_md/langevin.cpp
+++ b/source/source_md/langevin.cpp
@@ -88,14 +88,16 @@ void Langevin::post_force()
     if (my_rank == 0)
     {
         double t_target = MD_func::target_temp(step_ + step_rst_, mdp.md_nstep, md_tfirst, md_tlast);
-        ModuleBase::Vector3<double> fictitious_force;
+#pragma omp parallel for default(none) shared(ucell, allmass, vel, md_damp, \
+                                         t_target, md_dt, force, total_force) \
+    schedule(static) if (ucell.nat >= 256)
         for (int i = 0; i < ucell.nat; ++i)
         {
-            fictitious_force = -allmass[i] * vel[i] / md_damp;
+            ModuleBase::Vector3<double> fictitious_force = -allmass[i] * vel[i] / md_damp;
             for (int j = 0; j < 3; ++j)
             {
                 fictitious_force[j] += sqrt(24.0 * t_target * allmass[i] / md_damp / md_dt)
-                                       * (static_cast<double>(std::rand()) / RAND_MAX - 0.5);
+                                       * (MD_func::uniform_rand_thread_safe() - 0.5);
             }
             total_force[i] = force[i] + fictitious_force;
         }
diff --git a/source/source_md/md_base.cpp b/source/source_md/md_base.cpp
index 390e1c2b082..f87fca9d2ee 100644
--- a/source/source_md/md_base.cpp
+++ b/source/source_md/md_base.cpp
@@ -96,7 +96,9 @@ void MD_base::update_pos()
 {
     if (my_rank == 0)
     {
-        for (int i = 0; i < ucell.nat; ++i)
+        const int natom = ucell.nat;
+#pragma omp parallel for schedule(static) if (natom >= 256)
+        for (int i = 0; i < natom; ++i)
         {
             for (int k = 0; k < 3; ++k)
             {
@@ -127,7 +129,9 @@ void MD_base::update_vel(const ModuleBase::Vector3<double>* force)
 {
     if (my_rank == 0)
     {
-        for (int i = 0; i < ucell.nat; ++i)
+        const int natom = ucell.nat;
+#pragma omp parallel for schedule(static) if (natom >= 256)
+        for (int i = 0; i < natom; ++i)
         {
             for (int k = 0; k < 3; ++k)
             {
diff --git a/source/source_md/md_func.cpp b/source/source_md/md_func.cpp
index 6bd1b60dd59..4d653c07e9a 100644
--- a/source/source_md/md_func.cpp
+++ b/source/source_md/md_func.cpp
@@ -4,6 +4,8 @@
 #include "source_base/timer.h"
 #include "source_io/module_parameter/parameter.h"
 
+#include <random>
+
 
 namespace MD_func
 {
@@ -40,10 +42,27 @@ double gaussrand()
     return xx;
 }
 
+double gaussrand_thread_safe()
+{
+    // thread_local guarantees each OpenMP thread gets its own RNG state,
+    // seeded independently via std::random_device.
+    thread_local std::mt19937 gen(std::random_device{}());
+    thread_local std::normal_distribution<double> dist(0.0, 1.0);
+    return dist(gen);
+}
+
+double uniform_rand_thread_safe()
+{
+    thread_local std::mt19937 gen(std::random_device{}());
+    thread_local std::uniform_real_distribution<double> dist(0.0, 1.0);
+    return dist(gen);
+}
+
 double kinetic_energy(const int& natom, const ModuleBase::Vector3<double>* vel, const double* allmass)
 {
     double ke = 0;
 
+#pragma omp parallel for reduction(+:ke) schedule(static) if (natom >= 256)
     for (int ion = 0; ion < natom; ++ion)
     {
         ke += 0.5 * allmass[ion] * vel[ion].norm2();
@@ -52,6 +71,43 @@ double kinetic_energy(const int& natom, const ModuleBase::Vector3<double>* vel,
     return ke;
 }
 
+MDKineticState calc_kinetic_state(const int& natom,
+                                  const int& frozen_freedom,
+                                  const double* allmass,
+                                  const ModuleBase::Vector3<double>* vel)
+{
+    MDKineticState state;
+    if (3 * natom == frozen_freedom)
+    {
+        return state;
+    }
+
+    state.kinetic = kinetic_energy(natom, vel, allmass);
+    state.temperature = 2 * state.kinetic / (3 * natom - frozen_freedom);
+    return state;
+}
+
+MDStressState calc_stress_state(const int& natom,
+                                const double& omega,
+                                const ModuleBase::Vector3<double>* vel,
+                                const double* allmass,
+                                const ModuleBase::matrix& virial)
+{
+    MDStressState state;
+    temp_vector(natom, vel, allmass, state.temperature_tensor);
+    state.stress.create(3, 3);
+
+    for (int i = 0; i < 3; ++i)
+    {
+        for (int j = 0; j < 3; ++j)
+        {
+            state.stress(i, j) = virial(i, j) + state.temperature_tensor(i, j) / omega;
+        }
+    }
+
+    return state;
+}
+
 void compute_stress(const UnitCell& unit_in,
                     const ModuleBase::Vector3<double>* vel,
                     const double* allmass,
@@ -61,17 +117,7 @@ void compute_stress(const UnitCell& unit_in,
 {
     if (cal_stress)
     {
-        ModuleBase::matrix t_vector;
-
-        temp_vector(unit_in.nat, vel, allmass, t_vector);
-
-        for (int i = 0; i < 3; ++i)
-        {
-            for (int j = 0; j < 3; ++j)
-            {
-                stress(i, j) = virial(i, j) + t_vector(i, j) / unit_in.omega;
-            }
-        }
+        stress = calc_stress_state(unit_in.nat, unit_in.omega, vel, allmass, virial).stress;
     }
 
     return;
@@ -121,6 +167,7 @@ void rescale_vel(const int& natom,
         factor = 0.5 * (3 * natom - frozen_freedom) * temperature / kinetic_energy(natom, vel, allmass);
     }
 
+#pragma omp parallel for schedule(static) if (natom >= 256)
     for (int i = 0; i < natom; i++)
     {
         vel[i] = vel[i] * sqrt(factor);
@@ -273,7 +320,9 @@ void force_virial(ModuleESolver::ESolver* p_esolver,
     force_temp *= 0.5;
     virial *= 0.5;
 
-    for (int i = 0; i < unit_in.nat; ++i)
+    const int natom = unit_in.nat;
+#pragma omp parallel for schedule(static) if (natom >= 256)
+    for (int i = 0; i < natom; ++i)
     {
         for (int j = 0; j < 3; ++j)
         {
@@ -463,8 +512,9 @@ double current_temp(double& kinetic,
     }
     else
     {
-        kinetic = kinetic_energy(natom, vel, allmass);
-        return 2 * kinetic / (3 * natom - frozen_freedom);
+        const MDKineticState state = calc_kinetic_state(natom, frozen_freedom, allmass, vel);
+        kinetic = state.kinetic;
+        return state.temperature;
     }
 }
 
@@ -475,17 +525,45 @@ void temp_vector(const int& natom,
 {
     t_vector.create(3, 3);
 
+    double t00 = 0.0;
+    double t01 = 0.0;
+    double t02 = 0.0;
+    double t10 = 0.0;
+    double t11 = 0.0;
+    double t12 = 0.0;
+    double t20 = 0.0;
+    double t21 = 0.0;
+    double t22 = 0.0;
+
+#pragma omp parallel for reduction(+:t00, t01, t02, t10, t11, t12, t20, t21, t22) schedule(static) if (natom >= 256)
     for (int ion = 0; ion < natom; ++ion)
     {
-        for (int i = 0; i < 3; ++i)
-        {
-            for (int j = 0; j < 3; ++j)
-            {
-                t_vector(i, j) += allmass[ion] * vel[ion][i] * vel[ion][j];
-            }
-        }
+        const double mass = allmass[ion];
+        const double vx = vel[ion].x;
+        const double vy = vel[ion].y;
+        const double vz = vel[ion].z;
+
+        t00 += mass * vx * vx;
+        t01 += mass * vx * vy;
+        t02 += mass * vx * vz;
+        t10 += mass * vy * vx;
+        t11 += mass * vy * vy;
+        t12 += mass * vy * vz;
+        t20 += mass * vz * vx;
+        t21 += mass * vz * vy;
+        t22 += mass * vz * vz;
     }
 
+    t_vector(0, 0) = t00;
+    t_vector(0, 1) = t01;
+    t_vector(0, 2) = t02;
+    t_vector(1, 0) = t10;
+    t_vector(1, 1) = t11;
+    t_vector(1, 2) = t12;
+    t_vector(2, 0) = t20;
+    t_vector(2, 1) = t21;
+    t_vector(2, 2) = t22;
+
     return;
 }
 
diff --git a/source/source_md/md_func.h b/source/source_md/md_func.h
index be433ffe4ac..39c28910779 100644
--- a/source/source_md/md_func.h
+++ b/source/source_md/md_func.h
@@ -1,6 +1,7 @@
 #ifndef MD_FUNC_H
 #define MD_FUNC_H
 
+#include "md_statistics.h"
 #include "source_esolver/esolver.h"
 
 class Parameter;
@@ -24,6 +25,28 @@ namespace MD_func
  */
 double gaussrand();
 
+/**
+ * @brief generate a Gaussian random number (thread-safe)
+ *
+ * Uses thread_local std::mt19937 and std::normal_distribution.
+ * Thread-safe alternative to gaussrand() for use inside
+ * #pragma omp parallel regions.
+ *
+ * @return Gaussian random number with mean 0 and variance 1
+ */
+double gaussrand_thread_safe();
+
+/**
+ * @brief generate a uniform random number in [0,1) (thread-safe)
+ *
+ * Uses thread_local std::mt19937 and std::uniform_real_distribution.
+ * Thread-safe alternative to std::rand() for use inside
+ * #pragma omp parallel regions.
+ *
+ * @return uniform random number in [0, 1)
+ */
+double uniform_rand_thread_safe();
+
 /**
  * @brief initialize the atomic velocities
  *
@@ -117,6 +140,14 @@ void force_virial(ModuleESolver::ESolver* p_esolver,
  */
 double kinetic_energy(const int& natom, const ModuleBase::Vector3<double>* vel, const double* allmass);
 
+/**
+ * @brief calculate kinetic energy and temperature without writing caller-owned state
+ */
+MDKineticState calc_kinetic_state(const int& natom,
+                                  const int& frozen_freedom,
+                                  const double* allmass,
+                                  const ModuleBase::Vector3<double>* vel);
+
 /**
  * @brief calculate the total stress tensor
  *
@@ -134,6 +165,15 @@ void compute_stress(const UnitCell& unit_in,
                     const ModuleBase::matrix& virial,
                     ModuleBase::matrix& stress);
 
+/**
+ * @brief calculate stress and ionic temperature tensor without writing caller-owned state
+ */
+MDStressState calc_stress_state(const int& natom,
+                                const double& omega,
+                                const ModuleBase::Vector3<double>* vel,
+                                const double* allmass,
+                                const ModuleBase::matrix& virial);
+
 /**
  * @brief output the stress information
  *
diff --git a/source/source_md/md_statistics.h b/source/source_md/md_statistics.h
new file mode 100644
index 00000000000..e7bef175be5
--- /dev/null
+++ b/source/source_md/md_statistics.h
@@ -0,0 +1,23 @@
+#ifndef MD_STATISTICS_H
+#define MD_STATISTICS_H
+
+#include "source_base/matrix.h"
+
+namespace MD_func
+{
+
+struct MDKineticState
+{
+    double kinetic = 0.0;
+    double temperature = 0.0;
+};
+
+struct MDStressState
+{
+    ModuleBase::matrix stress;
+    ModuleBase::matrix temperature_tensor;
+};
+
+} // namespace MD_func
+
+#endif // MD_STATISTICS_H
diff --git a/source/source_md/msst.cpp b/source/source_md/msst.cpp
index e33c24b45e5..dcd00b01926 100644
--- a/source/source_md/msst.cpp
+++ b/source/source_md/msst.cpp
@@ -239,8 +239,9 @@ void MSST::restart(const std::string& global_readin_dir)
 double MSST::vel_sum() const
 {
     double vsum = 0;
-
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+#pragma omp parallel for reduction(+:vsum) schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         vsum += vel[i].norm2();
     }
@@ -262,7 +263,9 @@ void MSST::rescale(std::ofstream& ofs, const double& volume)
     unitcell::setup_cell_after_vc(ucell,ofs);
 
     /// rescale velocity
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         vel[i][sd] *= dilation[sd];
     }
@@ -276,8 +279,10 @@ void MSST::propagate_vel()
         const int sd = mdp.msst_direction;
         const double dthalf = 0.5 * md_dt;
         const double fac = msst_vis * pow(omega[sd], 2) / (vsum * ucell.omega);
+        const int nat = ucell.nat;
 
-        for (int i = 0; i < ucell.nat; ++i)
+#pragma omp parallel for schedule(static) if (nat >= 256)
+        for (int i = 0; i < nat; ++i)
         {
             ModuleBase::Vector3<double> const_C = force[i] / allmass[i];
             ModuleBase::Vector3<double> const_D;
diff --git a/source/source_md/nhchain.cpp b/source/source_md/nhchain.cpp
index dc72669ec4e..76cce8da662 100644
--- a/source/source_md/nhchain.cpp
+++ b/source/source_md/nhchain.cpp
@@ -553,7 +553,9 @@ void Nose_Hoover::particle_thermo()
     }
 
     /// rescale velocity due to thermostats
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         vel[i] *= scale;
     }
@@ -695,7 +697,9 @@ void Nose_Hoover::vel_baro()
         factor[i] = exp(-(v_omega[i] + mtk_term) * md_dt / 4);
     }
 
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         for (int j = 0; j < 3; ++j)
         {
diff --git a/source/source_md/run_md.cpp b/source/source_md/run_md.cpp
index ef28e5c8975..b7aab6511a0 100644
--- a/source/source_md/run_md.cpp
+++ b/source/source_md/run_md.cpp
@@ -12,41 +12,48 @@
 #include "verlet.h"
 #include "source_cell/update_cell.h"
 #include "source_cell/print_cell.h"
-namespace Run_MD
-{
+#include <memory>
 
-void md_line(UnitCell& unit_in, ModuleESolver::ESolver* p_esolver, const Parameter& param_in)
+namespace
+{
+std::unique_ptr<MD_base> create_md_runner(const Parameter& param_in, UnitCell& unit_in)
 {
-    ModuleBase::TITLE("Run_MD", "md_line");
-    ModuleBase::timer::start("Run_MD", "md_line");
-
-    /// determine the md_type
-    MD_base* mdrun = nullptr;
     if (param_in.mdp.md_type == "fire")
     {
-        mdrun = new FIRE(param_in, unit_in);
+        return std::unique_ptr<MD_base>(new FIRE(param_in, unit_in));
     }
-    else if ((param_in.mdp.md_type == "nvt" && param_in.mdp.md_thermostat == "nhc") || param_in.mdp.md_type == "npt")
+    if ((param_in.mdp.md_type == "nvt" && param_in.mdp.md_thermostat == "nhc") || param_in.mdp.md_type == "npt")
     {
-        mdrun = new Nose_Hoover(param_in, unit_in);
+        return std::unique_ptr<MD_base>(new Nose_Hoover(param_in, unit_in));
     }
-    else if (param_in.mdp.md_type == "nve" || param_in.mdp.md_type == "nvt")
+    if (param_in.mdp.md_type == "nve" || param_in.mdp.md_type == "nvt")
     {
-        mdrun = new Verlet(param_in, unit_in);
+        return std::unique_ptr<MD_base>(new Verlet(param_in, unit_in));
     }
-    else if (param_in.mdp.md_type == "langevin")
+    if (param_in.mdp.md_type == "langevin")
     {
-        mdrun = new Langevin(param_in, unit_in);
+        return std::unique_ptr<MD_base>(new Langevin(param_in, unit_in));
     }
-    else if (param_in.mdp.md_type == "msst")
+    if (param_in.mdp.md_type == "msst")
     {
-        mdrun = new MSST(param_in, unit_in);
-    }
-    else
-    {
-        ModuleBase::WARNING_QUIT("md_line", "no such md_type!");
+        return std::unique_ptr<MD_base>(new MSST(param_in, unit_in));
     }
 
+    ModuleBase::WARNING_QUIT("md_line", "no such md_type!");
+    return nullptr;
+}
+} // namespace
+
+namespace Run_MD
+{
+
+void md_line(UnitCell& unit_in, ModuleESolver::ESolver* p_esolver, const Parameter& param_in)
+{
+    ModuleBase::TITLE("Run_MD", "md_line");
+    ModuleBase::timer::start("Run_MD", "md_line");
+
+    std::unique_ptr<MD_base> mdrun = create_md_runner(param_in, unit_in);
+
     /// md cycle, mohan update 2026-01-04, change '<=' to '<'
     while ((mdrun->step_ + mdrun->step_rst_) < param_in.mdp.md_nstep && !mdrun->stop)
     {
@@ -129,7 +136,6 @@ void md_line(UnitCell& unit_in, ModuleESolver::ESolver* p_esolver, const Paramet
         mdrun->step_++;
     }
 
-    delete mdrun;
     ModuleBase::timer::end("Run_MD", "md_line");
     return;
 }
diff --git a/source/source_md/test/CMakeLists.txt b/source/source_md/test/CMakeLists.txt
index 3ff705c4f4a..2e04f1d70c0 100644
--- a/source/source_md/test/CMakeLists.txt
+++ b/source/source_md/test/CMakeLists.txt
@@ -37,7 +37,6 @@ list(APPEND depend_files
   ../../source_base/realarray.cpp
   ../../source_base/complexarray.cpp
   ../../source_base/complexmatrix.cpp
-  ../../source_base/global_variable.cpp
   ../../source_base/libm/branred.cpp
   ../../source_base/libm/sincos.cpp
   ../../source_base/math_integral.cpp
diff --git a/source/source_md/test/fire_test.cpp b/source/source_md/test/fire_test.cpp
index 3b294da46ac..e52ae4cbecd 100644
--- a/source/source_md/test/fire_test.cpp
+++ b/source/source_md/test/fire_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
 #include "source_md/fire.h"
-#include "setcell.h"
+#include "md_test_fixture.h"
 #define doublethreshold 1e-12
 
 /************************************************
@@ -35,31 +34,8 @@
  *     - output MD information such as energy, temperature, and pressure
  */
 
-class FIREtest : public testing::Test
+class FIREtest : public MdIntegratorFixture<FIRE>
 {
-  protected:
-    MD_base* mdrun;
-    UnitCell ucell;
-    Parameter param_in;
-    ModuleESolver::ESolver* p_esolver;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-
-        p_esolver = new ModuleESolver::ESolver_LJ();
-        p_esolver->before_all_runners(ucell, param_in.inp);
-
-        mdrun = new FIRE(param_in, ucell);
-        mdrun->setup(p_esolver, PARAM.sys.global_readin_dir);
-    }
-
-    void TearDown()
-    {
-        delete mdrun;
-        delete p_esolver;
-    }
 };
 
 TEST_F(FIREtest, Setup)
@@ -167,7 +143,7 @@ TEST_F(FIREtest, Restart)
     mdrun->restart(PARAM.sys.global_readin_dir);
     remove("Restart_md.txt");
 
-    FIRE* fire = dynamic_cast<FIRE*>(mdrun);
+    FIRE* fire = dynamic_cast<FIRE*>(mdrun.get());
     EXPECT_EQ(mdrun->step_rst_, 3);
     EXPECT_EQ(fire->alpha, 0.1);
     EXPECT_EQ(fire->negative_count, 0);
diff --git a/source/source_md/test/langevin_test.cpp b/source/source_md/test/langevin_test.cpp
index 69df605b153..65d462a86da 100644
--- a/source/source_md/test/langevin_test.cpp
+++ b/source/source_md/test/langevin_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
 #include "source_md/langevin.h"
-#include "setcell.h"
+#include "md_test_fixture.h"
 #define doublethreshold 1e-12
 
 /************************************************
@@ -35,31 +34,8 @@
  *     - output MD information such as energy, temperature, and pressure
  */
 
-class Langevin_test : public testing::Test
+class Langevin_test : public MdIntegratorFixture<Langevin>
 {
-  protected:
-    MD_base* mdrun;
-    UnitCell ucell;
-    Parameter param_in;
-    ModuleESolver::ESolver* p_esolver;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-
-        p_esolver = new ModuleESolver::ESolver_LJ();
-        p_esolver->before_all_runners(ucell, param_in.inp);
-
-        mdrun = new Langevin(param_in, ucell);
-        mdrun->setup(p_esolver, PARAM.sys.global_readin_dir);
-    }
-
-    void TearDown()
-    {
-        delete mdrun;
-        delete p_esolver;
-    }
 };
 
 TEST_F(Langevin_test, setup)
diff --git a/source/source_md/test/lj_pot_test.cpp b/source/source_md/test/lj_pot_test.cpp
index 64c0b52fe6b..99cec432b56 100644
--- a/source/source_md/test/lj_pot_test.cpp
+++ b/source/source_md/test/lj_pot_test.cpp
@@ -1,9 +1,9 @@
 #include "gtest/gtest.h"
 #define private public
 #include "source_io/module_parameter/parameter.h"
+#include "md_test_fixture.h"
 #include "source_esolver/esolver_lj.h"
 #include "source_md/md_func.h"
-#include "setcell.h"
 #undef private
 #define doublethreshold 1e-12
 
@@ -17,46 +17,23 @@
  *     - calculate energy, force, virial for lj pot
  */
 
-class LJ_pot_test : public testing::Test
+class LJ_pot_test : public LjPotTestFixture
 {
-  protected:
-    ModuleBase::Vector3<double>* force;
-    ModuleBase::matrix stress;
-    double potential;
-    int natom;
-    UnitCell ucell;
-    Input_para input;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-
-        natom = ucell.nat;
-        force = new ModuleBase::Vector3<double>[natom];
-        stress.create(3, 3);
-
-        Setcell::parameters(input);
-    }
-
-    void TearDown()
-    {
-        delete[] force;
-    }
 };
 
 TEST_F(LJ_pot_test, potential)
 {
-    ModuleESolver::ESolver* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver> p_esolver(new ModuleESolver::ESolver_LJ());
     p_esolver->before_all_runners(ucell, input);
-    MD_func::force_virial(p_esolver, 0, ucell, potential, force, true, stress);
+    MD_func::force_virial(p_esolver.get(), 0, ucell, potential, force, true, stress);
     EXPECT_NEAR(potential, -0.011957818623534381, doublethreshold);
 }
 
 TEST_F(LJ_pot_test, force)
 {
-    ModuleESolver::ESolver* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver> p_esolver(new ModuleESolver::ESolver_LJ());
     p_esolver->before_all_runners(ucell, input);
-    MD_func::force_virial(p_esolver, 0, ucell, potential, force, true, stress);
+    MD_func::force_virial(p_esolver.get(), 0, ucell, potential, force, true, stress);
     EXPECT_NEAR(force[0].x, 0.00049817733089377704, doublethreshold);
     EXPECT_NEAR(force[0].y, 0.00082237246837022328, doublethreshold);
     EXPECT_NEAR(force[0].z, -3.0493186101154812e-20, doublethreshold);
@@ -73,9 +50,9 @@ TEST_F(LJ_pot_test, force)
 
 TEST_F(LJ_pot_test, stress)
 {
-    ModuleESolver::ESolver* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver> p_esolver(new ModuleESolver::ESolver_LJ());
     p_esolver->before_all_runners(ucell, input);
-    MD_func::force_virial(p_esolver, 0, ucell, potential, force, true, stress);
+    MD_func::force_virial(p_esolver.get(), 0, ucell, potential, force, true, stress);
     EXPECT_NEAR(stress(0, 0), 8.0360222227631859e-07, doublethreshold);
     EXPECT_NEAR(stress(0, 1), 1.7207745586539077e-07, doublethreshold);
     EXPECT_NEAR(stress(0, 2), 0, doublethreshold);
@@ -89,7 +66,7 @@ TEST_F(LJ_pot_test, stress)
 
 TEST_F(LJ_pot_test, RcutSearchRadius)
 {
-    ModuleESolver::ESolver_LJ* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver_LJ> p_esolver(new ModuleESolver::ESolver_LJ());
     ucell.ntype = 2;
     std::vector<double> rcut = {3.0};
     p_esolver->rcut_search_radius(ucell.ntype, rcut);
@@ -114,7 +91,7 @@ TEST_F(LJ_pot_test, RcutSearchRadius)
 
 TEST_F(LJ_pot_test, SetC6C12)
 {
-    ModuleESolver::ESolver_LJ* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver_LJ> p_esolver(new ModuleESolver::ESolver_LJ());
     ucell.ntype = 2;
 
     // no rule
@@ -187,7 +164,7 @@ TEST_F(LJ_pot_test, SetC6C12)
 
 TEST_F(LJ_pot_test, CalEnShift)
 {
-    ModuleESolver::ESolver_LJ* p_esolver = new ModuleESolver::ESolver_LJ();
+    std::unique_ptr<ModuleESolver::ESolver_LJ> p_esolver(new ModuleESolver::ESolver_LJ());
     ucell.ntype = 2;
 
     std::vector<double> rcut = {3.0};
@@ -214,4 +191,4 @@ TEST_F(LJ_pot_test, CalEnShift)
     EXPECT_NEAR(p_esolver->en_shift(0, 1), -3.303688865319793e-07, doublethreshold);
     EXPECT_NEAR(p_esolver->en_shift(1, 0), -3.303688865319793e-07, doublethreshold);
     EXPECT_NEAR(p_esolver->en_shift(1, 1), -5.6443326024140752e-06, doublethreshold);
-}
\ No newline at end of file
+}
diff --git a/source/source_md/test/md_func_test.cpp b/source/source_md/test/md_func_test.cpp
index eb9ce57a5f9..a9bae026fa8 100644
--- a/source/source_md/test/md_func_test.cpp
+++ b/source/source_md/test/md_func_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
+#include "md_test_fixture.h"
 #include "source_md/md_func.h"
-#include "setcell.h"
 
 #define doublethreshold 1e-12
 /************************************************
@@ -50,45 +49,8 @@
  *     - test the current_md_info function with an incorrect file path
  */
 
-class MD_func_test : public testing::Test
+class MD_func_test : public MdFuncTestFixture
 {
-  protected:
-    UnitCell ucell;
-    double* allmass;                    // atom mass
-    ModuleBase::Vector3<double>* pos;   // atom position
-    ModuleBase::Vector3<double>* vel;   // atom velocity
-    ModuleBase::Vector3<int>* ionmbl;   // atom is frozen or not
-    ModuleBase::Vector3<double>* force; // atom force
-    ModuleBase::matrix virial;          // virial for this lattice
-    ModuleBase::matrix stress;          // stress for this lattice
-    double potential;                   // potential energy
-    int natom;                          // atom number
-    double temperature;                 // temperature
-    int frozen_freedom;                 // frozen_freedom
-    Parameter param_in;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-        natom = ucell.nat;
-        allmass = new double[natom];
-        pos = new ModuleBase::Vector3<double>[natom];
-        ionmbl = new ModuleBase::Vector3<int>[natom];
-        vel = new ModuleBase::Vector3<double>[natom];
-        force = new ModuleBase::Vector3<double>[natom];
-        stress.create(3, 3);
-        virial.create(3, 3);
-    }
-
-    void TearDown()
-    {
-        delete[] allmass;
-        delete[] pos;
-        delete[] vel;
-        delete[] ionmbl;
-        delete[] force;
-    }
 };
 
 TEST_F(MD_func_test, gaussrand)
diff --git a/source/source_md/test/md_test_fixture.h b/source/source_md/test/md_test_fixture.h
new file mode 100644
index 00000000000..fccc19e96f0
--- /dev/null
+++ b/source/source_md/test/md_test_fixture.h
@@ -0,0 +1,111 @@
+#ifndef MD_TEST_FIXTURE_H
+#define MD_TEST_FIXTURE_H
+
+#include "gtest/gtest.h"
+#include "source_esolver/esolver_lj.h"
+#include "source_io/module_parameter/parameter.h"
+#include "source_md/md_base.h"
+#include "setcell.h"
+
+#include <memory>
+#include <vector>
+
+class MdTestBase : public testing::Test
+{
+  protected:
+    UnitCell ucell;
+    Parameter param_in;
+    std::unique_ptr<ModuleESolver::ESolver> p_esolver;
+
+    void SetUp() override
+    {
+        Setcell::setupcell(ucell);
+        Setcell::parameters(param_in.input);
+
+        p_esolver.reset(new ModuleESolver::ESolver_LJ());
+        p_esolver->before_all_runners(ucell, param_in.inp);
+    }
+};
+
+template <class Integrator>
+class MdIntegratorFixture : public MdTestBase
+{
+  protected:
+    std::unique_ptr<MD_base> mdrun;
+
+    void SetUp() override
+    {
+        MdTestBase::SetUp();
+        mdrun.reset(new Integrator(param_in, ucell));
+        mdrun->setup(p_esolver.get(), PARAM.sys.global_readin_dir);
+    }
+};
+
+class MdFuncTestFixture : public testing::Test
+{
+  protected:
+    UnitCell ucell;
+    std::vector<double> allmass_store;
+    std::vector<ModuleBase::Vector3<double>> pos_store;
+    std::vector<ModuleBase::Vector3<double>> vel_store;
+    std::vector<ModuleBase::Vector3<int>> ionmbl_store;
+    std::vector<ModuleBase::Vector3<double>> force_store;
+    double* allmass = nullptr;
+    ModuleBase::Vector3<double>* pos = nullptr;
+    ModuleBase::Vector3<double>* vel = nullptr;
+    ModuleBase::Vector3<int>* ionmbl = nullptr;
+    ModuleBase::Vector3<double>* force = nullptr;
+    ModuleBase::matrix virial;
+    ModuleBase::matrix stress;
+    double potential = 0.0;
+    int natom = 0;
+    double temperature = 0.0;
+    int frozen_freedom = 0;
+    Parameter param_in;
+
+    void SetUp() override
+    {
+        Setcell::setupcell(ucell);
+        Setcell::parameters(param_in.input);
+        natom = ucell.nat;
+
+        allmass_store.resize(natom);
+        pos_store.resize(natom);
+        vel_store.resize(natom);
+        ionmbl_store.resize(natom);
+        force_store.resize(natom);
+        allmass = allmass_store.data();
+        pos = pos_store.data();
+        vel = vel_store.data();
+        ionmbl = ionmbl_store.data();
+        force = force_store.data();
+        stress.create(3, 3);
+        virial.create(3, 3);
+    }
+};
+
+class LjPotTestFixture : public testing::Test
+{
+  protected:
+    std::vector<ModuleBase::Vector3<double>> force_store;
+    ModuleBase::Vector3<double>* force = nullptr;
+    ModuleBase::matrix stress;
+    double potential = 0.0;
+    int natom = 0;
+    UnitCell ucell;
+    Input_para input;
+
+    void SetUp() override
+    {
+        Setcell::setupcell(ucell);
+
+        natom = ucell.nat;
+        force_store.resize(natom);
+        force = force_store.data();
+        stress.create(3, 3);
+
+        Setcell::parameters(input);
+    }
+};
+
+#endif // MD_TEST_FIXTURE_H
diff --git a/source/source_md/test/msst_test.cpp b/source/source_md/test/msst_test.cpp
index 7d0fd8054d1..8b3982be73b 100644
--- a/source/source_md/test/msst_test.cpp
+++ b/source/source_md/test/msst_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
 #include "source_md/msst.h"
-#include "setcell.h"
+#include "md_test_fixture.h"
 #define doublethreshold 1e-12
 
 /************************************************
@@ -35,31 +34,8 @@
  *     - output MD information such as energy, temperature, and pressure
  */
 
-class MSST_test : public testing::Test
+class MSST_test : public MdIntegratorFixture<MSST>
 {
-  protected:
-    MD_base* mdrun;
-    UnitCell ucell;
-    Parameter param_in;
-    ModuleESolver::ESolver* p_esolver;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-
-        p_esolver = new ModuleESolver::ESolver_LJ();
-        p_esolver->before_all_runners(ucell, param_in.inp);
-
-        mdrun = new MSST(param_in, ucell);
-        mdrun->setup(p_esolver, PARAM.sys.global_readin_dir);
-    }
-
-    void TearDown()
-    {
-        delete mdrun;
-        delete p_esolver;
-    }
 };
 
 TEST_F(MSST_test, setup)
@@ -208,7 +184,7 @@ TEST_F(MSST_test, restart)
     mdrun->restart(PARAM.sys.global_readin_dir);
     remove("Restart_md.txt");
 
-    MSST* msst = dynamic_cast<MSST*>(mdrun);
+    MSST* msst = dynamic_cast<MSST*>(mdrun.get());
     EXPECT_EQ(mdrun->step_rst_, 3);
     EXPECT_EQ(msst->omega[mdrun->mdp.msst_direction], -0.00977662);
     EXPECT_EQ(msst->e0, -0.00768262);
diff --git a/source/source_md/test/nhchain_test.cpp b/source/source_md/test/nhchain_test.cpp
index 647df0a730d..4f7a4244011 100644
--- a/source/source_md/test/nhchain_test.cpp
+++ b/source/source_md/test/nhchain_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
 #include "source_md/nhchain.h"
-#include "setcell.h"
+#include "md_test_fixture.h"
 #define doublethreshold 1e-12
 /************************************************
  *  unit test of functions in nhchain.h
@@ -33,34 +32,20 @@
  *   - Nose_Hoover::print_md
  *     - output MD information such as energy, temperature, and pressure
  */
-class NHC_test : public testing::Test
+class NHC_test : public MdTestBase
 {
   protected:
-    MD_base* mdrun;
-    UnitCell ucell;
-    Parameter param_in;
-    ModuleESolver::ESolver* p_esolver;
+    std::unique_ptr<MD_base> mdrun;
 
-    void SetUp()
+    void SetUp() override
     {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-
-        p_esolver = new ModuleESolver::ESolver_LJ();
-        p_esolver->before_all_runners(ucell, param_in.inp);
-
+        MdTestBase::SetUp();
         param_in.input.mdp.md_type = "npt";
         param_in.input.mdp.md_pmode = "tri";
         param_in.input.mdp.md_pfirst = 1;
         param_in.input.mdp.md_plast = 1;
-        mdrun = new Nose_Hoover(param_in, ucell);
-        mdrun->setup(p_esolver, PARAM.sys.global_readin_dir);
-    }
-
-    void TearDown()
-    {
-        delete mdrun;
-        delete p_esolver;
+        mdrun.reset(new Nose_Hoover(param_in, ucell));
+        mdrun->setup(p_esolver.get(), PARAM.sys.global_readin_dir);
     }
 };
 
@@ -179,7 +164,7 @@ TEST_F(NHC_test, restart)
     mdrun->restart(PARAM.sys.global_readin_dir);
     remove("Restart_md.txt");
 
-    Nose_Hoover* nhc = dynamic_cast<Nose_Hoover*>(mdrun);
+    Nose_Hoover* nhc = dynamic_cast<Nose_Hoover*>(mdrun.get());
     EXPECT_EQ(mdrun->step_rst_, 3);
     EXPECT_EQ(mdrun->mdp.md_tchain, 4);
     EXPECT_EQ(mdrun->mdp.md_pchain, 4);
diff --git a/source/source_md/test/verlet_test.cpp b/source/source_md/test/verlet_test.cpp
index 86dc66a85e1..d859f0b9ff8 100644
--- a/source/source_md/test/verlet_test.cpp
+++ b/source/source_md/test/verlet_test.cpp
@@ -5,9 +5,8 @@
 #undef private
 #define private public
 #define protected public
-#include "source_esolver/esolver_lj.h"
 #include "source_md/verlet.h"
-#include "setcell.h"
+#include "md_test_fixture.h"
 #define doublethreshold 1e-12
 
 
@@ -36,30 +35,8 @@
  *     - output MD information such as energy, temperature, and pressure
  */
 
-class Verlet_test : public testing::Test
+class Verlet_test : public MdIntegratorFixture<Verlet>
 {
-  protected:
-    MD_base* mdrun;
-    UnitCell ucell;
-    Parameter param_in;
-    ModuleESolver::ESolver* p_esolver;
-
-    void SetUp()
-    {
-        Setcell::setupcell(ucell);
-        Setcell::parameters(param_in.input);
-
-        p_esolver = new ModuleESolver::ESolver_LJ();
-        p_esolver->before_all_runners(ucell, param_in.inp);
-
-        mdrun = new Verlet(param_in, ucell);
-        mdrun->setup(p_esolver, PARAM.sys.global_readin_dir);
-    }
-
-    void TearDown()
-    {
-        delete mdrun;
-    }
 };
 
 TEST_F(Verlet_test, setup)
diff --git a/source/source_md/verlet.cpp b/source/source_md/verlet.cpp
index 3fd79ff1b68..825c46133f1 100644
--- a/source/source_md/verlet.cpp
+++ b/source/source_md/verlet.cpp
@@ -75,17 +75,18 @@ void Verlet::apply_thermostat(void)
     {
         if (my_rank == 0)
         {
-            double deviation = 0.0;
+#pragma omp parallel for default(none) shared(ucell, mdp, md_tlast, allmass, ionmbl, vel) \
+            schedule(static) if (ucell.nat >= 256)
             for (int i = 0; i < ucell.nat; ++i)
             {
-                if (static_cast<double>(std::rand()) / RAND_MAX <= 1.0 / mdp.md_nraise)
+                if (MD_func::uniform_rand_thread_safe() <= 1.0 / mdp.md_nraise)
                 {
-                    deviation = sqrt(md_tlast / allmass[i]);
+                    double deviation = sqrt(md_tlast / allmass[i]);
                     for (int k = 0; k < 3; ++k)
                     {
                         if (ionmbl[i][k])
                         {
-                            vel[i][k] = deviation * MD_func::gaussrand();
+                            vel[i][k] = deviation * MD_func::gaussrand_thread_safe();
                         }
                     }
                 }
@@ -124,7 +125,9 @@ void Verlet::thermalize(const int& nraise, const double& current_temp, const dou
         fac = sqrt(target_temp / current_temp);
     }
 
-    for (int i = 0; i < ucell.nat; ++i)
+    const int nat = ucell.nat;
+#pragma omp parallel for schedule(static) if (nat >= 256)
+    for (int i = 0; i < nat; ++i)
     {
         vel[i] *= fac;
     }
@@ -158,14 +161,16 @@ void Verlet::apply_csvr(const double& current_temp, const double& target_temp)
         factor = exp(-1.0 / taut);
     }
 
-    // Generate Gaussian random numbers using MD_func
-    double rr = MD_func::gaussrand();
+    // Generate Gaussian random numbers using thread-safe RNG
+    double rr = MD_func::gaussrand_thread_safe();
 
     // Calculate sum of squared Gaussian random numbers (ndeg - 1)
     double sumnoises = 0.0;
+#pragma omp parallel for default(none) shared(ndeg) reduction(+:sumnoises) \
+    schedule(static) if (ndeg >= 256)
     for (int i = 0; i < ndeg - 1; ++i)
     {
-        double r = MD_func::gaussrand();
+        double r = MD_func::gaussrand_thread_safe();
         sumnoises += r * r;
     }
 
@@ -179,7 +184,9 @@ void Verlet::apply_csvr(const double& current_temp, const double& target_temp)
     // Calculate scaling factor
     double scale = sqrt(resample);
 
-    // Apply velocity scaling
+    // Apply velocity scaling (embarrassingly parallel)
+#pragma omp parallel for default(none) shared(ucell, vel, scale) \
+    schedule(static) if (ucell.nat >= 256)
     for (int i = 0; i < ucell.nat; ++i)
     {
         vel[i] *= scale;