diff --git a/Test/openmp_nep_basic_benchmark.cpp b/Test/openmp_nep_basic_benchmark.cpp new file mode 100644 index 00000000000..16c2ff4e72b --- /dev/null +++ b/Test/openmp_nep_basic_benchmark.cpp @@ -0,0 +1,611 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +namespace +{ + +struct Vec3 +{ + double x; + double y; + double z; +}; + +struct Matrix3 +{ + double m[3][3]; +}; + +struct BenchmarkData +{ + int natom = 2000000; + int ntype = 4; + double md_dt = 0.5; + double lat0 = 10.0; + double fact_f = 0.0433641153087705; + Matrix3 gt{}; + + std::vector pos; + std::vector vel; + std::vector base_vel; + std::vector force; + std::vector mass; + std::vector> ionmbl; + std::vector force_matrix; + + std::vector atom_type_index; + std::vector atom_local_index; + std::vector type_offset; + std::vector tau_x; + std::vector tau_y; + std::vector tau_z; + std::vector nep_coord; + std::vector nep_e; + std::vector nep_f; + std::vector nep_v; + std::vector nep_force; +}; + +struct Measurement +{ + std::string kernel; + int threads = 1; + double serial_ms = 0.0; + double omp_ms = 0.0; + double max_abs_diff = 0.0; + double checksum = 0.0; +}; + +using Clock = std::chrono::steady_clock; +volatile double global_sink = 0.0; + +double elapsed_ms(const Clock::time_point& start, const Clock::time_point& end) +{ + return std::chrono::duration(end - start).count(); +} + +int parse_int_arg(int argc, char** argv, const std::string& name, int fallback) +{ + for (int i = 1; i + 1 < argc; ++i) + { + if (argv[i] == name) + { + return std::atoi(argv[i + 1]); + } + } + return fallback; +} + +double value_for_index(const int i, const double scale) +{ + return scale * (1.0 + static_cast((i * 17) % 97) / 97.0); +} + +BenchmarkData make_data(const int natom) +{ + BenchmarkData data; + data.natom = natom; + data.gt = {{{1.02, 0.01, 0.03}, {0.02, 0.98, 0.04}, {0.01, 0.02, 1.01}}}; + + data.pos.resize(natom); + data.vel.resize(natom); + data.base_vel.resize(natom); + data.force.resize(natom); + data.mass.resize(natom); + data.ionmbl.resize(natom); + data.force_matrix.resize(static_cast(natom) * 3); + data.atom_type_index.resize(natom); + data.atom_local_index.resize(natom); + data.nep_coord.resize(static_cast(natom) * 3); + data.nep_e.resize(natom); + data.nep_f.resize(static_cast(natom) * 3); + data.nep_v.resize(static_cast(natom) * 9); + data.nep_force.resize(static_cast(natom) * 3); + + data.type_offset.assign(data.ntype + 1, 0); + for (int it = 0; it < data.ntype; ++it) + { + data.type_offset[it + 1] = data.type_offset[it] + natom / data.ntype + (it < natom % data.ntype ? 1 : 0); + } + data.tau_x.resize(natom); + data.tau_y.resize(natom); + data.tau_z.resize(natom); + + for (int i = 0; i < natom; ++i) + { + data.base_vel[i] = {value_for_index(i, 0.001), value_for_index(i + 11, 0.0012), value_for_index(i + 23, 0.0009)}; + data.vel[i] = data.base_vel[i]; + data.force[i] = {value_for_index(i + 3, 0.004), value_for_index(i + 5, 0.003), value_for_index(i + 7, 0.005)}; + data.mass[i] = 10.0 + static_cast(i % 13); + data.ionmbl[i] = {1, (i % 11) == 0 ? 0 : 1, (i % 17) == 0 ? 0 : 1}; + data.force_matrix[3 * static_cast(i)] = data.force[i].x; + data.force_matrix[3 * static_cast(i) + 1] = data.force[i].y; + data.force_matrix[3 * static_cast(i) + 2] = data.force[i].z; + data.nep_e[i] = value_for_index(i + 13, 0.02); + data.nep_f[i] = value_for_index(i + 29, 0.03); + data.nep_f[static_cast(i) + natom] = value_for_index(i + 31, 0.02); + data.nep_f[static_cast(i) + 2 * static_cast(natom)] = value_for_index(i + 37, 0.04); + } + + for (int it = 0; it < data.ntype; ++it) + { + for (int local = 0; local < data.type_offset[it + 1] - data.type_offset[it]; ++local) + { + const int global = data.type_offset[it] + local; + data.atom_type_index[global] = it; + data.atom_local_index[global] = local; + data.tau_x[global] = value_for_index(global + 41, 0.2); + data.tau_y[global] = value_for_index(global + 43, 0.3); + data.tau_z[global] = value_for_index(global + 47, 0.4); + } + } + + for (int j = 0; j < 9; ++j) + { + for (int i = 0; i < natom; ++i) + { + data.nep_v[static_cast(j) * natom + i] = value_for_index(i + j * 19, 0.0003 * (j + 1)); + } + } + + return data; +} + +template +double time_repeated(const int repeat, F&& f) +{ + const auto start = Clock::now(); + for (int r = 0; r < repeat; ++r) + { + f(); + } + return elapsed_ms(start, Clock::now()) / repeat; +} + +void update_pos_serial(const BenchmarkData& data, std::vector& pos) +{ + for (int i = 0; i < data.natom; ++i) + { + double px = data.ionmbl[i][0] ? data.vel[i].x * data.md_dt / data.lat0 : 0.0; + double py = data.ionmbl[i][1] ? data.vel[i].y * data.md_dt / data.lat0 : 0.0; + double pz = data.ionmbl[i][2] ? data.vel[i].z * data.md_dt / data.lat0 : 0.0; + pos[i].x = px * data.gt.m[0][0] + py * data.gt.m[1][0] + pz * data.gt.m[2][0]; + pos[i].y = px * data.gt.m[0][1] + py * data.gt.m[1][1] + pz * data.gt.m[2][1]; + pos[i].z = px * data.gt.m[0][2] + py * data.gt.m[1][2] + pz * data.gt.m[2][2]; + } +} + +void update_pos_omp(const BenchmarkData& data, std::vector& pos) +{ +#pragma omp parallel for schedule(static) if (data.natom >= 256) + for (int i = 0; i < data.natom; ++i) + { + double px = data.ionmbl[i][0] ? data.vel[i].x * data.md_dt / data.lat0 : 0.0; + double py = data.ionmbl[i][1] ? data.vel[i].y * data.md_dt / data.lat0 : 0.0; + double pz = data.ionmbl[i][2] ? data.vel[i].z * data.md_dt / data.lat0 : 0.0; + pos[i].x = px * data.gt.m[0][0] + py * data.gt.m[1][0] + pz * data.gt.m[2][0]; + pos[i].y = px * data.gt.m[0][1] + py * data.gt.m[1][1] + pz * data.gt.m[2][1]; + pos[i].z = px * data.gt.m[0][2] + py * data.gt.m[1][2] + pz * data.gt.m[2][2]; + } +} + +void update_vel_serial(const BenchmarkData& data, std::vector& vel) +{ + for (int i = 0; i < data.natom; ++i) + { + if (data.ionmbl[i][0]) + { + vel[i].x += 0.5 * data.force[i].x * data.md_dt / data.mass[i]; + } + if (data.ionmbl[i][1]) + { + vel[i].y += 0.5 * data.force[i].y * data.md_dt / data.mass[i]; + } + if (data.ionmbl[i][2]) + { + vel[i].z += 0.5 * data.force[i].z * data.md_dt / data.mass[i]; + } + } +} + +void update_vel_omp(const BenchmarkData& data, std::vector& vel) +{ +#pragma omp parallel for schedule(static) if (data.natom >= 256) + for (int i = 0; i < data.natom; ++i) + { + if (data.ionmbl[i][0]) + { + vel[i].x += 0.5 * data.force[i].x * data.md_dt / data.mass[i]; + } + if (data.ionmbl[i][1]) + { + vel[i].y += 0.5 * data.force[i].y * data.md_dt / data.mass[i]; + } + if (data.ionmbl[i][2]) + { + vel[i].z += 0.5 * data.force[i].z * data.md_dt / data.mass[i]; + } + } +} + +double kinetic_serial(const BenchmarkData& data) +{ + double ke = 0.0; + for (int i = 0; i < data.natom; ++i) + { + ke += 0.5 * data.mass[i] + * (data.vel[i].x * data.vel[i].x + data.vel[i].y * data.vel[i].y + data.vel[i].z * data.vel[i].z); + } + return ke; +} + +double kinetic_omp(const BenchmarkData& data) +{ + double ke = 0.0; +#pragma omp parallel for reduction(+:ke) schedule(static) if (data.natom >= 256) + for (int i = 0; i < data.natom; ++i) + { + ke += 0.5 * data.mass[i] + * (data.vel[i].x * data.vel[i].x + data.vel[i].y * data.vel[i].y + data.vel[i].z * data.vel[i].z); + } + return ke; +} + +std::array temp_vector_serial(const BenchmarkData& data) +{ + std::array t{}; + for (int i = 0; i < data.natom; ++i) + { + const double m = data.mass[i]; + const double vx = data.vel[i].x; + const double vy = data.vel[i].y; + const double vz = data.vel[i].z; + t[0] += m * vx * vx; + t[1] += m * vx * vy; + t[2] += m * vx * vz; + t[3] += m * vy * vx; + t[4] += m * vy * vy; + t[5] += m * vy * vz; + t[6] += m * vz * vx; + t[7] += m * vz * vy; + t[8] += m * vz * vz; + } + return t; +} + +std::array temp_vector_omp(const BenchmarkData& data) +{ + double t00 = 0.0; + double t01 = 0.0; + double t02 = 0.0; + double t10 = 0.0; + double t11 = 0.0; + double t12 = 0.0; + double t20 = 0.0; + double t21 = 0.0; + double t22 = 0.0; + +#pragma omp parallel for reduction(+:t00, t01, t02, t10, t11, t12, t20, t21, t22) schedule(static) if (data.natom >= 256) + for (int i = 0; i < data.natom; ++i) + { + const double m = data.mass[i]; + const double vx = data.vel[i].x; + const double vy = data.vel[i].y; + const double vz = data.vel[i].z; + t00 += m * vx * vx; + t01 += m * vx * vy; + t02 += m * vx * vz; + t10 += m * vy * vx; + t11 += m * vy * vy; + t12 += m * vy * vz; + t20 += m * vz * vx; + t21 += m * vz * vy; + t22 += m * vz * vz; + } + return {t00, t01, t02, t10, t11, t12, t20, t21, t22}; +} + +void force_copy_serial(const BenchmarkData& data, std::vector& out) +{ + for (int i = 0; i < data.natom; ++i) + { + out[i].x = data.force_matrix[3 * static_cast(i)]; + out[i].y = data.force_matrix[3 * static_cast(i) + 1]; + out[i].z = data.force_matrix[3 * static_cast(i) + 2]; + } +} + +void force_copy_omp(const BenchmarkData& data, std::vector& out) +{ +#pragma omp parallel for schedule(static) if (data.natom >= 256) + for (int i = 0; i < data.natom; ++i) + { + out[i].x = data.force_matrix[3 * static_cast(i)]; + out[i].y = data.force_matrix[3 * static_cast(i) + 1]; + out[i].z = data.force_matrix[3 * static_cast(i) + 2]; + } +} + +void nep_coord_serial(const BenchmarkData& data, std::vector& coord) +{ + const int nat = data.natom; + for (int iat = 0; iat < nat; ++iat) + { + const int it = data.atom_type_index[iat]; + const int ia = data.atom_local_index[iat]; + const int index = data.type_offset[it] + ia; + coord[iat] = data.tau_x[index] * data.lat0; + coord[static_cast(iat) + nat] = data.tau_y[index] * data.lat0; + coord[static_cast(iat) + 2 * static_cast(nat)] = data.tau_z[index] * data.lat0; + } +} + +void nep_coord_omp(const BenchmarkData& data, std::vector& coord) +{ + const int nat = data.natom; +#pragma omp parallel for schedule(static) if (nat >= 256) + for (int iat = 0; iat < nat; ++iat) + { + const int it = data.atom_type_index[iat]; + const int ia = data.atom_local_index[iat]; + const int index = data.type_offset[it] + ia; + coord[iat] = data.tau_x[index] * data.lat0; + coord[static_cast(iat) + nat] = data.tau_y[index] * data.lat0; + coord[static_cast(iat) + 2 * static_cast(nat)] = data.tau_z[index] * data.lat0; + } +} + +double nep_energy_serial(const BenchmarkData& data) +{ + return std::accumulate(data.nep_e.begin(), data.nep_e.end(), 0.0); +} + +double nep_energy_omp(const BenchmarkData& data) +{ + double energy = 0.0; +#pragma omp parallel for reduction(+:energy) schedule(static) if (data.natom >= 256) + for (int i = 0; i < data.natom; ++i) + { + energy += data.nep_e[i]; + } + return energy; +} + +void nep_force_serial(const BenchmarkData& data, std::vector& out) +{ + const int nat = data.natom; + for (int i = 0; i < nat; ++i) + { + out[3 * static_cast(i)] = data.nep_f[i] * data.fact_f; + out[3 * static_cast(i) + 1] = data.nep_f[static_cast(i) + nat] * data.fact_f; + out[3 * static_cast(i) + 2] = data.nep_f[static_cast(i) + 2 * static_cast(nat)] * data.fact_f; + } +} + +void nep_force_omp(const BenchmarkData& data, std::vector& out) +{ + const int nat = data.natom; +#pragma omp parallel for schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) + { + out[3 * static_cast(i)] = data.nep_f[i] * data.fact_f; + out[3 * static_cast(i) + 1] = data.nep_f[static_cast(i) + nat] * data.fact_f; + out[3 * static_cast(i) + 2] = data.nep_f[static_cast(i) + 2 * static_cast(nat)] * data.fact_f; + } +} + +std::array nep_virial_serial(const BenchmarkData& data) +{ + std::array sum{}; + const int nat = data.natom; + for (int j = 0; j < 9; ++j) + { + for (int i = 0; i < nat; ++i) + { + sum[j] += data.nep_v[static_cast(j) * nat + i]; + } + } + return sum; +} + +std::array nep_virial_omp(const BenchmarkData& data) +{ + const int nat = data.natom; + double v0 = 0.0; + double v1 = 0.0; + double v2 = 0.0; + double v3 = 0.0; + double v4 = 0.0; + double v5 = 0.0; + double v6 = 0.0; + double v7 = 0.0; + double v8 = 0.0; +#pragma omp parallel for reduction(+:v0, v1, v2, v3, v4, v5, v6, v7, v8) schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) + { + v0 += data.nep_v[i]; + v1 += data.nep_v[static_cast(nat) + i]; + v2 += data.nep_v[2 * static_cast(nat) + i]; + v3 += data.nep_v[3 * static_cast(nat) + i]; + v4 += data.nep_v[4 * static_cast(nat) + i]; + v5 += data.nep_v[5 * static_cast(nat) + i]; + v6 += data.nep_v[6 * static_cast(nat) + i]; + v7 += data.nep_v[7 * static_cast(nat) + i]; + v8 += data.nep_v[8 * static_cast(nat) + i]; + } + return {v0, v1, v2, v3, v4, v5, v6, v7, v8}; +} + +double max_abs_diff(const std::vector& a, const std::vector& b) +{ + double diff = 0.0; + for (std::size_t i = 0; i < a.size(); ++i) + { + diff = std::max(diff, std::abs(a[i].x - b[i].x)); + diff = std::max(diff, std::abs(a[i].y - b[i].y)); + diff = std::max(diff, std::abs(a[i].z - b[i].z)); + } + return diff; +} + +double max_abs_diff(const std::vector& a, const std::vector& b) +{ + double diff = 0.0; + for (std::size_t i = 0; i < a.size(); ++i) + { + diff = std::max(diff, std::abs(a[i] - b[i])); + } + return diff; +} + +double max_abs_diff(const std::array& a, const std::array& b) +{ + double diff = 0.0; + for (std::size_t i = 0; i < a.size(); ++i) + { + diff = std::max(diff, std::abs(a[i] - b[i])); + } + return diff; +} + +double checksum(const std::vector& a) +{ + double sum = 0.0; + for (std::size_t i = 0; i < a.size(); i += 4096) + { + sum += a[i].x + 0.5 * a[i].y + 0.25 * a[i].z; + } + return sum; +} + +double checksum(const std::vector& a) +{ + double sum = 0.0; + for (std::size_t i = 0; i < a.size(); i += 4096) + { + sum += a[i]; + } + return sum; +} + +double checksum(const std::array& a) +{ + return std::accumulate(a.begin(), a.end(), 0.0); +} + +void print_measurement(const Measurement& m) +{ + const double speedup = m.omp_ms > 0.0 ? m.serial_ms / m.omp_ms : 0.0; + const double efficiency = m.threads > 0 ? speedup / m.threads : 0.0; + std::cout << m.kernel << ',' << m.threads << ',' + << std::fixed << std::setprecision(6) + << m.serial_ms << ',' << m.omp_ms << ',' << speedup << ',' << efficiency << ',' + << std::scientific << m.max_abs_diff << ',' << m.checksum << '\n'; +} + +} // namespace + +int main(int argc, char** argv) +{ + const int threads = parse_int_arg(argc, argv, "--threads", 1); + const int natom = parse_int_arg(argc, argv, "--natoms", 2000000); + const int repeat = parse_int_arg(argc, argv, "--repeat", 5); + +#ifdef _OPENMP + omp_set_dynamic(0); + omp_set_num_threads(threads); +#endif + + BenchmarkData data = make_data(natom); + std::vector serial_vec(natom); + std::vector omp_vec(natom); + std::vector serial_doubles(static_cast(natom) * 3); + std::vector omp_doubles(static_cast(natom) * 3); + + std::cout << "kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum\n"; + + { + update_pos_serial(data, serial_vec); + update_pos_omp(data, omp_vec); + const double serial_ms = time_repeated(repeat, [&] { update_pos_serial(data, serial_vec); }); + const double omp_ms = time_repeated(repeat, [&] { update_pos_omp(data, omp_vec); }); + print_measurement({"md_update_pos", threads, serial_ms, omp_ms, max_abs_diff(serial_vec, omp_vec), checksum(omp_vec)}); + } + { + std::vector serial_vel = data.base_vel; + std::vector omp_vel = data.base_vel; + update_vel_serial(data, serial_vel); + update_vel_omp(data, omp_vel); + const double serial_ms = time_repeated(repeat, [&] { update_vel_serial(data, serial_vel); }); + const double omp_ms = time_repeated(repeat, [&] { update_vel_omp(data, omp_vel); }); + print_measurement({"md_update_vel", threads, serial_ms, omp_ms, max_abs_diff(serial_vel, omp_vel), checksum(omp_vel)}); + } + { + const double serial_value = kinetic_serial(data); + const double omp_value = kinetic_omp(data); + const double serial_ms = time_repeated(repeat, [&] { global_sink += kinetic_serial(data); }); + const double omp_ms = time_repeated(repeat, [&] { global_sink += kinetic_omp(data); }); + print_measurement({"md_kinetic_energy", threads, serial_ms, omp_ms, std::abs(serial_value - omp_value), omp_value}); + } + { + const auto serial_value = temp_vector_serial(data); + const auto omp_value = temp_vector_omp(data); + const double serial_ms = time_repeated(repeat, [&] { global_sink += checksum(temp_vector_serial(data)); }); + const double omp_ms = time_repeated(repeat, [&] { global_sink += checksum(temp_vector_omp(data)); }); + print_measurement({"md_temp_vector", threads, serial_ms, omp_ms, max_abs_diff(serial_value, omp_value), checksum(omp_value)}); + } + { + force_copy_serial(data, serial_vec); + force_copy_omp(data, omp_vec); + const double serial_ms = time_repeated(repeat, [&] { force_copy_serial(data, serial_vec); }); + const double omp_ms = time_repeated(repeat, [&] { force_copy_omp(data, omp_vec); }); + print_measurement({"md_force_copy", threads, serial_ms, omp_ms, max_abs_diff(serial_vec, omp_vec), checksum(omp_vec)}); + } + { + nep_coord_serial(data, serial_doubles); + nep_coord_omp(data, omp_doubles); + const double serial_ms = time_repeated(repeat, [&] { nep_coord_serial(data, serial_doubles); }); + const double omp_ms = time_repeated(repeat, [&] { nep_coord_omp(data, omp_doubles); }); + print_measurement({"nep_coord_fill", threads, serial_ms, omp_ms, max_abs_diff(serial_doubles, omp_doubles), checksum(omp_doubles)}); + } + { + const double serial_value = nep_energy_serial(data); + const double omp_value = nep_energy_omp(data); + const double serial_ms = time_repeated(repeat, [&] { global_sink += nep_energy_serial(data); }); + const double omp_ms = time_repeated(repeat, [&] { global_sink += nep_energy_omp(data); }); + print_measurement({"nep_energy_sum", threads, serial_ms, omp_ms, std::abs(serial_value - omp_value), omp_value}); + } + { + nep_force_serial(data, serial_doubles); + nep_force_omp(data, omp_doubles); + const double serial_ms = time_repeated(repeat, [&] { nep_force_serial(data, serial_doubles); }); + const double omp_ms = time_repeated(repeat, [&] { nep_force_omp(data, omp_doubles); }); + print_measurement({"nep_force_fill", threads, serial_ms, omp_ms, max_abs_diff(serial_doubles, omp_doubles), checksum(omp_doubles)}); + } + { + const auto serial_value = nep_virial_serial(data); + const auto omp_value = nep_virial_omp(data); + const double serial_ms = time_repeated(repeat, [&] { global_sink += checksum(nep_virial_serial(data)); }); + const double omp_ms = time_repeated(repeat, [&] { global_sink += checksum(nep_virial_omp(data)); }); + print_measurement({"nep_virial_sum", threads, serial_ms, omp_ms, max_abs_diff(serial_value, omp_value), checksum(omp_value)}); + } + + if (global_sink == -1.0) + { + std::cerr << "unreachable sink: " << global_sink << '\n'; + } + + return 0; +} diff --git a/Test/results/openmp_nep_basic_benchmark.csv b/Test/results/openmp_nep_basic_benchmark.csv new file mode 100644 index 00000000000..0e3136e7095 --- /dev/null +++ b/Test/results/openmp_nep_basic_benchmark.csv @@ -0,0 +1,37 @@ +kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum +md_update_pos,1,13.007151,13.052355,0.996537,0.996537,0.000000e+00,6.693102e-02 +md_update_vel,1,14.545481,14.496280,1.003394,1.003394,0.000000e+00,1.811520e+00 +md_kinetic_energy,1,6.925990,6.857271,1.010021,1.010021,0.000000e+00,1.205301e+02 +md_temp_vector,1,8.115525,8.224651,0.986732,0.986732,0.000000e+00,7.061666e+02 +md_force_copy,1,9.308984,9.119837,1.020740,1.020740,0.000000e+00,4.932209e+00 +nep_coord_fill,1,10.322396,10.129846,1.019008,1.019008,0.000000e+00,6.562763e+03 +nep_energy_sum,1,3.037125,3.027364,1.003224,1.003224,0.000000e+00,5.979382e+04 +nep_force_fill,1,8.900827,8.746296,1.017668,1.017668,0.000000e+00,2.851195e+00 +nep_virial_sum,1,28.520833,12.451097,2.290628,2.290628,0.000000e+00,4.036082e+04 +md_update_pos,2,13.098582,6.573173,1.992733,0.996367,0.000000e+00,6.693102e-02 +md_update_vel,2,14.820314,7.287271,2.033726,1.016863,0.000000e+00,1.811520e+00 +md_kinetic_energy,2,7.014427,3.325272,2.109430,1.054715,2.036415e-10,1.205301e+02 +md_temp_vector,2,8.208209,4.038653,2.032413,1.016206,1.600000e-10,7.061666e+02 +md_force_copy,2,9.362898,4.795636,1.952379,0.976190,0.000000e+00,4.932209e+00 +nep_coord_fill,2,10.446854,5.126669,2.037747,1.018874,0.000000e+00,6.562763e+03 +nep_energy_sum,2,3.059902,1.525828,2.005405,1.002702,8.811185e-09,5.979382e+04 +nep_force_fill,2,8.925261,4.459652,2.001336,1.000668,0.000000e+00,2.851195e+00 +nep_virial_sum,2,28.700600,6.623542,4.333120,2.166560,9.997166e-09,4.036082e+04 +md_update_pos,4,13.473068,3.496484,3.853319,0.963330,0.000000e+00,6.693102e-02 +md_update_vel,4,14.817903,3.852381,3.846427,0.961607,0.000000e+00,1.811520e+00 +md_kinetic_energy,4,7.070885,1.874085,3.772980,0.943245,2.242899e-10,1.205301e+02 +md_temp_vector,4,8.508072,2.182533,3.898256,0.974564,1.489155e-10,7.061666e+02 +md_force_copy,4,9.537895,2.563152,3.721158,0.930290,0.000000e+00,4.932209e+00 +nep_coord_fill,4,10.574139,2.837437,3.726652,0.931663,0.000000e+00,6.562763e+03 +nep_energy_sum,4,3.115476,0.785601,3.965723,0.991431,2.277375e-08,5.979382e+04 +nep_force_fill,4,9.159472,2.378972,3.850181,0.962545,0.000000e+00,2.851195e+00 +nep_virial_sum,4,28.671869,3.517298,8.151675,2.037919,6.323717e-09,4.036082e+04 +md_update_pos,8,13.427898,1.823981,7.361864,0.920233,0.000000e+00,6.693102e-02 +md_update_vel,8,14.830449,2.062811,7.189436,0.898679,0.000000e+00,1.811520e+00 +md_kinetic_energy,8,7.156275,1.025259,6.979966,0.872496,1.775788e-10,1.205301e+02 +md_temp_vector,8,8.650072,1.172442,7.377825,0.922228,1.680860e-10,7.061666e+02 +md_force_copy,8,9.885551,1.377531,7.176284,0.897036,0.000000e+00,4.932209e+00 +nep_coord_fill,8,11.209954,1.566864,7.154390,0.894299,0.000000e+00,6.562763e+03 +nep_energy_sum,8,3.211079,0.399830,8.031107,1.003888,1.861918e-08,5.979382e+04 +nep_force_fill,8,9.512588,1.360189,6.993579,0.874197,0.000000e+00,2.851195e+00 +nep_virial_sum,8,28.884186,2.028955,14.235989,1.779499,8.529241e-09,4.036082e+04 diff --git a/Test/results/run_1.csv b/Test/results/run_1.csv new file mode 100644 index 00000000000..f988811bcee --- /dev/null +++ b/Test/results/run_1.csv @@ -0,0 +1,10 @@ +kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum +md_update_pos,1,13.007151,13.052355,0.996537,0.996537,0.000000e+00,6.693102e-02 +md_update_vel,1,14.545481,14.496280,1.003394,1.003394,0.000000e+00,1.811520e+00 +md_kinetic_energy,1,6.925990,6.857271,1.010021,1.010021,0.000000e+00,1.205301e+02 +md_temp_vector,1,8.115525,8.224651,0.986732,0.986732,0.000000e+00,7.061666e+02 +md_force_copy,1,9.308984,9.119837,1.020740,1.020740,0.000000e+00,4.932209e+00 +nep_coord_fill,1,10.322396,10.129846,1.019008,1.019008,0.000000e+00,6.562763e+03 +nep_energy_sum,1,3.037125,3.027364,1.003224,1.003224,0.000000e+00,5.979382e+04 +nep_force_fill,1,8.900827,8.746296,1.017668,1.017668,0.000000e+00,2.851195e+00 +nep_virial_sum,1,28.520833,12.451097,2.290628,2.290628,0.000000e+00,4.036082e+04 diff --git a/Test/results/run_2.csv b/Test/results/run_2.csv new file mode 100644 index 00000000000..c56f0793298 --- /dev/null +++ b/Test/results/run_2.csv @@ -0,0 +1,10 @@ +kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum +md_update_pos,2,13.098582,6.573173,1.992733,0.996367,0.000000e+00,6.693102e-02 +md_update_vel,2,14.820314,7.287271,2.033726,1.016863,0.000000e+00,1.811520e+00 +md_kinetic_energy,2,7.014427,3.325272,2.109430,1.054715,2.036415e-10,1.205301e+02 +md_temp_vector,2,8.208209,4.038653,2.032413,1.016206,1.600000e-10,7.061666e+02 +md_force_copy,2,9.362898,4.795636,1.952379,0.976190,0.000000e+00,4.932209e+00 +nep_coord_fill,2,10.446854,5.126669,2.037747,1.018874,0.000000e+00,6.562763e+03 +nep_energy_sum,2,3.059902,1.525828,2.005405,1.002702,8.811185e-09,5.979382e+04 +nep_force_fill,2,8.925261,4.459652,2.001336,1.000668,0.000000e+00,2.851195e+00 +nep_virial_sum,2,28.700600,6.623542,4.333120,2.166560,9.997166e-09,4.036082e+04 diff --git a/Test/results/run_4.csv b/Test/results/run_4.csv new file mode 100644 index 00000000000..2f118db3395 --- /dev/null +++ b/Test/results/run_4.csv @@ -0,0 +1,10 @@ +kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum +md_update_pos,4,13.473068,3.496484,3.853319,0.963330,0.000000e+00,6.693102e-02 +md_update_vel,4,14.817903,3.852381,3.846427,0.961607,0.000000e+00,1.811520e+00 +md_kinetic_energy,4,7.070885,1.874085,3.772980,0.943245,2.242899e-10,1.205301e+02 +md_temp_vector,4,8.508072,2.182533,3.898256,0.974564,1.489155e-10,7.061666e+02 +md_force_copy,4,9.537895,2.563152,3.721158,0.930290,0.000000e+00,4.932209e+00 +nep_coord_fill,4,10.574139,2.837437,3.726652,0.931663,0.000000e+00,6.562763e+03 +nep_energy_sum,4,3.115476,0.785601,3.965723,0.991431,2.277375e-08,5.979382e+04 +nep_force_fill,4,9.159472,2.378972,3.850181,0.962545,0.000000e+00,2.851195e+00 +nep_virial_sum,4,28.671869,3.517298,8.151675,2.037919,6.323717e-09,4.036082e+04 diff --git a/Test/results/run_8.csv b/Test/results/run_8.csv new file mode 100644 index 00000000000..be310c03f17 --- /dev/null +++ b/Test/results/run_8.csv @@ -0,0 +1,10 @@ +kernel,threads,serial_ms,omp_ms,speedup,efficiency,max_abs_diff,checksum +md_update_pos,8,13.427898,1.823981,7.361864,0.920233,0.000000e+00,6.693102e-02 +md_update_vel,8,14.830449,2.062811,7.189436,0.898679,0.000000e+00,1.811520e+00 +md_kinetic_energy,8,7.156275,1.025259,6.979966,0.872496,1.775788e-10,1.205301e+02 +md_temp_vector,8,8.650072,1.172442,7.377825,0.922228,1.680860e-10,7.061666e+02 +md_force_copy,8,9.885551,1.377531,7.176284,0.897036,0.000000e+00,4.932209e+00 +nep_coord_fill,8,11.209954,1.566864,7.154390,0.894299,0.000000e+00,6.562763e+03 +nep_energy_sum,8,3.211079,0.399830,8.031107,1.003888,1.861918e-08,5.979382e+04 +nep_force_fill,8,9.512588,1.360189,6.993579,0.874197,0.000000e+00,2.851195e+00 +nep_virial_sum,8,28.884186,2.028955,14.235989,1.779499,8.529241e-09,4.036082e+04 diff --git a/Test/run_openmp_nep_basic_benchmark.sh b/Test/run_openmp_nep_basic_benchmark.sh new file mode 100755 index 00000000000..78d206fbe16 --- /dev/null +++ b/Test/run_openmp_nep_basic_benchmark.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +BUILD_DIR="${SCRIPT_DIR}/build" +RESULT_DIR="${SCRIPT_DIR}/results" +BIN="${BUILD_DIR}/openmp_nep_basic_benchmark" +CSV="${RESULT_DIR}/openmp_nep_basic_benchmark.csv" +LOG="${RESULT_DIR}/openmp_nep_basic_benchmark.log" + +NATOMS="${NATOMS:-2000000}" +REPEAT="${REPEAT:-5}" +CXX="${CXX:-g++}" + +mkdir -p "${BUILD_DIR}" "${RESULT_DIR}" + +{ + echo "Compiler: $(${CXX} --version | head -n 1)" + echo "NATOMS=${NATOMS}" + echo "REPEAT=${REPEAT}" + echo "Build: ${CXX} -O3 -std=c++17 -fopenmp" +} > "${LOG}" + +"${CXX}" -O3 -std=c++17 -fopenmp "${SCRIPT_DIR}/openmp_nep_basic_benchmark.cpp" -o "${BIN}" 2>&1 | tee -a "${LOG}" + +: > "${CSV}" +for threads in 1 2 4 8; do + echo "Running with OMP_NUM_THREADS=${threads}" | tee -a "${LOG}" + export OMP_NUM_THREADS="${threads}" + export OMP_PROC_BIND="${OMP_PROC_BIND:-close}" + export OMP_PLACES="${OMP_PLACES:-cores}" + tmp_csv="${RESULT_DIR}/run_${threads}.csv" + "${BIN}" --threads "${threads}" --natoms "${NATOMS}" --repeat "${REPEAT}" > "${tmp_csv}" + if [[ "${threads}" == "1" ]]; then + cat "${tmp_csv}" >> "${CSV}" + else + tail -n +2 "${tmp_csv}" >> "${CSV}" + fi +done + +echo "CSV: ${CSV}" | tee -a "${LOG}" diff --git a/source/source_esolver/esolver_dp.cpp b/source/source_esolver/esolver_dp.cpp index 879193e668b..b06b4cf2fa1 100644 --- a/source/source_esolver/esolver_dp.cpp +++ b/source/source_esolver/esolver_dp.cpp @@ -36,6 +36,10 @@ void ESolver_DP::before_all_runners(UnitCell& ucell, const Input_para& inp) dp_potential = 0; dp_force.create(ucell.nat, 3); dp_virial.create(3, 3); + dp_cell.resize(9); + dp_coord.resize(3 * ucell.nat); + dp_model_force.clear(); + dp_model_virial.clear(); ModuleIO::CifParser::write(PARAM.globalv.global_out_dir + "STRU.cif", ucell, @@ -44,6 +48,20 @@ void ESolver_DP::before_all_runners(UnitCell& ucell, const Input_para& inp) atype.resize(ucell.nat); + // Build flat atom index for OpenMP coordinate fill in runner() + atom_type_index.resize(ucell.nat); + atom_local_index.resize(ucell.nat); + int iat = 0; + for (int it = 0; it < ucell.ntype; ++it) + { + for (int ia = 0; ia < ucell.atoms[it].na; ++ia) + { + atom_type_index[iat] = it; + atom_local_index[iat] = ia; + iat++; + } + } + rescaling = inp.mdp.dp_rescaling; fparam = inp.mdp.dp_fparam; aparam = inp.mdp.dp_aparam; @@ -59,38 +77,36 @@ void ESolver_DP::runner(UnitCell& ucell, const int istep) ModuleBase::TITLE("ESolver_DP", "runner"); ModuleBase::timer::start("ESolver_DP", "runner"); - std::vector cell(9, 0.0); - cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom; - cell[1] = ucell.latvec.e12 * ucell.lat0_angstrom; - cell[2] = ucell.latvec.e13 * ucell.lat0_angstrom; - cell[3] = ucell.latvec.e21 * ucell.lat0_angstrom; - cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom; - cell[5] = ucell.latvec.e23 * ucell.lat0_angstrom; - cell[6] = ucell.latvec.e31 * ucell.lat0_angstrom; - cell[7] = ucell.latvec.e32 * ucell.lat0_angstrom; - cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom; - - std::vector coord(3 * ucell.nat, 0.0); - int iat = 0; - for (int it = 0; it < ucell.ntype; ++it) + dp_cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom; + dp_cell[1] = ucell.latvec.e12 * ucell.lat0_angstrom; + dp_cell[2] = ucell.latvec.e13 * ucell.lat0_angstrom; + dp_cell[3] = ucell.latvec.e21 * ucell.lat0_angstrom; + dp_cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom; + dp_cell[5] = ucell.latvec.e23 * ucell.lat0_angstrom; + dp_cell[6] = ucell.latvec.e31 * ucell.lat0_angstrom; + dp_cell[7] = ucell.latvec.e32 * ucell.lat0_angstrom; + dp_cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom; + + dp_coord.resize(3 * ucell.nat); + const int nat = ucell.nat; +#pragma omp parallel for schedule(static) if (nat >= 256) + for (int iat = 0; iat < nat; ++iat) { - for (int ia = 0; ia < ucell.atoms[it].na; ++ia) - { - coord[3 * iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom; - coord[3 * iat + 1] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom; - coord[3 * iat + 2] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom; - iat++; - } + const int it = atom_type_index[iat]; + const int ia = atom_local_index[iat]; + dp_coord[3 * iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom; + dp_coord[3 * iat + 1] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom; + dp_coord[3 * iat + 2] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom; } - assert(ucell.nat == iat); #ifdef __DPMD - std::vector f, v; dp_potential = 0; dp_force.zero_out(); dp_virial.zero_out(); + dp_model_force.clear(); + dp_model_virial.clear(); - dp.compute(dp_potential, f, v, coord, atype, cell, fparam, aparam); + dp.compute(dp_potential, dp_model_force, dp_model_virial, dp_coord, atype, dp_cell, fparam, aparam); // rescale the energy, force, and stress const double fact_e = rescaling / ModuleBase::Ry_to_eV; @@ -101,18 +117,20 @@ void ESolver_DP::runner(UnitCell& ucell, const int istep) GlobalV::ofs_running << " #TOTAL ENERGY# " << std::setprecision(11) << dp_potential * ModuleBase::Ry_to_eV << " eV" << std::endl; - for (int i = 0; i < ucell.nat; ++i) + const int nat_f = ucell.nat; +#pragma omp parallel for schedule(static) if (nat_f >= 256) + for (int i = 0; i < nat_f; ++i) { - dp_force(i, 0) = f[3 * i] * fact_f; - dp_force(i, 1) = f[3 * i + 1] * fact_f; - dp_force(i, 2) = f[3 * i + 2] * fact_f; + dp_force(i, 0) = dp_model_force[3 * i] * fact_f; + dp_force(i, 1) = dp_model_force[3 * i + 1] * fact_f; + dp_force(i, 2) = dp_model_force[3 * i + 2] * fact_f; } for (int i = 0; i < 3; ++i) { for (int j = 0; j < 3; ++j) { - dp_virial(i, j) = v[3 * i + j] * fact_v; + dp_virial(i, j) = dp_model_virial[3 * i + j] * fact_v; } } #else diff --git a/source/source_esolver/esolver_dp.h b/source/source_esolver/esolver_dp.h index 405bae44461..72e4b5ff550 100644 --- a/source/source_esolver/esolver_dp.h +++ b/source/source_esolver/esolver_dp.h @@ -109,12 +109,18 @@ class ESolver_DP : public ESolver std::string dp_file; ///< directory of DP model file std::vector atype = {}; ///< atom type corresponding to DP model + std::vector atom_type_index; ///< type index (it) for each global atom iat + std::vector atom_local_index; ///< local index (ia) within type for each global atom iat std::vector fparam = {}; ///< frame parameter for dp potential: dim_fparam std::vector aparam = {}; ///< atomic parameter for dp potential: natoms x dim_aparam double rescaling = 1.0; ///< rescaling factor for DP model double dp_potential = 0.0; ///< computed potential energy ModuleBase::matrix dp_force; ///< computed atomic forces ModuleBase::matrix dp_virial; ///< computed lattice virials + std::vector dp_cell; ///< DP cell buffer in Angstrom + std::vector dp_coord; ///< DP coordinate buffer in Angstrom + std::vector dp_model_force; ///< raw force buffer returned by DP + std::vector dp_model_virial; ///< raw virial buffer returned by DP }; } // namespace ModuleESolver diff --git a/source/source_esolver/esolver_lj.cpp b/source/source_esolver/esolver_lj.cpp index 3ddba86adc2..17ecb760d37 100644 --- a/source/source_esolver/esolver_lj.cpp +++ b/source/source_esolver/esolver_lj.cpp @@ -11,25 +11,27 @@ namespace ModuleESolver { - UnitCellLite ESolver_LJ::change_from_ucell_to_ucell_lite(const UnitCell& ucell) + UnitCellPlus ESolver_LJ::change_from_ucell_to_ucell_plus(const UnitCell& ucell) { - UnitCellLite ucell_lite; - - // Set lattice parameters - ucell_lite.set_lattice(ucell.lat0, ucell.omega, ucell.latvec); - - // Build atom information - std::vector na; - std::vector> tau; - for (int i = 0; i < ucell.ntype; i++) { - na.push_back(ucell.atoms[i].na); - for (int j = 0; j < ucell.atoms[i].na; j++) { - tau.push_back(ucell.atoms[i].tau[j]); + UnitCellPlus ucell_plus; + ucell_plus.lat0 = ucell.lat0; + ucell_plus.omega = ucell.omega; + ucell_plus.nat = ucell.nat; + for(int i=0;i tau1, tau2, dtau; - const NeighborList& neighbor_list = neighbor_search.get_neighbor_list(); - const std::vector& all_atoms = neighbor_search.get_all_atoms(); - for (int it = 0; it < ucell.ntype; ++it) + const int nat = ucell.nat; + +#pragma omp parallel if (nat >= 256) { - Atom* atom1 = &ucell.atoms[it]; - for (int ia = 0; ia < atom1->na; ++ia) + double vl[9] = {0}; + double pot_local = 0.0; + +#pragma omp for schedule(dynamic, 32) + for (int iat = 0; iat < nat; ++iat) { - tau1 = atom1->tau[ia]; - for (int ad = 0; ad < neighbor_list.get_numneigh(index); ++ad) + const int it = atom_type_index[iat]; + const int ia = atom_local_index[iat]; + ModuleBase::Vector3 tau1 = ucell.atoms[it].tau[ia]; + + for (int ad = 0; ad < neighbor_search.neighbor_list.numneigh[iat]; ++ad) { - const NeighborAtom& neighbor_atom = all_atoms[neighbor_list.get_firstneigh(index)[ad]]; - tau2.x = neighbor_atom.position_x; - tau2.y = neighbor_atom.position_y; - tau2.z = neighbor_atom.position_z; - int it2 = neighbor_atom.atom_type; - dtau = (tau1 - tau2) * ucell.lat0; - distance = dtau.norm(); + const int neigh_idx = neighbor_search.neighbor_list.firstneigh[iat][ad]; + ModuleBase::Vector3 tau2; + tau2.x = neighbor_search.all_atoms[neigh_idx].position_x; + tau2.y = neighbor_search.all_atoms[neigh_idx].position_y; + tau2.z = neighbor_search.all_atoms[neigh_idx].position_z; + int it2 = neighbor_search.all_atoms[neigh_idx].atom_type; + + ModuleBase::Vector3 dtau = (tau1 - tau2) * ucell.lat0; + double distance = dtau.norm(); + if (distance < lj_rcut(it, it2)) { - lj_potential += LJ_energy(distance, it, it2) - en_shift(it, it2); + pot_local += LJ_energy(distance, it, it2) - en_shift(it, it2); ModuleBase::Vector3 f_ij = LJ_force(dtau, it, it2); - lj_force(index, 0) += f_ij.x; - lj_force(index, 1) += f_ij.y; - lj_force(index, 2) += f_ij.z; - LJ_virial(f_ij, dtau); + lj_force(iat, 0) += f_ij.x; + lj_force(iat, 1) += f_ij.y; + lj_force(iat, 2) += f_ij.z; + + // per-thread virial accumulation + vl[0] += dtau.x * f_ij.x; + vl[1] += dtau.x * f_ij.y; + vl[2] += dtau.x * f_ij.z; + vl[3] += dtau.y * f_ij.x; + vl[4] += dtau.y * f_ij.y; + vl[5] += dtau.y * f_ij.z; + vl[6] += dtau.z * f_ij.x; + vl[7] += dtau.z * f_ij.y; + vl[8] += dtau.z * f_ij.z; } } - index++; } + +#pragma omp atomic + lj_potential += pot_local; + +#pragma omp atomic + lj_virial(0, 0) += vl[0]; +#pragma omp atomic + lj_virial(0, 1) += vl[1]; +#pragma omp atomic + lj_virial(0, 2) += vl[2]; +#pragma omp atomic + lj_virial(1, 0) += vl[3]; +#pragma omp atomic + lj_virial(1, 1) += vl[4]; +#pragma omp atomic + lj_virial(1, 2) += vl[5]; +#pragma omp atomic + lj_virial(2, 0) += vl[6]; +#pragma omp atomic + lj_virial(2, 1) += vl[7]; +#pragma omp atomic + lj_virial(2, 2) += vl[8]; } diff --git a/source/source_esolver/esolver_lj.h b/source/source_esolver/esolver_lj.h index 1a96510eaa4..473c9bef20f 100644 --- a/source/source_esolver/esolver_lj.h +++ b/source/source_esolver/esolver_lj.h @@ -2,7 +2,8 @@ #define ESOLVER_LJ_H #include "esolver.h" -#include "source_cell/module_neighlist/unitcell_lite.h" +#include "source_cell/module_neighlist/unitcell_plus.h" +#include namespace ModuleESolver { @@ -15,7 +16,7 @@ namespace ModuleESolver classname = "ESolver_LJ"; } - UnitCellLite change_from_ucell_to_ucell_lite(const UnitCell& ucell); + UnitCellPlus change_from_ucell_to_ucell_plus(const UnitCell& ucell); void before_all_runners(UnitCell& ucell, const Input_para& inp) override; @@ -55,6 +56,8 @@ namespace ModuleESolver double lj_potential=0.0; ModuleBase::matrix lj_force; ModuleBase::matrix lj_virial; + std::vector atom_type_index; ///< global atom index to UnitCell atom type + std::vector atom_local_index; ///< global atom index to local index inside atom type //--------------------------------------------------- }; } diff --git a/source/source_esolver/esolver_nep.cpp b/source/source_esolver/esolver_nep.cpp index 8944776aaa6..b586983a647 100644 --- a/source/source_esolver/esolver_nep.cpp +++ b/source/source_esolver/esolver_nep.cpp @@ -23,7 +23,7 @@ #include "source_io/module_output/output_log.h" #include "source_io/module_output/cif_io.h" -#include +#include #include using namespace ModuleESolver; @@ -34,9 +34,26 @@ void ESolver_NEP::before_all_runners(UnitCell& ucell, const Input_para& inp) nep_force.create(ucell.nat, 3); nep_virial.create(3, 3); atype.resize(ucell.nat); + nep_cell.resize(9); + nep_coord.resize(3 * ucell.nat); + nep_virial_sum.resize(9); _e.resize(ucell.nat); _f.resize(3 * ucell.nat); _v.resize(9 * ucell.nat); + atom_type_index.resize(ucell.nat); + atom_local_index.resize(ucell.nat); + + int iat = 0; + for (int it = 0; it < ucell.ntype; ++it) + { + for (int ia = 0; ia < ucell.atoms[it].na; ++ia) + { + atom_type_index[iat] = it; + atom_local_index[iat] = ia; + ++iat; + } + } + assert(ucell.nat == iat); ModuleIO::CifParser::write(PARAM.globalv.global_out_dir + "STRU.cif", ucell, @@ -56,39 +73,35 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep) // note that NEP are column major, thus a transpose is needed // cell - std::vector cell(9, 0.0); - cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom; - cell[1] = ucell.latvec.e21 * ucell.lat0_angstrom; - cell[2] = ucell.latvec.e31 * ucell.lat0_angstrom; - cell[3] = ucell.latvec.e12 * ucell.lat0_angstrom; - cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom; - cell[5] = ucell.latvec.e32 * ucell.lat0_angstrom; - cell[6] = ucell.latvec.e13 * ucell.lat0_angstrom; - cell[7] = ucell.latvec.e23 * ucell.lat0_angstrom; - cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom; + nep_cell[0] = ucell.latvec.e11 * ucell.lat0_angstrom; + nep_cell[1] = ucell.latvec.e21 * ucell.lat0_angstrom; + nep_cell[2] = ucell.latvec.e31 * ucell.lat0_angstrom; + nep_cell[3] = ucell.latvec.e12 * ucell.lat0_angstrom; + nep_cell[4] = ucell.latvec.e22 * ucell.lat0_angstrom; + nep_cell[5] = ucell.latvec.e32 * ucell.lat0_angstrom; + nep_cell[6] = ucell.latvec.e13 * ucell.lat0_angstrom; + nep_cell[7] = ucell.latvec.e23 * ucell.lat0_angstrom; + nep_cell[8] = ucell.latvec.e33 * ucell.lat0_angstrom; // coord - std::vector coord(3 * ucell.nat, 0.0); - int iat = 0; + nep_coord.resize(3 * ucell.nat); const int nat = ucell.nat; - for (int it = 0; it < ucell.ntype; ++it) +#pragma omp parallel for schedule(static) if (nat >= 256) + for (int iat = 0; iat < nat; ++iat) { - for (int ia = 0; ia < ucell.atoms[it].na; ++ia) - { - coord[iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom; - coord[iat + nat] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom; - coord[iat + 2 * nat] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom; - iat++; - } + const int it = atom_type_index[iat]; + const int ia = atom_local_index[iat]; + nep_coord[iat] = ucell.atoms[it].tau[ia].x * ucell.lat0_angstrom; + nep_coord[iat + nat] = ucell.atoms[it].tau[ia].y * ucell.lat0_angstrom; + nep_coord[iat + 2 * nat] = ucell.atoms[it].tau[ia].z * ucell.lat0_angstrom; } - assert(ucell.nat == iat); #ifdef __NEP nep_potential = 0.0; nep_force.zero_out(); nep_virial.zero_out(); - nep.compute(atype, cell, coord, _e, _f, _v); + nep.compute(atype, nep_cell, nep_coord, _e, _f, _v); // unit conversion const double fact_e = 1.0 / ModuleBase::Ry_to_eV; @@ -97,11 +110,18 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep) // potential energy - nep_potential = fact_e * std::accumulate(_e.begin(), _e.end(), 0.0) ; + double energy_sum = 0.0; +#pragma omp parallel for reduction(+:energy_sum) schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) + { + energy_sum += _e[i]; + } + nep_potential = fact_e * energy_sum; GlobalV::ofs_running << " #TOTAL ENERGY# " << std::setprecision(11) << nep_potential * ModuleBase::Ry_to_eV << " eV" << std::endl; // forces +#pragma omp parallel for schedule(static) if (nat >= 256) for (int i = 0; i < nat; ++i) { nep_force(i, 0) = _f[i] * fact_f; @@ -110,22 +130,44 @@ void ESolver_NEP::runner(UnitCell& ucell, const int istep) } // virial - std::vector v_sum(9, 0.0); - for (int j = 0; j < 9; ++j) + double v0 = 0.0; + double v1 = 0.0; + double v2 = 0.0; + double v3 = 0.0; + double v4 = 0.0; + double v5 = 0.0; + double v6 = 0.0; + double v7 = 0.0; + double v8 = 0.0; +#pragma omp parallel for reduction(+:v0, v1, v2, v3, v4, v5, v6, v7, v8) schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) { - for (int i = 0; i < nat; ++i) - { - int index = j * nat + i; - v_sum[j] += _v[index]; - } + v0 += _v[i]; + v1 += _v[nat + i]; + v2 += _v[2 * nat + i]; + v3 += _v[3 * nat + i]; + v4 += _v[4 * nat + i]; + v5 += _v[5 * nat + i]; + v6 += _v[6 * nat + i]; + v7 += _v[7 * nat + i]; + v8 += _v[8 * nat + i]; } + nep_virial_sum[0] = v0; + nep_virial_sum[1] = v1; + nep_virial_sum[2] = v2; + nep_virial_sum[3] = v3; + nep_virial_sum[4] = v4; + nep_virial_sum[5] = v5; + nep_virial_sum[6] = v6; + nep_virial_sum[7] = v7; + nep_virial_sum[8] = v8; // virial -> stress for (int i = 0; i < 3; ++i) { for (int j = 0; j < 3; ++j) { - nep_virial(i, j) = v_sum[3 * i + j] * fact_v; + nep_virial(i, j) = nep_virial_sum[3 * i + j] * fact_v; } } #else diff --git a/source/source_esolver/esolver_nep.h b/source/source_esolver/esolver_nep.h index dfec17a83c2..bcbd658c319 100644 --- a/source/source_esolver/esolver_nep.h +++ b/source/source_esolver/esolver_nep.h @@ -95,9 +95,14 @@ class ESolver_NEP : public ESolver std::string nep_file; ///< directory of NEP model file std::vector atype = {}; ///< atom type mapping for NEP model + std::vector atom_type_index; ///< global atom index to UnitCell atom type + std::vector atom_local_index; ///< global atom index to local index inside atom type double nep_potential; ///< computed potential energy ModuleBase::matrix nep_force; ///< computed atomic forces ModuleBase::matrix nep_virial; ///< computed lattice virials + std::vector nep_cell; ///< NEP cell buffer in Angstrom, column-major + std::vector nep_coord; ///< NEP coordinate buffer in Angstrom, column-major + std::vector nep_virial_sum; ///< summed per-atom virial components std::vector _e; ///< temporary storage for energy computation std::vector _f; ///< temporary storage for force computation std::vector _v; ///< temporary storage for virial computation @@ -105,4 +110,4 @@ class ESolver_NEP : public ESolver } // namespace ModuleESolver -#endif \ No newline at end of file +#endif diff --git a/source/source_md/fire.cpp b/source/source_md/fire.cpp index fa575b508d7..1260906e6e4 100644 --- a/source/source_md/fire.cpp +++ b/source/source_md/fire.cpp @@ -19,11 +19,7 @@ FIRE::FIRE(const Parameter& param_in, UnitCell& unit_in) : MD_base(param_in, uni n_min = 4; negative_count = 0; max = 0.0; - - // BUGFIX: - // Do not override the force convergence threshold read from INPUT. - // force_thr is stored in internal force unit, Hartree/Bohr. - // force_thr = 1e-3; + force_thr = 1e-3; } FIRE::~FIRE() @@ -156,44 +152,22 @@ void FIRE::restart(const std::string& global_readin_dir) return; } + void FIRE::check_force(void) { max = 0.0; - int movable_dof = 0; - - for (int i = 0; i < ucell.nat; ++i) + const int nat = ucell.nat; +#pragma omp parallel for default(none) shared(force, nat) reduction(max:max) \ + schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) { for (int j = 0; j < 3; ++j) { - // Only movable degrees of freedom should be used - // in the FIRE convergence criterion. - // - // For example: - // m 1 1 1 -> x/y/z are included. - // m 1 0 1 -> y is excluded. - // m 0 0 0 -> this atom contributes no DOF to convergence. - if (!ionmbl[i][j]) - { - continue; - } - - ++movable_dof; - - if (max < std::abs(force[i][j])) - { - max = std::abs(force[i][j]); - } + max = std::max(max, std::abs(force[i][j])); } } - // If there are no movable degrees of freedom, there is nothing to optimize. - if (movable_dof == 0) - { - stop = true; - return; - } - if (2.0 * max < force_thr) { stop = true; @@ -215,56 +189,25 @@ void FIRE::check_fire(void) dt_max = 2.5 * md_dt; } - int movable_dof = 0; - - // Compute P, |F| and |v| only on movable degrees of freedom. - // Fixed atoms/directions may have non-zero raw forces, but they should not - // affect the FIRE velocity projection or adaptive time-step control. - for (int i = 0; i < ucell.nat; ++i) - { - for (int j = 0; j < 3; ++j) - { - if (!ionmbl[i][j]) - { - // Keep frozen components clean. - vel[i][j] = 0.0; - continue; - } - - ++movable_dof; - - P += vel[i][j] * force[i][j]; - sumforce += force[i][j] * force[i][j]; - normvel += vel[i][j] * vel[i][j]; - } - } + const int nat = ucell.nat; - // No movable degrees of freedom: nothing to update. - if (movable_dof == 0) +#pragma omp parallel for reduction(+:P, sumforce, normvel) schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) { - return; + P += vel[i].x * force[i].x + vel[i].y * force[i].y + vel[i].z * force[i].z; + sumforce += force[i].norm2(); + normvel += vel[i].norm2(); } - sumforce = std::sqrt(sumforce); - normvel = std::sqrt(normvel); + sumforce = sqrt(sumforce); + normvel = sqrt(normvel); - // If force or velocity norm is zero, the velocity projection is undefined. - // Avoid 0/0. In a truly converged case check_force() should stop the run. - if (sumforce > 0.0 && normvel > 0.0) +#pragma omp parallel for schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) { - for (int i = 0; i < ucell.nat; ++i) + for (int j = 0; j < 3; ++j) { - for (int j = 0; j < 3; ++j) - { - if (!ionmbl[i][j]) - { - vel[i][j] = 0.0; - continue; - } - - vel[i][j] = (1.0 - alpha) * vel[i][j] - + alpha * force[i][j] / sumforce * normvel; - } + vel[i][j] = (1.0 - alpha) * vel[i][j] + alpha * force[i][j] / sumforce * normvel; } } @@ -282,7 +225,8 @@ void FIRE::check_fire(void) md_dt *= fdec; negative_count = 0; - for (int i = 0; i < ucell.nat; ++i) +#pragma omp parallel for schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) { for (int j = 0; j < 3; ++j) { @@ -292,6 +236,6 @@ void FIRE::check_fire(void) alpha = alpha_start; } - + return; } diff --git a/source/source_md/langevin.cpp b/source/source_md/langevin.cpp index 31996810aa0..a6b1ed12a28 100644 --- a/source/source_md/langevin.cpp +++ b/source/source_md/langevin.cpp @@ -88,14 +88,16 @@ void Langevin::post_force() if (my_rank == 0) { double t_target = MD_func::target_temp(step_ + step_rst_, mdp.md_nstep, md_tfirst, md_tlast); - ModuleBase::Vector3 fictitious_force; +#pragma omp parallel for default(none) shared(ucell, allmass, vel, md_damp, \ + t_target, md_dt, force, total_force) \ + schedule(static) if (ucell.nat >= 256) for (int i = 0; i < ucell.nat; ++i) { - fictitious_force = -allmass[i] * vel[i] / md_damp; + ModuleBase::Vector3 fictitious_force = -allmass[i] * vel[i] / md_damp; for (int j = 0; j < 3; ++j) { fictitious_force[j] += sqrt(24.0 * t_target * allmass[i] / md_damp / md_dt) - * (static_cast(std::rand()) / RAND_MAX - 0.5); + * (MD_func::uniform_rand_thread_safe() - 0.5); } total_force[i] = force[i] + fictitious_force; } diff --git a/source/source_md/md_base.cpp b/source/source_md/md_base.cpp index 390e1c2b082..f87fca9d2ee 100644 --- a/source/source_md/md_base.cpp +++ b/source/source_md/md_base.cpp @@ -96,7 +96,9 @@ void MD_base::update_pos() { if (my_rank == 0) { - for (int i = 0; i < ucell.nat; ++i) + const int natom = ucell.nat; +#pragma omp parallel for schedule(static) if (natom >= 256) + for (int i = 0; i < natom; ++i) { for (int k = 0; k < 3; ++k) { @@ -127,7 +129,9 @@ void MD_base::update_vel(const ModuleBase::Vector3* force) { if (my_rank == 0) { - for (int i = 0; i < ucell.nat; ++i) + const int natom = ucell.nat; +#pragma omp parallel for schedule(static) if (natom >= 256) + for (int i = 0; i < natom; ++i) { for (int k = 0; k < 3; ++k) { diff --git a/source/source_md/md_func.cpp b/source/source_md/md_func.cpp index 6bd1b60dd59..4d653c07e9a 100644 --- a/source/source_md/md_func.cpp +++ b/source/source_md/md_func.cpp @@ -4,6 +4,8 @@ #include "source_base/timer.h" #include "source_io/module_parameter/parameter.h" +#include + namespace MD_func { @@ -40,10 +42,27 @@ double gaussrand() return xx; } +double gaussrand_thread_safe() +{ + // thread_local guarantees each OpenMP thread gets its own RNG state, + // seeded independently via std::random_device. + thread_local std::mt19937 gen(std::random_device{}()); + thread_local std::normal_distribution dist(0.0, 1.0); + return dist(gen); +} + +double uniform_rand_thread_safe() +{ + thread_local std::mt19937 gen(std::random_device{}()); + thread_local std::uniform_real_distribution dist(0.0, 1.0); + return dist(gen); +} + double kinetic_energy(const int& natom, const ModuleBase::Vector3* vel, const double* allmass) { double ke = 0; +#pragma omp parallel for reduction(+:ke) schedule(static) if (natom >= 256) for (int ion = 0; ion < natom; ++ion) { ke += 0.5 * allmass[ion] * vel[ion].norm2(); @@ -52,6 +71,43 @@ double kinetic_energy(const int& natom, const ModuleBase::Vector3* vel, return ke; } +MDKineticState calc_kinetic_state(const int& natom, + const int& frozen_freedom, + const double* allmass, + const ModuleBase::Vector3* vel) +{ + MDKineticState state; + if (3 * natom == frozen_freedom) + { + return state; + } + + state.kinetic = kinetic_energy(natom, vel, allmass); + state.temperature = 2 * state.kinetic / (3 * natom - frozen_freedom); + return state; +} + +MDStressState calc_stress_state(const int& natom, + const double& omega, + const ModuleBase::Vector3* vel, + const double* allmass, + const ModuleBase::matrix& virial) +{ + MDStressState state; + temp_vector(natom, vel, allmass, state.temperature_tensor); + state.stress.create(3, 3); + + for (int i = 0; i < 3; ++i) + { + for (int j = 0; j < 3; ++j) + { + state.stress(i, j) = virial(i, j) + state.temperature_tensor(i, j) / omega; + } + } + + return state; +} + void compute_stress(const UnitCell& unit_in, const ModuleBase::Vector3* vel, const double* allmass, @@ -61,17 +117,7 @@ void compute_stress(const UnitCell& unit_in, { if (cal_stress) { - ModuleBase::matrix t_vector; - - temp_vector(unit_in.nat, vel, allmass, t_vector); - - for (int i = 0; i < 3; ++i) - { - for (int j = 0; j < 3; ++j) - { - stress(i, j) = virial(i, j) + t_vector(i, j) / unit_in.omega; - } - } + stress = calc_stress_state(unit_in.nat, unit_in.omega, vel, allmass, virial).stress; } return; @@ -121,6 +167,7 @@ void rescale_vel(const int& natom, factor = 0.5 * (3 * natom - frozen_freedom) * temperature / kinetic_energy(natom, vel, allmass); } +#pragma omp parallel for schedule(static) if (natom >= 256) for (int i = 0; i < natom; i++) { vel[i] = vel[i] * sqrt(factor); @@ -273,7 +320,9 @@ void force_virial(ModuleESolver::ESolver* p_esolver, force_temp *= 0.5; virial *= 0.5; - for (int i = 0; i < unit_in.nat; ++i) + const int natom = unit_in.nat; +#pragma omp parallel for schedule(static) if (natom >= 256) + for (int i = 0; i < natom; ++i) { for (int j = 0; j < 3; ++j) { @@ -463,8 +512,9 @@ double current_temp(double& kinetic, } else { - kinetic = kinetic_energy(natom, vel, allmass); - return 2 * kinetic / (3 * natom - frozen_freedom); + const MDKineticState state = calc_kinetic_state(natom, frozen_freedom, allmass, vel); + kinetic = state.kinetic; + return state.temperature; } } @@ -475,17 +525,45 @@ void temp_vector(const int& natom, { t_vector.create(3, 3); + double t00 = 0.0; + double t01 = 0.0; + double t02 = 0.0; + double t10 = 0.0; + double t11 = 0.0; + double t12 = 0.0; + double t20 = 0.0; + double t21 = 0.0; + double t22 = 0.0; + +#pragma omp parallel for reduction(+:t00, t01, t02, t10, t11, t12, t20, t21, t22) schedule(static) if (natom >= 256) for (int ion = 0; ion < natom; ++ion) { - for (int i = 0; i < 3; ++i) - { - for (int j = 0; j < 3; ++j) - { - t_vector(i, j) += allmass[ion] * vel[ion][i] * vel[ion][j]; - } - } + const double mass = allmass[ion]; + const double vx = vel[ion].x; + const double vy = vel[ion].y; + const double vz = vel[ion].z; + + t00 += mass * vx * vx; + t01 += mass * vx * vy; + t02 += mass * vx * vz; + t10 += mass * vy * vx; + t11 += mass * vy * vy; + t12 += mass * vy * vz; + t20 += mass * vz * vx; + t21 += mass * vz * vy; + t22 += mass * vz * vz; } + t_vector(0, 0) = t00; + t_vector(0, 1) = t01; + t_vector(0, 2) = t02; + t_vector(1, 0) = t10; + t_vector(1, 1) = t11; + t_vector(1, 2) = t12; + t_vector(2, 0) = t20; + t_vector(2, 1) = t21; + t_vector(2, 2) = t22; + return; } diff --git a/source/source_md/md_func.h b/source/source_md/md_func.h index be433ffe4ac..39c28910779 100644 --- a/source/source_md/md_func.h +++ b/source/source_md/md_func.h @@ -1,6 +1,7 @@ #ifndef MD_FUNC_H #define MD_FUNC_H +#include "md_statistics.h" #include "source_esolver/esolver.h" class Parameter; @@ -24,6 +25,28 @@ namespace MD_func */ double gaussrand(); +/** + * @brief generate a Gaussian random number (thread-safe) + * + * Uses thread_local std::mt19937 and std::normal_distribution. + * Thread-safe alternative to gaussrand() for use inside + * #pragma omp parallel regions. + * + * @return Gaussian random number with mean 0 and variance 1 + */ +double gaussrand_thread_safe(); + +/** + * @brief generate a uniform random number in [0,1) (thread-safe) + * + * Uses thread_local std::mt19937 and std::uniform_real_distribution. + * Thread-safe alternative to std::rand() for use inside + * #pragma omp parallel regions. + * + * @return uniform random number in [0, 1) + */ +double uniform_rand_thread_safe(); + /** * @brief initialize the atomic velocities * @@ -117,6 +140,14 @@ void force_virial(ModuleESolver::ESolver* p_esolver, */ double kinetic_energy(const int& natom, const ModuleBase::Vector3* vel, const double* allmass); +/** + * @brief calculate kinetic energy and temperature without writing caller-owned state + */ +MDKineticState calc_kinetic_state(const int& natom, + const int& frozen_freedom, + const double* allmass, + const ModuleBase::Vector3* vel); + /** * @brief calculate the total stress tensor * @@ -134,6 +165,15 @@ void compute_stress(const UnitCell& unit_in, const ModuleBase::matrix& virial, ModuleBase::matrix& stress); +/** + * @brief calculate stress and ionic temperature tensor without writing caller-owned state + */ +MDStressState calc_stress_state(const int& natom, + const double& omega, + const ModuleBase::Vector3* vel, + const double* allmass, + const ModuleBase::matrix& virial); + /** * @brief output the stress information * diff --git a/source/source_md/md_statistics.h b/source/source_md/md_statistics.h new file mode 100644 index 00000000000..e7bef175be5 --- /dev/null +++ b/source/source_md/md_statistics.h @@ -0,0 +1,23 @@ +#ifndef MD_STATISTICS_H +#define MD_STATISTICS_H + +#include "source_base/matrix.h" + +namespace MD_func +{ + +struct MDKineticState +{ + double kinetic = 0.0; + double temperature = 0.0; +}; + +struct MDStressState +{ + ModuleBase::matrix stress; + ModuleBase::matrix temperature_tensor; +}; + +} // namespace MD_func + +#endif // MD_STATISTICS_H diff --git a/source/source_md/msst.cpp b/source/source_md/msst.cpp index e33c24b45e5..dcd00b01926 100644 --- a/source/source_md/msst.cpp +++ b/source/source_md/msst.cpp @@ -239,8 +239,9 @@ void MSST::restart(const std::string& global_readin_dir) double MSST::vel_sum() const { double vsum = 0; - - for (int i = 0; i < ucell.nat; ++i) + const int nat = ucell.nat; +#pragma omp parallel for reduction(+:vsum) schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) { vsum += vel[i].norm2(); } @@ -262,7 +263,9 @@ void MSST::rescale(std::ofstream& ofs, const double& volume) unitcell::setup_cell_after_vc(ucell,ofs); /// rescale velocity - for (int i = 0; i < ucell.nat; ++i) + const int nat = ucell.nat; +#pragma omp parallel for schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) { vel[i][sd] *= dilation[sd]; } @@ -276,8 +279,10 @@ void MSST::propagate_vel() const int sd = mdp.msst_direction; const double dthalf = 0.5 * md_dt; const double fac = msst_vis * pow(omega[sd], 2) / (vsum * ucell.omega); + const int nat = ucell.nat; - for (int i = 0; i < ucell.nat; ++i) +#pragma omp parallel for schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) { ModuleBase::Vector3 const_C = force[i] / allmass[i]; ModuleBase::Vector3 const_D; diff --git a/source/source_md/nhchain.cpp b/source/source_md/nhchain.cpp index dc72669ec4e..76cce8da662 100644 --- a/source/source_md/nhchain.cpp +++ b/source/source_md/nhchain.cpp @@ -553,7 +553,9 @@ void Nose_Hoover::particle_thermo() } /// rescale velocity due to thermostats - for (int i = 0; i < ucell.nat; ++i) + const int nat = ucell.nat; +#pragma omp parallel for schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) { vel[i] *= scale; } @@ -695,7 +697,9 @@ void Nose_Hoover::vel_baro() factor[i] = exp(-(v_omega[i] + mtk_term) * md_dt / 4); } - for (int i = 0; i < ucell.nat; ++i) + const int nat = ucell.nat; +#pragma omp parallel for schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) { for (int j = 0; j < 3; ++j) { diff --git a/source/source_md/run_md.cpp b/source/source_md/run_md.cpp index ef28e5c8975..b7aab6511a0 100644 --- a/source/source_md/run_md.cpp +++ b/source/source_md/run_md.cpp @@ -12,41 +12,48 @@ #include "verlet.h" #include "source_cell/update_cell.h" #include "source_cell/print_cell.h" -namespace Run_MD -{ +#include -void md_line(UnitCell& unit_in, ModuleESolver::ESolver* p_esolver, const Parameter& param_in) +namespace +{ +std::unique_ptr create_md_runner(const Parameter& param_in, UnitCell& unit_in) { - ModuleBase::TITLE("Run_MD", "md_line"); - ModuleBase::timer::start("Run_MD", "md_line"); - - /// determine the md_type - MD_base* mdrun = nullptr; if (param_in.mdp.md_type == "fire") { - mdrun = new FIRE(param_in, unit_in); + return std::unique_ptr(new FIRE(param_in, unit_in)); } - else if ((param_in.mdp.md_type == "nvt" && param_in.mdp.md_thermostat == "nhc") || param_in.mdp.md_type == "npt") + if ((param_in.mdp.md_type == "nvt" && param_in.mdp.md_thermostat == "nhc") || param_in.mdp.md_type == "npt") { - mdrun = new Nose_Hoover(param_in, unit_in); + return std::unique_ptr(new Nose_Hoover(param_in, unit_in)); } - else if (param_in.mdp.md_type == "nve" || param_in.mdp.md_type == "nvt") + if (param_in.mdp.md_type == "nve" || param_in.mdp.md_type == "nvt") { - mdrun = new Verlet(param_in, unit_in); + return std::unique_ptr(new Verlet(param_in, unit_in)); } - else if (param_in.mdp.md_type == "langevin") + if (param_in.mdp.md_type == "langevin") { - mdrun = new Langevin(param_in, unit_in); + return std::unique_ptr(new Langevin(param_in, unit_in)); } - else if (param_in.mdp.md_type == "msst") + if (param_in.mdp.md_type == "msst") { - mdrun = new MSST(param_in, unit_in); - } - else - { - ModuleBase::WARNING_QUIT("md_line", "no such md_type!"); + return std::unique_ptr(new MSST(param_in, unit_in)); } + ModuleBase::WARNING_QUIT("md_line", "no such md_type!"); + return nullptr; +} +} // namespace + +namespace Run_MD +{ + +void md_line(UnitCell& unit_in, ModuleESolver::ESolver* p_esolver, const Parameter& param_in) +{ + ModuleBase::TITLE("Run_MD", "md_line"); + ModuleBase::timer::start("Run_MD", "md_line"); + + std::unique_ptr mdrun = create_md_runner(param_in, unit_in); + /// md cycle, mohan update 2026-01-04, change '<=' to '<' while ((mdrun->step_ + mdrun->step_rst_) < param_in.mdp.md_nstep && !mdrun->stop) { @@ -129,7 +136,6 @@ void md_line(UnitCell& unit_in, ModuleESolver::ESolver* p_esolver, const Paramet mdrun->step_++; } - delete mdrun; ModuleBase::timer::end("Run_MD", "md_line"); return; } diff --git a/source/source_md/test/CMakeLists.txt b/source/source_md/test/CMakeLists.txt index 3ff705c4f4a..2e04f1d70c0 100644 --- a/source/source_md/test/CMakeLists.txt +++ b/source/source_md/test/CMakeLists.txt @@ -37,7 +37,6 @@ list(APPEND depend_files ../../source_base/realarray.cpp ../../source_base/complexarray.cpp ../../source_base/complexmatrix.cpp - ../../source_base/global_variable.cpp ../../source_base/libm/branred.cpp ../../source_base/libm/sincos.cpp ../../source_base/math_integral.cpp diff --git a/source/source_md/test/fire_test.cpp b/source/source_md/test/fire_test.cpp index 3b294da46ac..e52ae4cbecd 100644 --- a/source/source_md/test/fire_test.cpp +++ b/source/source_md/test/fire_test.cpp @@ -5,9 +5,8 @@ #undef private #define private public #define protected public -#include "source_esolver/esolver_lj.h" #include "source_md/fire.h" -#include "setcell.h" +#include "md_test_fixture.h" #define doublethreshold 1e-12 /************************************************ @@ -35,31 +34,8 @@ * - output MD information such as energy, temperature, and pressure */ -class FIREtest : public testing::Test +class FIREtest : public MdIntegratorFixture { - protected: - MD_base* mdrun; - UnitCell ucell; - Parameter param_in; - ModuleESolver::ESolver* p_esolver; - - void SetUp() - { - Setcell::setupcell(ucell); - Setcell::parameters(param_in.input); - - p_esolver = new ModuleESolver::ESolver_LJ(); - p_esolver->before_all_runners(ucell, param_in.inp); - - mdrun = new FIRE(param_in, ucell); - mdrun->setup(p_esolver, PARAM.sys.global_readin_dir); - } - - void TearDown() - { - delete mdrun; - delete p_esolver; - } }; TEST_F(FIREtest, Setup) @@ -167,7 +143,7 @@ TEST_F(FIREtest, Restart) mdrun->restart(PARAM.sys.global_readin_dir); remove("Restart_md.txt"); - FIRE* fire = dynamic_cast(mdrun); + FIRE* fire = dynamic_cast(mdrun.get()); EXPECT_EQ(mdrun->step_rst_, 3); EXPECT_EQ(fire->alpha, 0.1); EXPECT_EQ(fire->negative_count, 0); diff --git a/source/source_md/test/langevin_test.cpp b/source/source_md/test/langevin_test.cpp index 69df605b153..65d462a86da 100644 --- a/source/source_md/test/langevin_test.cpp +++ b/source/source_md/test/langevin_test.cpp @@ -5,9 +5,8 @@ #undef private #define private public #define protected public -#include "source_esolver/esolver_lj.h" #include "source_md/langevin.h" -#include "setcell.h" +#include "md_test_fixture.h" #define doublethreshold 1e-12 /************************************************ @@ -35,31 +34,8 @@ * - output MD information such as energy, temperature, and pressure */ -class Langevin_test : public testing::Test +class Langevin_test : public MdIntegratorFixture { - protected: - MD_base* mdrun; - UnitCell ucell; - Parameter param_in; - ModuleESolver::ESolver* p_esolver; - - void SetUp() - { - Setcell::setupcell(ucell); - Setcell::parameters(param_in.input); - - p_esolver = new ModuleESolver::ESolver_LJ(); - p_esolver->before_all_runners(ucell, param_in.inp); - - mdrun = new Langevin(param_in, ucell); - mdrun->setup(p_esolver, PARAM.sys.global_readin_dir); - } - - void TearDown() - { - delete mdrun; - delete p_esolver; - } }; TEST_F(Langevin_test, setup) diff --git a/source/source_md/test/lj_pot_test.cpp b/source/source_md/test/lj_pot_test.cpp index 64c0b52fe6b..99cec432b56 100644 --- a/source/source_md/test/lj_pot_test.cpp +++ b/source/source_md/test/lj_pot_test.cpp @@ -1,9 +1,9 @@ #include "gtest/gtest.h" #define private public #include "source_io/module_parameter/parameter.h" +#include "md_test_fixture.h" #include "source_esolver/esolver_lj.h" #include "source_md/md_func.h" -#include "setcell.h" #undef private #define doublethreshold 1e-12 @@ -17,46 +17,23 @@ * - calculate energy, force, virial for lj pot */ -class LJ_pot_test : public testing::Test +class LJ_pot_test : public LjPotTestFixture { - protected: - ModuleBase::Vector3* force; - ModuleBase::matrix stress; - double potential; - int natom; - UnitCell ucell; - Input_para input; - - void SetUp() - { - Setcell::setupcell(ucell); - - natom = ucell.nat; - force = new ModuleBase::Vector3[natom]; - stress.create(3, 3); - - Setcell::parameters(input); - } - - void TearDown() - { - delete[] force; - } }; TEST_F(LJ_pot_test, potential) { - ModuleESolver::ESolver* p_esolver = new ModuleESolver::ESolver_LJ(); + std::unique_ptr p_esolver(new ModuleESolver::ESolver_LJ()); p_esolver->before_all_runners(ucell, input); - MD_func::force_virial(p_esolver, 0, ucell, potential, force, true, stress); + MD_func::force_virial(p_esolver.get(), 0, ucell, potential, force, true, stress); EXPECT_NEAR(potential, -0.011957818623534381, doublethreshold); } TEST_F(LJ_pot_test, force) { - ModuleESolver::ESolver* p_esolver = new ModuleESolver::ESolver_LJ(); + std::unique_ptr p_esolver(new ModuleESolver::ESolver_LJ()); p_esolver->before_all_runners(ucell, input); - MD_func::force_virial(p_esolver, 0, ucell, potential, force, true, stress); + MD_func::force_virial(p_esolver.get(), 0, ucell, potential, force, true, stress); EXPECT_NEAR(force[0].x, 0.00049817733089377704, doublethreshold); EXPECT_NEAR(force[0].y, 0.00082237246837022328, doublethreshold); EXPECT_NEAR(force[0].z, -3.0493186101154812e-20, doublethreshold); @@ -73,9 +50,9 @@ TEST_F(LJ_pot_test, force) TEST_F(LJ_pot_test, stress) { - ModuleESolver::ESolver* p_esolver = new ModuleESolver::ESolver_LJ(); + std::unique_ptr p_esolver(new ModuleESolver::ESolver_LJ()); p_esolver->before_all_runners(ucell, input); - MD_func::force_virial(p_esolver, 0, ucell, potential, force, true, stress); + MD_func::force_virial(p_esolver.get(), 0, ucell, potential, force, true, stress); EXPECT_NEAR(stress(0, 0), 8.0360222227631859e-07, doublethreshold); EXPECT_NEAR(stress(0, 1), 1.7207745586539077e-07, doublethreshold); EXPECT_NEAR(stress(0, 2), 0, doublethreshold); @@ -89,7 +66,7 @@ TEST_F(LJ_pot_test, stress) TEST_F(LJ_pot_test, RcutSearchRadius) { - ModuleESolver::ESolver_LJ* p_esolver = new ModuleESolver::ESolver_LJ(); + std::unique_ptr p_esolver(new ModuleESolver::ESolver_LJ()); ucell.ntype = 2; std::vector rcut = {3.0}; p_esolver->rcut_search_radius(ucell.ntype, rcut); @@ -114,7 +91,7 @@ TEST_F(LJ_pot_test, RcutSearchRadius) TEST_F(LJ_pot_test, SetC6C12) { - ModuleESolver::ESolver_LJ* p_esolver = new ModuleESolver::ESolver_LJ(); + std::unique_ptr p_esolver(new ModuleESolver::ESolver_LJ()); ucell.ntype = 2; // no rule @@ -187,7 +164,7 @@ TEST_F(LJ_pot_test, SetC6C12) TEST_F(LJ_pot_test, CalEnShift) { - ModuleESolver::ESolver_LJ* p_esolver = new ModuleESolver::ESolver_LJ(); + std::unique_ptr p_esolver(new ModuleESolver::ESolver_LJ()); ucell.ntype = 2; std::vector rcut = {3.0}; @@ -214,4 +191,4 @@ TEST_F(LJ_pot_test, CalEnShift) EXPECT_NEAR(p_esolver->en_shift(0, 1), -3.303688865319793e-07, doublethreshold); EXPECT_NEAR(p_esolver->en_shift(1, 0), -3.303688865319793e-07, doublethreshold); EXPECT_NEAR(p_esolver->en_shift(1, 1), -5.6443326024140752e-06, doublethreshold); -} \ No newline at end of file +} diff --git a/source/source_md/test/md_func_test.cpp b/source/source_md/test/md_func_test.cpp index eb9ce57a5f9..a9bae026fa8 100644 --- a/source/source_md/test/md_func_test.cpp +++ b/source/source_md/test/md_func_test.cpp @@ -5,9 +5,8 @@ #undef private #define private public #define protected public -#include "source_esolver/esolver_lj.h" +#include "md_test_fixture.h" #include "source_md/md_func.h" -#include "setcell.h" #define doublethreshold 1e-12 /************************************************ @@ -50,45 +49,8 @@ * - test the current_md_info function with an incorrect file path */ -class MD_func_test : public testing::Test +class MD_func_test : public MdFuncTestFixture { - protected: - UnitCell ucell; - double* allmass; // atom mass - ModuleBase::Vector3* pos; // atom position - ModuleBase::Vector3* vel; // atom velocity - ModuleBase::Vector3* ionmbl; // atom is frozen or not - ModuleBase::Vector3* force; // atom force - ModuleBase::matrix virial; // virial for this lattice - ModuleBase::matrix stress; // stress for this lattice - double potential; // potential energy - int natom; // atom number - double temperature; // temperature - int frozen_freedom; // frozen_freedom - Parameter param_in; - - void SetUp() - { - Setcell::setupcell(ucell); - Setcell::parameters(param_in.input); - natom = ucell.nat; - allmass = new double[natom]; - pos = new ModuleBase::Vector3[natom]; - ionmbl = new ModuleBase::Vector3[natom]; - vel = new ModuleBase::Vector3[natom]; - force = new ModuleBase::Vector3[natom]; - stress.create(3, 3); - virial.create(3, 3); - } - - void TearDown() - { - delete[] allmass; - delete[] pos; - delete[] vel; - delete[] ionmbl; - delete[] force; - } }; TEST_F(MD_func_test, gaussrand) diff --git a/source/source_md/test/md_test_fixture.h b/source/source_md/test/md_test_fixture.h new file mode 100644 index 00000000000..fccc19e96f0 --- /dev/null +++ b/source/source_md/test/md_test_fixture.h @@ -0,0 +1,111 @@ +#ifndef MD_TEST_FIXTURE_H +#define MD_TEST_FIXTURE_H + +#include "gtest/gtest.h" +#include "source_esolver/esolver_lj.h" +#include "source_io/module_parameter/parameter.h" +#include "source_md/md_base.h" +#include "setcell.h" + +#include +#include + +class MdTestBase : public testing::Test +{ + protected: + UnitCell ucell; + Parameter param_in; + std::unique_ptr p_esolver; + + void SetUp() override + { + Setcell::setupcell(ucell); + Setcell::parameters(param_in.input); + + p_esolver.reset(new ModuleESolver::ESolver_LJ()); + p_esolver->before_all_runners(ucell, param_in.inp); + } +}; + +template +class MdIntegratorFixture : public MdTestBase +{ + protected: + std::unique_ptr mdrun; + + void SetUp() override + { + MdTestBase::SetUp(); + mdrun.reset(new Integrator(param_in, ucell)); + mdrun->setup(p_esolver.get(), PARAM.sys.global_readin_dir); + } +}; + +class MdFuncTestFixture : public testing::Test +{ + protected: + UnitCell ucell; + std::vector allmass_store; + std::vector> pos_store; + std::vector> vel_store; + std::vector> ionmbl_store; + std::vector> force_store; + double* allmass = nullptr; + ModuleBase::Vector3* pos = nullptr; + ModuleBase::Vector3* vel = nullptr; + ModuleBase::Vector3* ionmbl = nullptr; + ModuleBase::Vector3* force = nullptr; + ModuleBase::matrix virial; + ModuleBase::matrix stress; + double potential = 0.0; + int natom = 0; + double temperature = 0.0; + int frozen_freedom = 0; + Parameter param_in; + + void SetUp() override + { + Setcell::setupcell(ucell); + Setcell::parameters(param_in.input); + natom = ucell.nat; + + allmass_store.resize(natom); + pos_store.resize(natom); + vel_store.resize(natom); + ionmbl_store.resize(natom); + force_store.resize(natom); + allmass = allmass_store.data(); + pos = pos_store.data(); + vel = vel_store.data(); + ionmbl = ionmbl_store.data(); + force = force_store.data(); + stress.create(3, 3); + virial.create(3, 3); + } +}; + +class LjPotTestFixture : public testing::Test +{ + protected: + std::vector> force_store; + ModuleBase::Vector3* force = nullptr; + ModuleBase::matrix stress; + double potential = 0.0; + int natom = 0; + UnitCell ucell; + Input_para input; + + void SetUp() override + { + Setcell::setupcell(ucell); + + natom = ucell.nat; + force_store.resize(natom); + force = force_store.data(); + stress.create(3, 3); + + Setcell::parameters(input); + } +}; + +#endif // MD_TEST_FIXTURE_H diff --git a/source/source_md/test/msst_test.cpp b/source/source_md/test/msst_test.cpp index 7d0fd8054d1..8b3982be73b 100644 --- a/source/source_md/test/msst_test.cpp +++ b/source/source_md/test/msst_test.cpp @@ -5,9 +5,8 @@ #undef private #define private public #define protected public -#include "source_esolver/esolver_lj.h" #include "source_md/msst.h" -#include "setcell.h" +#include "md_test_fixture.h" #define doublethreshold 1e-12 /************************************************ @@ -35,31 +34,8 @@ * - output MD information such as energy, temperature, and pressure */ -class MSST_test : public testing::Test +class MSST_test : public MdIntegratorFixture { - protected: - MD_base* mdrun; - UnitCell ucell; - Parameter param_in; - ModuleESolver::ESolver* p_esolver; - - void SetUp() - { - Setcell::setupcell(ucell); - Setcell::parameters(param_in.input); - - p_esolver = new ModuleESolver::ESolver_LJ(); - p_esolver->before_all_runners(ucell, param_in.inp); - - mdrun = new MSST(param_in, ucell); - mdrun->setup(p_esolver, PARAM.sys.global_readin_dir); - } - - void TearDown() - { - delete mdrun; - delete p_esolver; - } }; TEST_F(MSST_test, setup) @@ -208,7 +184,7 @@ TEST_F(MSST_test, restart) mdrun->restart(PARAM.sys.global_readin_dir); remove("Restart_md.txt"); - MSST* msst = dynamic_cast(mdrun); + MSST* msst = dynamic_cast(mdrun.get()); EXPECT_EQ(mdrun->step_rst_, 3); EXPECT_EQ(msst->omega[mdrun->mdp.msst_direction], -0.00977662); EXPECT_EQ(msst->e0, -0.00768262); diff --git a/source/source_md/test/nhchain_test.cpp b/source/source_md/test/nhchain_test.cpp index 647df0a730d..4f7a4244011 100644 --- a/source/source_md/test/nhchain_test.cpp +++ b/source/source_md/test/nhchain_test.cpp @@ -5,9 +5,8 @@ #undef private #define private public #define protected public -#include "source_esolver/esolver_lj.h" #include "source_md/nhchain.h" -#include "setcell.h" +#include "md_test_fixture.h" #define doublethreshold 1e-12 /************************************************ * unit test of functions in nhchain.h @@ -33,34 +32,20 @@ * - Nose_Hoover::print_md * - output MD information such as energy, temperature, and pressure */ -class NHC_test : public testing::Test +class NHC_test : public MdTestBase { protected: - MD_base* mdrun; - UnitCell ucell; - Parameter param_in; - ModuleESolver::ESolver* p_esolver; + std::unique_ptr mdrun; - void SetUp() + void SetUp() override { - Setcell::setupcell(ucell); - Setcell::parameters(param_in.input); - - p_esolver = new ModuleESolver::ESolver_LJ(); - p_esolver->before_all_runners(ucell, param_in.inp); - + MdTestBase::SetUp(); param_in.input.mdp.md_type = "npt"; param_in.input.mdp.md_pmode = "tri"; param_in.input.mdp.md_pfirst = 1; param_in.input.mdp.md_plast = 1; - mdrun = new Nose_Hoover(param_in, ucell); - mdrun->setup(p_esolver, PARAM.sys.global_readin_dir); - } - - void TearDown() - { - delete mdrun; - delete p_esolver; + mdrun.reset(new Nose_Hoover(param_in, ucell)); + mdrun->setup(p_esolver.get(), PARAM.sys.global_readin_dir); } }; @@ -179,7 +164,7 @@ TEST_F(NHC_test, restart) mdrun->restart(PARAM.sys.global_readin_dir); remove("Restart_md.txt"); - Nose_Hoover* nhc = dynamic_cast(mdrun); + Nose_Hoover* nhc = dynamic_cast(mdrun.get()); EXPECT_EQ(mdrun->step_rst_, 3); EXPECT_EQ(mdrun->mdp.md_tchain, 4); EXPECT_EQ(mdrun->mdp.md_pchain, 4); diff --git a/source/source_md/test/verlet_test.cpp b/source/source_md/test/verlet_test.cpp index 86dc66a85e1..d859f0b9ff8 100644 --- a/source/source_md/test/verlet_test.cpp +++ b/source/source_md/test/verlet_test.cpp @@ -5,9 +5,8 @@ #undef private #define private public #define protected public -#include "source_esolver/esolver_lj.h" #include "source_md/verlet.h" -#include "setcell.h" +#include "md_test_fixture.h" #define doublethreshold 1e-12 @@ -36,30 +35,8 @@ * - output MD information such as energy, temperature, and pressure */ -class Verlet_test : public testing::Test +class Verlet_test : public MdIntegratorFixture { - protected: - MD_base* mdrun; - UnitCell ucell; - Parameter param_in; - ModuleESolver::ESolver* p_esolver; - - void SetUp() - { - Setcell::setupcell(ucell); - Setcell::parameters(param_in.input); - - p_esolver = new ModuleESolver::ESolver_LJ(); - p_esolver->before_all_runners(ucell, param_in.inp); - - mdrun = new Verlet(param_in, ucell); - mdrun->setup(p_esolver, PARAM.sys.global_readin_dir); - } - - void TearDown() - { - delete mdrun; - } }; TEST_F(Verlet_test, setup) diff --git a/source/source_md/verlet.cpp b/source/source_md/verlet.cpp index 3fd79ff1b68..825c46133f1 100644 --- a/source/source_md/verlet.cpp +++ b/source/source_md/verlet.cpp @@ -75,17 +75,18 @@ void Verlet::apply_thermostat(void) { if (my_rank == 0) { - double deviation = 0.0; +#pragma omp parallel for default(none) shared(ucell, mdp, md_tlast, allmass, ionmbl, vel) \ + schedule(static) if (ucell.nat >= 256) for (int i = 0; i < ucell.nat; ++i) { - if (static_cast(std::rand()) / RAND_MAX <= 1.0 / mdp.md_nraise) + if (MD_func::uniform_rand_thread_safe() <= 1.0 / mdp.md_nraise) { - deviation = sqrt(md_tlast / allmass[i]); + double deviation = sqrt(md_tlast / allmass[i]); for (int k = 0; k < 3; ++k) { if (ionmbl[i][k]) { - vel[i][k] = deviation * MD_func::gaussrand(); + vel[i][k] = deviation * MD_func::gaussrand_thread_safe(); } } } @@ -124,7 +125,9 @@ void Verlet::thermalize(const int& nraise, const double& current_temp, const dou fac = sqrt(target_temp / current_temp); } - for (int i = 0; i < ucell.nat; ++i) + const int nat = ucell.nat; +#pragma omp parallel for schedule(static) if (nat >= 256) + for (int i = 0; i < nat; ++i) { vel[i] *= fac; } @@ -158,14 +161,16 @@ void Verlet::apply_csvr(const double& current_temp, const double& target_temp) factor = exp(-1.0 / taut); } - // Generate Gaussian random numbers using MD_func - double rr = MD_func::gaussrand(); + // Generate Gaussian random numbers using thread-safe RNG + double rr = MD_func::gaussrand_thread_safe(); // Calculate sum of squared Gaussian random numbers (ndeg - 1) double sumnoises = 0.0; +#pragma omp parallel for default(none) shared(ndeg) reduction(+:sumnoises) \ + schedule(static) if (ndeg >= 256) for (int i = 0; i < ndeg - 1; ++i) { - double r = MD_func::gaussrand(); + double r = MD_func::gaussrand_thread_safe(); sumnoises += r * r; } @@ -179,7 +184,9 @@ void Verlet::apply_csvr(const double& current_temp, const double& target_temp) // Calculate scaling factor double scale = sqrt(resample); - // Apply velocity scaling + // Apply velocity scaling (embarrassingly parallel) +#pragma omp parallel for default(none) shared(ucell, vel, scale) \ + schedule(static) if (ucell.nat >= 256) for (int i = 0; i < ucell.nat; ++i) { vel[i] *= scale;