Parallelization with OpenMP, 2-3x faster than before

2026-03-08 12:32:30 +01:00
parent e45ef8162d
commit 6b42051e45
3 changed files with 53 additions and 5 deletions
@@ -3,6 +3,9 @@ project(RWSim VERSION 1.0)
 set(CMAKE_CXX_STANDARD 17)
 find_package(OpenMP REQUIRED)
 add_subdirectory(src)
 target_compile_options(rwsim PUBLIC -Werror -Wall -Wextra -Wconversion -O2)
 target_link_libraries(rwsim PUBLIC OpenMP::OpenMP_CXX)
@@ -5,7 +5,7 @@ add_subdirectory(utils)
 add_subdirectory(experiments)
 add_library(simulation STATIC sims.cpp sims.h)
-target_link_libraries(simulation PRIVATE utils experiments)
+target_link_libraries(simulation PRIVATE utils experiments OpenMP::OpenMP_CXX)
 add_executable(
        rwsim
@@ -3,6 +3,7 @@
 #include <iostream>
 #include <chrono>
 #include <omp.h>
 void run_simulation(
@@ -20,12 +21,56 @@ void run_simulation(
    experiment.setup(parameter, optional);
    const auto start = printStart(optional);
    const int num_threads = omp_get_max_threads();
    // Create per-thread RNGs seeded deterministically from the main RNG
    std::vector<std::mt19937_64> thread_rngs;
    thread_rngs.reserve(num_threads);
    for (int i = 0; i < num_threads; i++) {
        thread_rngs.emplace_back(rng());
    }
    // Create per-thread clones of motion, distribution, and experiment
    std::vector<std::unique_ptr<motions::BaseMotion>> thread_motions;
    std::vector<std::unique_ptr<times::BaseDistribution>> thread_dists;
    std::vector<std::unique_ptr<Experiment>> thread_experiments;
    for (int i = 0; i < num_threads; i++) {
        thread_motions.push_back(motion.clone());
        thread_dists.push_back(dist.clone());
        thread_experiments.push_back(experiment.clone());
    }
    int steps_done = 0;
    auto last_print_out = std::chrono::system_clock::now();
-    for (int mol_i = 0; mol_i < num_walker; mol_i++) {
+    #pragma omp parallel
-        auto traj = make_trajectory(motion, dist, experiment.tmax(), rng);
+    {
-        experiment.accumulate(traj, motion.getInitOmega(), num_walker);
+        const int tid = omp_get_thread_num();
-        last_print_out = printSteps(last_print_out, start, num_walker, mol_i);
+        auto& local_rng = thread_rngs[tid];
        auto& local_motion = *thread_motions[tid];
        auto& local_dist = *thread_dists[tid];
        auto& local_experiment = *thread_experiments[tid];
        // Split the walkers across the thread team. Each thread simulates with
        // its own RNG, motion, distribution and experiment clone; only the
        // progress counter below is shared between threads.
        #pragma omp for schedule(static)
        for (int mol_i = 0; mol_i < num_walker; mol_i++) {
            auto traj = make_trajectory(local_motion, local_dist, experiment.tmax(), local_rng);
            local_experiment.accumulate(traj, local_motion.getInitOmega(), num_walker);

            // Bump the shared counter and take the updated value in a single
            // atomic step. Reading steps_done in a separate, unprotected
            // statement would race with the increments of the other threads.
            int done_now = 0;
            #pragma omp atomic capture
            done_now = ++steps_done;

            // Progress is reported by thread 0 only, so last_print_out has a
            // single writer and needs no further synchronisation.
            if (tid == 0) {
                last_print_out = printSteps(last_print_out, start, num_walker, done_now);
            }
        }
    }
    // Merge per-thread results
    for (int i = 0; i < num_threads; i++) {
        experiment.merge(*thread_experiments[i]);
    }
    experiment.save(motion, dist);