Parallelization with OpenMP, 2-3x faster than before

2026-03-08 12:32:30 +01:00
parent e45ef8162d
commit 6b42051e45
3 changed files with 53 additions and 5 deletions
@@ -3,6 +3,9 @@ project(RWSim VERSION 1.0)

 set(CMAKE_CXX_STANDARD 17)

+find_package(OpenMP REQUIRED)
+
 add_subdirectory(src)

 target_compile_options(rwsim PUBLIC -Werror -Wall -Wextra -Wconversion -O2)
+target_link_libraries(rwsim PUBLIC OpenMP::OpenMP_CXX)
@@ -5,7 +5,7 @@ add_subdirectory(utils)
 add_subdirectory(experiments)

 add_library(simulation STATIC sims.cpp sims.h)
-target_link_libraries(simulation PRIVATE utils experiments)
+target_link_libraries(simulation PRIVATE utils experiments OpenMP::OpenMP_CXX)

 add_executable(
        rwsim
@@ -3,6 +3,7 @@

 #include <iostream>
 #include <chrono>
+#include <omp.h>


 void run_simulation(
@@ -20,12 +21,56 @@ void run_simulation(
    experiment.setup(parameter, optional);

    const auto start = printStart(optional);
+
+    const int num_threads = omp_get_max_threads();
+
+    // Create per-thread RNGs seeded deterministically from the main RNG
+    std::vector<std::mt19937_64> thread_rngs;
+    thread_rngs.reserve(num_threads);
+    for (int i = 0; i < num_threads; i++) {
+        thread_rngs.emplace_back(rng());
+    }
+
+    // Create per-thread clones of motion, distribution, and experiment
+    std::vector<std::unique_ptr<motions::BaseMotion>> thread_motions;
+    std::vector<std::unique_ptr<times::BaseDistribution>> thread_dists;
+    std::vector<std::unique_ptr<Experiment>> thread_experiments;
+    for (int i = 0; i < num_threads; i++) {
+        thread_motions.push_back(motion.clone());
+        thread_dists.push_back(dist.clone());
+        thread_experiments.push_back(experiment.clone());
+    }
+
+    int steps_done = 0;
    auto last_print_out = std::chrono::system_clock::now();

-    for (int mol_i = 0; mol_i < num_walker; mol_i++) {
-        auto traj = make_trajectory(motion, dist, experiment.tmax(), rng);
-        experiment.accumulate(traj, motion.getInitOmega(), num_walker);
-        last_print_out = printSteps(last_print_out, start, num_walker, mol_i);
+    #pragma omp parallel
+    {
+        const int tid = omp_get_thread_num();
+        auto& local_rng = thread_rngs[tid];
+        auto& local_motion = *thread_motions[tid];
+        auto& local_dist = *thread_dists[tid];
+        auto& local_experiment = *thread_experiments[tid];
+
+        #pragma omp for schedule(static)
+        for (int mol_i = 0; mol_i < num_walker; mol_i++) {
+            auto traj = make_trajectory(local_motion, local_dist, experiment.tmax(), local_rng);
+            local_experiment.accumulate(traj, local_motion.getInitOmega(), num_walker);
+
+            // Increment and snapshot the shared counter in one atomic step: a plain
+            // read of steps_done would race with the other threads' atomic updates.
+            int done;
+            #pragma omp atomic capture
+            done = ++steps_done;
+            if (tid == 0) {  // only thread 0 touches last_print_out and prints progress
+                last_print_out = printSteps(last_print_out, start, num_walker, done);
+            }
+        }
+    }
+
+    // Merge per-thread results
+    for (int i = 0; i < num_threads; i++) {
+        experiment.merge(*thread_experiments[i]);
    }

    experiment.save(motion, dist);