Parallelization with OpenMP, 2-3x faster than before

This commit is contained in:
Dominik Demuth
2026-03-08 12:32:30 +01:00
parent e45ef8162d
commit 6b42051e45
3 changed files with 53 additions and 5 deletions

View File

@@ -3,6 +3,9 @@ project(RWSim VERSION 1.0)
set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD 17)
find_package(OpenMP REQUIRED)
add_subdirectory(src) add_subdirectory(src)
target_compile_options(rwsim PUBLIC -Werror -Wall -Wextra -Wconversion -O2) target_compile_options(rwsim PUBLIC -Werror -Wall -Wextra -Wconversion -O2)
target_link_libraries(rwsim PUBLIC OpenMP::OpenMP_CXX)

View File

@@ -5,7 +5,7 @@ add_subdirectory(utils)
add_subdirectory(experiments) add_subdirectory(experiments)
add_library(simulation STATIC sims.cpp sims.h) add_library(simulation STATIC sims.cpp sims.h)
target_link_libraries(simulation PRIVATE utils experiments) target_link_libraries(simulation PRIVATE utils experiments OpenMP::OpenMP_CXX)
add_executable( add_executable(
rwsim rwsim

View File

@@ -3,6 +3,7 @@
#include <iostream> #include <iostream>
#include <chrono> #include <chrono>
#include <omp.h>
void run_simulation( void run_simulation(
@@ -20,12 +21,56 @@ void run_simulation(
experiment.setup(parameter, optional); experiment.setup(parameter, optional);
const auto start = printStart(optional); const auto start = printStart(optional);
const int num_threads = omp_get_max_threads();
// Create per-thread RNGs seeded deterministically from the main RNG
std::vector<std::mt19937_64> thread_rngs;
thread_rngs.reserve(num_threads);
for (int i = 0; i < num_threads; i++) {
thread_rngs.emplace_back(rng());
}
// Create per-thread clones of motion, distribution, and experiment
std::vector<std::unique_ptr<motions::BaseMotion>> thread_motions;
std::vector<std::unique_ptr<times::BaseDistribution>> thread_dists;
std::vector<std::unique_ptr<Experiment>> thread_experiments;
for (int i = 0; i < num_threads; i++) {
thread_motions.push_back(motion.clone());
thread_dists.push_back(dist.clone());
thread_experiments.push_back(experiment.clone());
}
int steps_done = 0;
auto last_print_out = std::chrono::system_clock::now(); auto last_print_out = std::chrono::system_clock::now();
for (int mol_i = 0; mol_i < num_walker; mol_i++) { #pragma omp parallel
auto traj = make_trajectory(motion, dist, experiment.tmax(), rng); {
experiment.accumulate(traj, motion.getInitOmega(), num_walker); const int tid = omp_get_thread_num();
last_print_out = printSteps(last_print_out, start, num_walker, mol_i); auto& local_rng = thread_rngs[tid];
auto& local_motion = *thread_motions[tid];
auto& local_dist = *thread_dists[tid];
auto& local_experiment = *thread_experiments[tid];
#pragma omp for schedule(static)
for (int mol_i = 0; mol_i < num_walker; mol_i++) {
auto traj = make_trajectory(local_motion, local_dist, experiment.tmax(), local_rng);
local_experiment.accumulate(traj, local_motion.getInitOmega(), num_walker);
if (tid == 0) {
#pragma omp atomic
steps_done++;
last_print_out = printSteps(last_print_out, start, num_walker, steps_done);
} else {
#pragma omp atomic
steps_done++;
}
}
}
// Merge per-thread results
for (int i = 0; i < num_threads; i++) {
experiment.merge(*thread_experiments[i]);
} }
experiment.save(motion, dist); experiment.save(motion, dist);