From 96639a9582d88a088f162bf6e8eea95d7b73cb18 Mon Sep 17 00:00:00 2001 From: Augustin Degomme Date: Thu, 30 Sep 2021 17:44:09 +0200 Subject: [PATCH] Add a simgrid barrier at the end of the MPI_Init call. Rationale is that if one process is delayed in its initialization due to previous work, other processes might start sending to it even though it has not yet been registered in the instance and in the comm world. Thanks to Julien Emmanuel for the report. There is still an issue with ti-tracing and replay. Each process writes the path to its ti-trace file into a common file, and if the processes do not initialize in exactly the expected order, a subsequent replay would assign the wrong trace to each process. We should find a new way to generate this file properly. For now, since the barrier is done after this initialization, this bug remains swept under the rug. --- src/smpi/include/private.hpp | 1 + src/smpi/include/smpi_actor.hpp | 1 + src/smpi/internals/smpi_deployment.cpp | 12 +++++++++++- src/smpi/internals/smpi_global.cpp | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/smpi/include/private.hpp b/src/smpi/include/private.hpp index e3734a50ed..022643fad0 100644 --- a/src/smpi/include/private.hpp +++ b/src/smpi/include/private.hpp @@ -80,6 +80,7 @@ XBT_PRIVATE int smpi_get_universe_size(); XBT_PRIVATE void smpi_deployment_register_process(const std::string& instance_id, int rank, const simgrid::s4u::Actor* actor); +XBT_PRIVATE void smpi_deployment_startup_barrier(const std::string& instance_id); XBT_PRIVATE void smpi_deployment_unregister_process(const std::string& instance_id); XBT_PRIVATE MPI_Comm* smpi_deployment_comm_world(const std::string& instance_id); diff --git a/src/smpi/include/smpi_actor.hpp b/src/smpi/include/smpi_actor.hpp index dd1519d900..f4b1157d93 100644 --- a/src/smpi/include/smpi_actor.hpp +++ b/src/smpi/include/smpi_actor.hpp @@ -59,6 +59,7 @@ public: void mark_as_initialized(); void set_replaying(bool value); bool replaying() const; + std::string 
get_instance_id() const { return instance_id_;} void set_tracing_category(const std::string& category) { tracing_category_ = category; } const std::string& get_tracing_category() const { return tracing_category_; } smpi_trace_call_location_t* call_location(); diff --git a/src/smpi/internals/smpi_deployment.cpp b/src/smpi/internals/smpi_deployment.cpp index a9d4450b5f..f1656e5cae 100644 --- a/src/smpi/internals/smpi_deployment.cpp +++ b/src/smpi/internals/smpi_deployment.cpp @@ -7,6 +7,7 @@ #include "smpi_host.hpp" #include "private.hpp" #include "simgrid/s4u/Engine.hpp" +#include "simgrid/s4u/Barrier.hpp" #include "smpi_comm.hpp" #include @@ -25,8 +26,9 @@ public: auto* group = new simgrid::smpi::Group(size_); comm_world_ = new simgrid::smpi::Comm(group, nullptr, false, -1); universe_size += max_no_processes; + bar_ = new s4u::Barrier(size_); } - + s4u::Barrier* bar_; unsigned int size_; unsigned int finalized_ranks_ = 0; MPI_Comm comm_world_; @@ -63,6 +65,13 @@ void smpi_deployment_register_process(const std::string& instance_id, int rank, { const Instance& instance = smpi_instances.at(instance_id); instance.comm_world_->group()->set_mapping(actor->get_pid(), rank); + +} + +void smpi_deployment_startup_barrier(const std::string& instance_id) +{ + const Instance& instance = smpi_instances.at(instance_id); + instance.bar_->wait(); } void smpi_deployment_unregister_process(const std::string& instance_id) @@ -72,6 +81,7 @@ void smpi_deployment_unregister_process(const std::string& instance_id) if (instance.finalized_ranks_ == instance.size_) { simgrid::smpi::Comm::destroy(instance.comm_world_); + delete instance.bar_; smpi_instances.erase(instance_id); } } diff --git a/src/smpi/internals/smpi_global.cpp b/src/smpi/internals/smpi_global.cpp index 719aac55d4..491428a0fb 100644 --- a/src/smpi/internals/smpi_global.cpp +++ b/src/smpi/internals/smpi_global.cpp @@ -635,6 +635,7 @@ void smpi_mpi_init() { smpi_init_fortran_types(); if(_smpi_init_sleep > 0) 
simgrid::s4u::this_actor::sleep_for(_smpi_init_sleep); + smpi_deployment_startup_barrier(smpi_process()->get_instance_id()); } void SMPI_thread_create() { -- 2.20.1