From 08027d3c5a9bf1d0bef122b2a9a1f0b037fde9b6 Mon Sep 17 00:00:00 2001 From: Augustin Degomme Date: Wed, 17 Jul 2019 22:53:20 +0200 Subject: [PATCH] Have the communicators created together share a unique ID. This allows to avoid matching communications meant for another communicator. This was potentially causing very nasty bugs and was totally overlooked in SMPI. --- src/smpi/include/smpi_comm.hpp | 4 +++- src/smpi/internals/smpi_deployment.cpp | 2 +- src/smpi/mpi/smpi_comm.cpp | 20 +++++++++++++++++++- src/smpi/mpi/smpi_request.cpp | 10 ++++++---- 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/smpi/include/smpi_comm.hpp b/src/smpi/include/smpi_comm.hpp index a332a6a1b3..ae30f6583a 100644 --- a/src/smpi/include/smpi_comm.hpp +++ b/src/smpi/include/smpi_comm.hpp @@ -31,19 +31,21 @@ class Comm : public F2C, public Keyval{ std::list rma_wins_; // attached windows for synchronization. std::string name_; MPI_Info info_; + int id_; public: static std::unordered_map keyvals_; static int keyval_id_; Comm() = default; - Comm(MPI_Group group, MPI_Topology topo, int smp = 0); + Comm(MPI_Group group, MPI_Topology topo, int smp = 0, int id=MPI_UNDEFINED); int dup(MPI_Comm* newcomm); int dup_with_info(MPI_Info info, MPI_Comm* newcomm); MPI_Group group(); MPI_Topology topo() { return topo_; } int size(); int rank(); + int id(); void get_name(char* name, int* len); void set_name(const char* name); MPI_Info info(); diff --git a/src/smpi/internals/smpi_deployment.cpp b/src/smpi/internals/smpi_deployment.cpp index c73ed55c4f..379c77b170 100644 --- a/src/smpi/internals/smpi_deployment.cpp +++ b/src/smpi/internals/smpi_deployment.cpp @@ -66,7 +66,7 @@ void SMPI_app_instance_register(const char *name, xbt_main_func_t code, int num_ Instance instance(std::string(name), num_processes, MPI_COMM_NULL, new simgrid::s4u::Barrier(num_processes)); MPI_Group group = new simgrid::smpi::Group(instance.size); - instance.comm_world = new simgrid::smpi::Comm(group, nullptr); + instance.comm_world = new simgrid::smpi::Comm(group, nullptr, 0, -1); // FIXME : using MPI_Attr_put with MPI_UNIVERSE_SIZE is forbidden and we make it a no-op (which triggers a warning as MPI_ERR_ARG is returned). // Directly calling Comm::attr_put breaks for now, as MPI_UNIVERSE_SIZE,is <0 // instance.comm_world->attr_put(MPI_UNIVERSE_SIZE, reinterpret_cast(instance.size)); diff --git a/src/smpi/mpi/smpi_comm.cpp b/src/smpi/mpi/smpi_comm.cpp index 7f51e7e86c..380c314145 100644 --- a/src/smpi/mpi/smpi_comm.cpp +++ b/src/smpi/mpi/smpi_comm.cpp @@ -28,7 +28,7 @@ namespace smpi{ std::unordered_map Comm::keyvals_; int Comm::keyval_id_=0; -Comm::Comm(MPI_Group group, MPI_Topology topo, int smp) : group_(group), topo_(topo),is_smp_comm_(smp) +Comm::Comm(MPI_Group group, MPI_Topology topo, int smp, int id) : group_(group), topo_(topo),is_smp_comm_(smp), id_(id) { refcount_ = 1; topoType_ = MPI_INVALID_TOPO; @@ -39,6 +39,19 @@ Comm::Comm(MPI_Group group, MPI_Topology topo, int smp) : group_(group), topo_(t leaders_map_ = nullptr; is_blocked_ = 0; info_ = MPI_INFO_NULL; + static int global_id_=0; + //First creation of comm is done before SIMIX_run, so only do comms for others + if(id==MPI_UNDEFINED && smp==0 && this->rank()!=MPI_UNDEFINED ){ + int id; + if(this->rank()==0){ + id=global_id_; + global_id_++; + } + Colls::bcast(&id, 1, MPI_INT, 0, this); + XBT_DEBUG("Communicator %p has id %d", this, id); + id_=id;//only set here, as we don't want to change it in the middle of the bcast + Colls::barrier(this); + } } void Comm::destroy(Comm* comm) @@ -131,6 +144,11 @@ int Comm::rank() return group_->rank(s4u::Actor::self()); } +int Comm::id() +{ + return id_; +} + void Comm::get_name (char* name, int* len) { if (this == MPI_COMM_UNINITIALIZED){ diff --git a/src/smpi/mpi/smpi_request.cpp b/src/smpi/mpi/smpi_request.cpp index 692567af8e..e0113e842e 100644 --- a/src/smpi/mpi/smpi_request.cpp +++ b/src/smpi/mpi/smpi_request.cpp @@ -115,11 +115,12 @@ int Request::match_recv(void* a, void* b, simgrid::kernel::activity::CommImpl*) { MPI_Request ref = static_cast(a); MPI_Request req = static_cast(b); - XBT_DEBUG("Trying to match a recv of src %d against %d, tag %d against %d",ref->src_,req->src_, ref->tag_, req->tag_); + XBT_DEBUG("Trying to match a recv of src %d against %d, tag %d against %d, id %d against %d",ref->src_,req->src_, ref->tag_, req->tag_,ref->comm_->id(),req->comm_->id()); xbt_assert(ref, "Cannot match recv against null reference"); xbt_assert(req, "Cannot match recv against null request"); - if(((ref->src_ == MPI_ANY_SOURCE && (ref->comm_->group()->rank(req->src_) != MPI_UNDEFINED)) || req->src_ == ref->src_) + if((ref->comm_->id()==MPI_UNDEFINED || req->comm_->id() == MPI_UNDEFINED || (ref->comm_->id()==req->comm_->id())) + && ((ref->src_ == MPI_ANY_SOURCE && (ref->comm_->group()->rank(req->src_) != MPI_UNDEFINED)) || req->src_ == ref->src_) && ((ref->tag_ == MPI_ANY_TAG && req->tag_ >=0) || req->tag_ == ref->tag_)){ //we match, we can transfer some values if(ref->src_ == MPI_ANY_SOURCE) @@ -141,11 +142,12 @@ int Request::match_send(void* a, void* b, simgrid::kernel::activity::CommImpl*) { MPI_Request ref = static_cast(a); MPI_Request req = static_cast(b); - XBT_DEBUG("Trying to match a send of src %d against %d, tag %d against %d",ref->src_,req->src_, ref->tag_, req->tag_); + XBT_DEBUG("Trying to match a send of src %d against %d, tag %d against %d, id %d against %d",ref->src_,req->src_, ref->tag_, req->tag_,ref->comm_->id(),req->comm_->id()); xbt_assert(ref, "Cannot match send against null reference"); xbt_assert(req, "Cannot match send against null request"); - if(((req->src_ == MPI_ANY_SOURCE && (req->comm_->group()->rank(ref->src_) != MPI_UNDEFINED)) || req->src_ == ref->src_) + if((ref->comm_->id()==MPI_UNDEFINED || req->comm_->id() == MPI_UNDEFINED || (ref->comm_->id()==req->comm_->id())) + && ((req->src_ == MPI_ANY_SOURCE && (req->comm_->group()->rank(ref->src_) != MPI_UNDEFINED)) || req->src_ == ref->src_) && ((req->tag_ == MPI_ANY_TAG && ref->tag_ >=0)|| req->tag_ == ref->tag_)){ if(req->src_ == MPI_ANY_SOURCE) req->real_src_ = ref->src_; -- 2.20.1