1 /* Copyright (c) 2010-2015. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
14 #include <simgrid/s4u/host.hpp>
17 #include "smpi_mpi_dt_private.h"
18 #include "src/simix/smx_private.h"
19 #include "colls/colls.h"
21 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_comm, smpi, "Logging specific to SMPI (comm)");
/* Registry of communicator attribute keys. It is created lazily by Comm::attr_put() and smpi_comm_keyval_create();
 * entries are keyed by the raw bytes of the int keyval and hold an smpi_comm_key_elem. */
23 xbt_dict_t smpi_comm_keyvals = nullptr;
/* Value that smpi_comm_keyval_create() writes into *keyval when a new key is registered. */
24 int comm_keyval_id = 0;//avoid collisions
/* Placeholder communicator: Comm methods that find `this` equal to MPI_COMM_UNINITIALIZED forward the call to
 * smpi_process_comm_world(). */
27 simgrid::SMPI::Comm mpi_MPI_COMM_UNINITIALIZED;
28 MPI_Comm MPI_COMM_UNINITIALIZED=&mpi_MPI_COMM_UNINITIALIZED;
30 /* Support for cartesian topology was added, but there are 2 other types of topology: graph and dist graph. In order to
31 * support them, we have to add a field MPIR_Topo_type, and replace the MPI_Topology field by a union. */
/* qsort() comparator applied by Comm::split() to its rankmap array, whose elements are pairs of ints (the sort is
 * called with an element size of 2 * sizeof(int)). a and b point to the first int of the two elements compared. */
33 static int smpi_compare_rankmap(const void *a, const void *b)
35 const int* x = static_cast<const int*>(a);
36 const int* y = static_cast<const int*>(b);
/* Creates a communicator from a group and a topology. The fields set below start in their "nothing yet" state:
 * invalid topology type, no intra or leaders communicator, no non-uniform or leaders map. */
58 Comm::Comm(MPI_Group group, MPI_Topology topo)
62 m_topoType = MPI_INVALID_TOPO;
64 m_intra_comm = MPI_COMM_NULL;
65 m_leaders_comm = MPI_COMM_NULL;
67 m_non_uniform_map = nullptr;
68 m_leaders_map = nullptr;
75 if (this == MPI_COMM_UNINITIALIZED)
76 return smpi_process_comm_world()->destroy();
77 delete m_topo; // there's no use count on topos
/* Duplicates this communicator into *newcomm.
 *
 * A new Group is copy-constructed from this->group() and a new Comm is built from it and this->topo(). When this
 * communicator carries attributes, the copy gets a fresh attribute dictionary and every attribute whose key is
 * registered in smpi_comm_keyvals with a copy_fn other than MPI_NULL_COPY_FN has that callback invoked. If a
 * callback returns something other than MPI_SUCCESS, the new communicator is destroyed, *newcomm is reset to
 * MPI_COMM_NULL and the dictionary cursor is released.
 *
 * ret starts as MPI_SUCCESS and is overwritten by each copy_fn result; presumably it is the function's return
 * value (TODO confirm). */
81 int Comm::dup(MPI_Comm* newcomm){
82 if(smpi_privatize_global_variables){ //we need to switch as the called function may silently touch global variables
83 smpi_switch_data_segment(smpi_process_index());
// The copy owns its own Group object (cp) rather than sharing this communicator's group pointer.
85 MPI_Group cp = new simgrid::SMPI::Group(this->group());
86 (*newcomm) = new simgrid::SMPI::Comm(cp, this->topo());
87 int ret = MPI_SUCCESS;
89 if(m_attributes !=nullptr){
90 (*newcomm)->m_attributes = xbt_dict_new_homogeneous(nullptr);
91 xbt_dict_cursor_t cursor = nullptr;
96 xbt_dict_foreach (m_attributes, cursor, key, value_in) {
97 smpi_comm_key_elem elem =
98 static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null_ext(smpi_comm_keyvals, key, sizeof(int)));
99 if (elem != nullptr && elem->copy_fn != MPI_NULL_COPY_FN) {
// NOTE(review): attr_put() stores keys as the raw bytes of an int, while atoi() parses decimal text, so verify
// that the keyval handed to copy_fn is the intended one.
100 ret = elem->copy_fn(this, atoi(key), nullptr, value_in, &value_out, &flag);
101 if (ret != MPI_SUCCESS) {
// Roll back: the partially built duplicate is not handed to the caller.
102 (*newcomm)->destroy();
103 *newcomm = MPI_COMM_NULL;
104 xbt_dict_cursor_free(&cursor);
// Record the value produced by copy_fn under the same key in the duplicate's attribute dictionary.
108 xbt_dict_set_ext((*newcomm)->m_attributes, key, sizeof(int), value_out, nullptr);
/* Returns the MPI_Group of this communicator. When called on MPI_COMM_UNINITIALIZED, the request is forwarded to
 * the communicator returned by smpi_process_comm_world(). */
115 MPI_Group Comm::group()
117 if (this == MPI_COMM_UNINITIALIZED)
118 return smpi_process_comm_world()->group();
/* Returns an MPI_Topology for this communicator; dup() passes the result to the constructor of the copy. */
122 MPI_Topology Comm::topo() {
128 if (this == MPI_COMM_UNINITIALIZED)
129 return smpi_process_comm_world()->size();
130 return m_group->size();
135 if (this == MPI_COMM_UNINITIALIZED)
136 return smpi_process_comm_world()->rank();
137 return m_group->rank(smpi_process_index());
/* Writes a name for this communicator into name.
 *
 * - MPI_COMM_UNINITIALIZED: forwarded to smpi_process_comm_world().
 * - MPI_COMM_WORLD: the five characters "WORLD" are copied into name.
 * - otherwise: the object's address is formatted with "%p" (at most MPI_MAX_NAME_STRING bytes including the
 *   terminator) and *len receives snprintf's return value.
 *
 * name must point to a buffer large enough for the chosen branch. */
140 void Comm::get_name (char* name, int* len)
142 if (this == MPI_COMM_UNINITIALIZED)
143 return smpi_process_comm_world()->get_name(name, len);
144 if(this == MPI_COMM_WORLD) {
// NOTE(review): with a count of 5, strncpy copies "WORLD" without a terminating NUL. Confirm that callers do not
// rely on name being NUL-terminated in this branch.
145 strncpy(name, "WORLD",5);
148 *len = snprintf(name, MPI_MAX_NAME_STRING, "%p", this);
/* Stores leaders as this communicator's leaders communicator (m_leaders_comm). On MPI_COMM_UNINITIALIZED the call
 * is forwarded to smpi_process_comm_world(). */
152 void Comm::set_leaders_comm(MPI_Comm leaders){
153 if (this == MPI_COMM_UNINITIALIZED)
154 return smpi_process_comm_world()->set_leaders_comm(leaders);
155 m_leaders_comm=leaders;
/* Stores the given communicator in m_intra_comm. Despite its name, the parameter `leaders` carries the intra
 * communicator here. */
158 void Comm::set_intra_comm(MPI_Comm leaders){
159 m_intra_comm=leaders;
/* Returns m_non_uniform_map (nullptr until init_smp() stores one). On MPI_COMM_UNINITIALIZED the call is forwarded
 * to smpi_process_comm_world(). The returned pointer is the internal array, not a copy. */
162 int* Comm::get_non_uniform_map(){
163 if (this == MPI_COMM_UNINITIALIZED)
164 return smpi_process_comm_world()->get_non_uniform_map();
165 return m_non_uniform_map;
/* Returns m_leaders_map (nullptr until init_smp() stores one). On MPI_COMM_UNINITIALIZED the call is forwarded to
 * smpi_process_comm_world(). The returned pointer is the internal array, not a copy. */
168 int* Comm::get_leaders_map(){
169 if (this == MPI_COMM_UNINITIALIZED)
170 return smpi_process_comm_world()->get_leaders_map();
171 return m_leaders_map;
/* Returns the leaders communicator (m_leaders_comm), which is MPI_COMM_NULL until one has been set. On
 * MPI_COMM_UNINITIALIZED the call is forwarded to smpi_process_comm_world(). */
174 MPI_Comm Comm::get_leaders_comm(){
175 if (this == MPI_COMM_UNINITIALIZED)
176 return smpi_process_comm_world()->get_leaders_comm();
177 return m_leaders_comm;
/* Returns the intra communicator. For MPI_COMM_UNINITIALIZED and MPI_COMM_WORLD the value comes from
 * smpi_process_get_comm_intra() instead of the member, matching init_smp(), which stores the world's intra
 * communicator through smpi_process_set_comm_intra(). Any other communicator returns m_intra_comm. */
180 MPI_Comm Comm::get_intra_comm(){
181 if (this == MPI_COMM_UNINITIALIZED || this==MPI_COMM_WORLD)
182 return smpi_process_get_comm_intra();
183 else return m_intra_comm;
/* Uniformity query; init_smp() describes "uniform" as the same number of processes on every node. On
 * MPI_COMM_UNINITIALIZED the call is forwarded to smpi_process_comm_world(); otherwise it presumably returns
 * m_is_uniform (TODO confirm). */
186 int Comm::is_uniform(){
187 if (this == MPI_COMM_UNINITIALIZED)
188 return smpi_process_comm_world()->is_uniform();
/* Blocked-ranks query; init_smp() describes "blocked" as ranks allocated contiguously on the SMP nodes. On
 * MPI_COMM_UNINITIALIZED the call is forwarded to smpi_process_comm_world(); otherwise it presumably returns
 * m_is_blocked (TODO confirm). */
192 int Comm::is_blocked(){
193 if (this == MPI_COMM_UNINITIALIZED)
194 return smpi_process_comm_world()->is_blocked();
/* Partitions this communicator by color and returns the caller's new communicator, or MPI_COMM_NULL when the
 * caller ends up without a group.
 *
 * Two ints per rank are gathered on rank 0 (the colors and keys, per the comment below). For each distinct color
 * that is not MPI_UNDEFINED, a rankmap of (rank, key) pairs is collected, sorted with smpi_compare_rankmap, and
 * turned into a Group of `count` members. Each member other than rank 0 is sent a pointer to its own copy of that
 * group; on the receiving side, a rank whose color is not MPI_UNDEFINED gets that pointer from rank 0.
 *
 * On MPI_COMM_UNINITIALIZED the call is forwarded to smpi_process_comm_world().
 *
 * color: partition selector; MPI_UNDEFINED means the caller joins no new communicator.
 * key:   sent alongside color and stored as the second int of each rankmap pair. */
198 MPI_Comm Comm::split(int color, int key)
200 if (this == MPI_COMM_UNINITIALIZED)
201 return smpi_process_comm_world()->split(color, key);
// Tag shared by the isend/recv pair that ships the group pointers.
202 int system_tag = 123;
205 MPI_Group group_root = nullptr;
206 MPI_Group group_out = nullptr;
207 MPI_Group group = this->group();
208 int rank = this->rank();
209 int size = this->size();
210 /* Gather all colors and keys on rank 0 */
211 int* sendbuf = xbt_new(int, 2);
215 recvbuf = xbt_new(int, 2 * size);
219 smpi_mpi_gather(sendbuf, 2, MPI_INT, recvbuf, 2, MPI_INT, 0, this);
221 /* Do the actual job */
223 MPI_Group* group_snd = xbt_new(MPI_Group, size);
224 int* rankmap = xbt_new(int, 2 * size);
225 for (int i = 0; i < size; i++) {
// A color already overwritten with MPI_UNDEFINED was either undefined from the start or has already been placed
// in an earlier group, so it is skipped.
226 if (recvbuf[2 * i] != MPI_UNDEFINED) {
228 for (int j = i + 1; j < size; j++) {
229 if(recvbuf[2 * i] == recvbuf[2 * j]) {
230 recvbuf[2 * j] = MPI_UNDEFINED;
231 rankmap[2 * count] = j;
232 rankmap[2 * count + 1] = recvbuf[2 * j + 1];
236 /* Add self in the group */
237 recvbuf[2 * i] = MPI_UNDEFINED;
238 rankmap[2 * count] = i;
239 rankmap[2 * count + 1] = recvbuf[2 * i + 1];
// Each rankmap element is a pair of ints, hence the element size of 2 * sizeof(int).
241 qsort(rankmap, count, 2 * sizeof(int), &smpi_compare_rankmap);
242 group_out = new simgrid::SMPI::Group(count);
244 group_root = group_out; /* Save root's group */
// Position j of the sorted rankmap becomes rank j of the new group.
246 for (int j = 0; j < count; j++) {
247 int index = group->index(rankmap[2 * j]);
248 group_out->set_mapping(index, j);
250 MPI_Request* requests = xbt_new(MPI_Request, count);
252 for (int j = 0; j < count; j++) {
// Rank 0 needs no message for itself; every other member is sent a pointer to a fresh copy of the group.
253 if(rankmap[2 * j] != 0) {
254 group_snd[reqs]=new simgrid::SMPI::Group(group_out);
255 requests[reqs] = smpi_mpi_isend(&(group_snd[reqs]), 1, MPI_PTR, rankmap[2 * j], system_tag, this);
260 group_out->destroy();
262 smpi_mpi_waitall(reqs, requests, MPI_STATUS_IGNORE);
269 group_out = group_root; /* exit with root's group */
271 if(color != MPI_UNDEFINED) {
272 smpi_mpi_recv(&group_out, 1, MPI_PTR, 0, system_tag, this, MPI_STATUS_IGNORE);
273 } /* otherwise, exit with group_out == nullptr */
// The new communicator is created without a topology (nullptr).
275 return group_out!=nullptr ? new simgrid::SMPI::Comm(group_out, nullptr) : MPI_COMM_NULL;
279 if (this == MPI_COMM_UNINITIALIZED)
280 return smpi_process_comm_world()->use();
/* Releases this communicator's attributes. When m_attributes exists, every attribute whose key is found in
 * smpi_comm_keyvals with a non-null delete_fn has that callback invoked on its value, and the dictionary is then
 * freed. */
285 void Comm::cleanup_attributes(){
286 if(m_attributes !=nullptr){
287 xbt_dict_cursor_t cursor = nullptr;
291 xbt_dict_foreach (m_attributes, cursor, key, value) {
// NOTE(review): this lookup treats key as a C string, while the other smpi_comm_keyvals lookups in this file use
// xbt_dict_get_or_null_ext(..., sizeof(int)) on the raw bytes of the int keyval. Likewise atoi(key) parses decimal
// text. Verify that both match how keys are stored.
292 smpi_comm_key_elem elem = static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null(smpi_comm_keyvals, key));
293 if (elem != nullptr && elem->delete_fn != nullptr)
294 elem->delete_fn(this, atoi(key), value, &flag);
296 xbt_dict_free(&m_attributes);
/* Releases the SMP-related state of this communicator: calls unuse() on the intra and leaders communicators when
 * they are not MPI_COMM_NULL, and frees the non-uniform and leaders maps when they are not nullptr. */
300 void Comm::cleanup_smp(){
301 if (m_intra_comm != MPI_COMM_NULL)
302 m_intra_comm->unuse();
303 if (m_leaders_comm != MPI_COMM_NULL)
304 m_leaders_comm->unuse();
305 if (m_non_uniform_map != nullptr)
306 xbt_free(m_non_uniform_map);
307 if (m_leaders_map != nullptr)
308 xbt_free(m_leaders_map);
312 if (this == MPI_COMM_UNINITIALIZED)
313 return smpi_process_comm_world()->unuse();
319 this->cleanup_attributes();
/* qsort() comparator for arrays of int (used by Comm::init_smp() to sort the leader list).
 * Returns 1 when *a > *b, -1 when *a < *b and 0 when they are equal. */
324 static int compare_ints (const void *a, const void *b)
326 const int *da = static_cast<const int *>(a);
327 const int *db = static_cast<const int *>(b);
// (a > b) - (a < b) yields -1/0/1 without subtracting the operands, so it cannot overflow.
329 return static_cast<int>(*da > *db) - static_cast<int>(*da < *db);
/* Builds the SMP (per-host) bookkeeping of this communicator.
 *
 * Steps visible below:
 *  - replay mode is switched off for the duration of the call (and switched back on at the end);
 *  - the processes running on the caller's host that belong to this communicator are counted and placed in a new
 *    intra group/communicator; the smallest process index among them is tracked as the leader;
 *  - every rank's leader is collected with an allgather into leaders_map, which becomes m_leaders_map unless one is
 *    already set;
 *  - the distinct leaders are extracted, sorted and mapped into a leaders group/communicator;
 *  - the uniform flag (same number of processes per node) and the blocked flag (ranks contiguous on each node)
 *    are computed collectively and stored.
 *
 * This function calls collective operations (allgather, bcast, allreduce) on this communicator and on the
 * communicators it creates. On MPI_COMM_UNINITIALIZED the call is forwarded to smpi_process_comm_world(). */
332 void Comm::init_smp(){
335 if (this == MPI_COMM_UNINITIALIZED)
336 return smpi_process_comm_world()->init_smp();
338 int comm_size = this->size();
340 // If we are in replay - perform an ugly hack
341 // tell SimGrid we are not in replay for a while, because we need the buffers to be copied for the following calls
342 bool replaying = false; //cache data to set it back again after
343 if(smpi_process_get_replaying()){
345 smpi_process_set_replaying(false);
348 if(smpi_privatize_global_variables){ //we need to switch as the called function may silently touch global variables
349 smpi_switch_data_segment(smpi_process_index());
351 //identify neighbours in comm
352 //get the indexes of all processes sharing the same simix host
353 xbt_swag_t process_list = SIMIX_host_self()->processes();
354 int intra_comm_size = 0;
356 int min_index=INT_MAX;//the minimum index will be the leader
357 smx_actor_t process = nullptr;
358 xbt_swag_foreach(process, process_list) {
// The process index used for group lookups is pid - 1.
359 int index = process->pid -1;
361 if(this->group()->rank(index)!=MPI_UNDEFINED){
363 //the process is in the comm
364 if(index < min_index)
369 XBT_DEBUG("number of processes deployed on my node : %d", intra_comm_size);
370 MPI_Group group_intra = new simgrid::SMPI::Group(intra_comm_size);
// Second pass over the host's processes: fill the intra group with those that belong to this communicator.
373 xbt_swag_foreach(process, process_list) {
374 int index = process->pid -1;
375 if(this->group()->rank(index)!=MPI_UNDEFINED){
376 group_intra->set_mapping(index, i);
381 MPI_Comm comm_intra = new simgrid::SMPI::Comm(group_intra, nullptr);
// Both arrays are zero-initialised and sized for the worst case of one entry per rank.
384 int * leaders_map= static_cast<int*>(xbt_malloc0(sizeof(int)*comm_size));
385 int * leader_list= static_cast<int*>(xbt_malloc0(sizeof(int)*comm_size));
386 for(i=0; i<comm_size; i++){
// After the allgather, leaders_map[r] holds the leader value contributed by rank r.
390 smpi_coll_tuned_allgather_mpich(&leader, 1, MPI_INT , leaders_map, 1, MPI_INT, this);
392 if(smpi_privatize_global_variables){ //we need to switch as the called function may silently touch global variables
393 smpi_switch_data_segment(smpi_process_index());
// Keep an already stored leaders map; the freshly gathered one is adopted only when none exists yet.
396 if(m_leaders_map==nullptr){
397 m_leaders_map= leaders_map;
399 xbt_free(leaders_map);
// Collect the distinct leader values from m_leaders_map into leader_list.
402 int leader_group_size = 0;
403 for(i=0; i<comm_size; i++){
405 for(j=0;j<leader_group_size; j++){
406 if(m_leaders_map[i]==leader_list[j]){
411 leader_list[leader_group_size]=m_leaders_map[i];
415 qsort(leader_list, leader_group_size, sizeof(int),compare_ints);
417 MPI_Group leaders_group = new simgrid::SMPI::Group(leader_group_size);
419 MPI_Comm leader_comm = MPI_COMM_NULL;
// Communicators other than the world keep their leaders/intra communicators in their own members.
420 if(MPI_COMM_WORLD!=MPI_COMM_UNINITIALIZED && this!=MPI_COMM_WORLD){
421 //create leader_communicator
422 for (i=0; i< leader_group_size;i++)
423 leaders_group->set_mapping(leader_list[i], i);
424 leader_comm = new simgrid::SMPI::Comm(leaders_group, nullptr);
425 this->set_leaders_comm(leader_comm);
426 this->set_intra_comm(comm_intra);
428 //create intracommunicator
430 for (i=0; i< leader_group_size;i++)
431 leaders_group->set_mapping(leader_list[i], i);
// An existing leaders communicator is reused, and the group just built is then released through unuse().
433 if(this->get_leaders_comm()==MPI_COMM_NULL){
434 leader_comm = new simgrid::SMPI::Comm(leaders_group, nullptr);
435 this->set_leaders_comm(leader_comm);
437 leader_comm=this->get_leaders_comm();
438 leaders_group->unuse();
// In this branch the intra communicator is stored through smpi_process_set_comm_intra(), not in m_intra_comm.
440 smpi_process_set_comm_intra(comm_intra);
445 // Are the nodes uniform ? = same number of process/node
446 int my_local_size=comm_intra->size();
// Only rank 0 of each intra communicator takes part in the exchange of local sizes over leader_comm.
447 if(comm_intra->rank()==0) {
448 int* non_uniform_map = xbt_new0(int,leader_group_size);
449 smpi_coll_tuned_allgather_mpich(&my_local_size, 1, MPI_INT,
450 non_uniform_map, 1, MPI_INT, leader_comm);
451 for(i=0; i < leader_group_size; i++) {
452 if(non_uniform_map[0] != non_uniform_map[i]) {
457 if(is_uniform==0 && this->is_uniform()!=0){
458 m_non_uniform_map= non_uniform_map;
460 xbt_free(non_uniform_map);
462 m_is_uniform=is_uniform;
// Rank 0 of the intra communicator broadcasts the flag to the other ranks of that communicator.
464 smpi_coll_tuned_bcast_mpich(&(m_is_uniform),1, MPI_INT, 0, comm_intra );
466 if(smpi_privatize_global_variables){ //we need to switch as the called function may silently touch global variables
467 smpi_switch_data_segment(smpi_process_index());
469 // Are the ranks blocked ? = allocated contiguously on the SMP nodes
471 int prev=this->group()->rank(comm_intra->group()->index(0));
472 for (i=1; i<my_local_size; i++){
473 int that=this->group()->rank(comm_intra->group()->index(i));
// Logical AND across all ranks: the communicator counts as blocked only if every rank reports blocked.
482 smpi_mpi_allreduce(&is_blocked, &(global_blocked), 1, MPI_INT, MPI_LAND, this);
484 if(MPI_COMM_WORLD==MPI_COMM_UNINITIALIZED || this==MPI_COMM_WORLD){
486 m_is_blocked=global_blocked;
489 m_is_blocked=global_blocked;
491 xbt_free(leader_list);
494 smpi_process_set_replaying(true);
/* Deletes the attribute stored under keyval on this communicator.
 *
 * The key's registry entry is looked up in smpi_comm_keyvals. If its delete_fn is not MPI_NULL_DELETE_FN and
 * attr_get() succeeds for the key, delete_fn is invoked on the current value. The entry is then removed from
 * m_attributes. The handling of an unknown key, a failing delete_fn and a missing m_attributes (the nullptr test
 * below) is only partly visible here — TODO confirm the returned error codes. */
497 int Comm::attr_delete(int keyval){
498 smpi_comm_key_elem elem =
499 static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(&keyval), sizeof(int)));
502 if(elem->delete_fn!=MPI_NULL_DELETE_FN){
503 void* value = nullptr;
505 if(this->attr_get(keyval, &value, &flag)==MPI_SUCCESS){
506 int ret = elem->delete_fn(this, keyval, value, &flag);
511 if(m_attributes==nullptr)
// Attributes are keyed by the raw bytes of the int keyval.
514 xbt_dict_remove_ext(m_attributes, reinterpret_cast<const char*>(&keyval), sizeof(int));
/* Retrieves the value stored under keyval on this communicator.
 *
 * attr_value is treated as a void** and receives the value found in m_attributes. The registry entry for keyval is
 * looked up first, and m_attributes is tested for nullptr. How those cases are reported and how *flag is set is
 * not visible here; presumably *flag != 0 means "found", since attr_put() tests it that way (TODO confirm). */
518 int Comm::attr_get(int keyval, void* attr_value, int* flag){
519 smpi_comm_key_elem elem =
520 static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(&keyval), sizeof(int)));
523 if(m_attributes==nullptr){
// The caller's void* actually points to a void*, and the stored value is written through it.
528 *static_cast<void**>(attr_value) =
529 xbt_dict_get_ext(m_attributes, reinterpret_cast<const char*>(&keyval), sizeof(int));
/* Stores attr_value under keyval on this communicator.
 *
 * The key registry is created on demand and the key's entry looked up. If a value is already present (flag != 0
 * after attr_get()) and the key's delete_fn is not MPI_NULL_DELETE_FN, delete_fn is invoked on the old value
 * first. m_attributes is created on demand and the new value is stored under the raw bytes of the int keyval. */
538 int Comm::attr_put(int keyval, void* attr_value){
539 if(smpi_comm_keyvals==nullptr)
540 smpi_comm_keyvals = xbt_dict_new_homogeneous(nullptr);
541 smpi_comm_key_elem elem =
542 static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(&keyval), sizeof(int)));
546 void* value = nullptr;
547 this->attr_get(keyval, &value, &flag);
// The previous value gets its delete callback before it is overwritten.
548 if(flag!=0 && elem->delete_fn!=MPI_NULL_DELETE_FN){
549 int ret = elem->delete_fn(this, keyval, value, &flag);
553 if(m_attributes==nullptr)
554 m_attributes = xbt_dict_new_homogeneous(nullptr);
556 xbt_dict_set_ext(m_attributes, reinterpret_cast<const char*>(&keyval), sizeof(int), attr_value, nullptr);
/* Registers a new communicator attribute key.
 *
 * The key registry is created on demand. A zero-initialised key element is allocated, the two callbacks are
 * stored in it, the current comm_keyval_id is written to *keyval, and the element is inserted in
 * smpi_comm_keyvals under the raw bytes of *keyval.
 *
 * copy_fn:   callback stored for the key; Comm::dup() invokes it when it is not MPI_NULL_COPY_FN.
 * delete_fn: callback stored for the key; invoked when an attribute under this key is replaced or deleted.
 * keyval:    output, receives the identifier of the new key. */
563 int smpi_comm_keyval_create(MPI_Comm_copy_attr_function* copy_fn, MPI_Comm_delete_attr_function* delete_fn, int* keyval,
565 if(smpi_comm_keyvals==nullptr)
566 smpi_comm_keyvals = xbt_dict_new_homogeneous(nullptr);
568 smpi_comm_key_elem value = static_cast<smpi_comm_key_elem>(xbt_new0(s_smpi_mpi_comm_key_elem_t,1));
570 value->copy_fn=copy_fn;
571 value->delete_fn=delete_fn;
573 *keyval = comm_keyval_id;
574 xbt_dict_set_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(keyval), sizeof(int),static_cast<void*>(value), nullptr);
579 int smpi_comm_keyval_free(int* keyval){
580 smpi_comm_key_elem elem =
581 static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(keyval), sizeof(int)));
584 xbt_dict_remove_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(keyval), sizeof(int));