1 /* Copyright (c) 2010-2015. The SimGrid Team.
2 * All rights reserved. */
4 /* This program is free software; you can redistribute it and/or modify it
5 * under the terms of the license (GNU LGPL) which comes with this package. */
14 #include <simgrid/s4u/host.hpp>
17 #include "src/simix/smx_private.h"
18 #include "colls/colls.h"
20 XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_comm, smpi, "Logging specific to SMPI (comm)");
// Dictionary of registered communicator key elements. It starts out null and is
// created on demand by Comm::attr_put() and smpi_comm_keyval_create().
22 xbt_dict_t smpi_comm_keyvals = nullptr;
// Value handed out as the keyval in smpi_comm_keyval_create().
23 int comm_keyval_id = 0;//avoid collisions
// Placeholder communicator object. Comm methods compare `this` against
// MPI_COMM_UNINITIALIZED and, on a match, forward the call to
// smpi_process_comm_world().
26 Comm mpi_MPI_COMM_UNINITIALIZED;
27 MPI_Comm MPI_COMM_UNINITIALIZED=&mpi_MPI_COMM_UNINITIALIZED;
29 /* Support for cartesian topology was added, but there are 2 other types of topology: graph and dist graph. In order to
30 * support them, we have to add a field MPIR_Topo_type, and replace the MPI_Topology field by a union. */
// qsort() comparator; Comm::split() passes it when sorting `rankmap`, whose
// elements are 2 * sizeof(int) bytes wide.
// Both opaque arguments are reinterpreted as pointers to int. The comparison
// itself is not visible in this excerpt.
32 static int smpi_compare_rankmap(const void *a, const void *b)
34 const int* x = static_cast<const int*>(a);
35 const int* y = static_cast<const int*>(b);
// Builds a communicator from `group` and `topo`; both are stored as given through
// the member-initializer list. The visible body resets the topology type and the
// SMP-related members to their "unset" values.
57 Comm::Comm(MPI_Group group, MPI_Topology topo) : group_(group), topo_(topo)
60 topoType_ = MPI_INVALID_TOPO;
61 intra_comm_ = MPI_COMM_NULL;
62 leaders_comm_ = MPI_COMM_NULL;
// These two maps are assigned in init_smp() and released in cleanup_smp().
64 non_uniform_map_ = nullptr;
65 leaders_map_ = nullptr;
// NOTE(review): the function header is not visible in this excerpt. Given the
// forwarded ->destroy() call below, this is presumably Comm::destroy() - confirm.
// When invoked on the MPI_COMM_UNINITIALIZED placeholder, the call is forwarded to
// the calling process's world communicator.
72 if (this == MPI_COMM_UNINITIALIZED){
73 smpi_process_comm_world()->destroy();
// The topology object is deleted directly here.
76 delete topo_; // there's no use count on topos
// Duplicates this communicator into *newcomm.
//   newcomm: out-parameter; receives a newly allocated Comm, or MPI_COMM_NULL when
//            an attribute copy callback reports a failure.
// `ret` starts as MPI_SUCCESS and is overwritten by each copy callback's result;
// the return statements themselves are not visible in this excerpt.
80 int Comm::dup(MPI_Comm* newcomm){
81 if(smpi_privatize_global_variables){ //we need to switch as the called function may silently touch global variables
82 smpi_switch_data_segment(smpi_process_index());
// The group is copied into a new Group object, while the topology pointer is passed
// to the new communicator as-is.
// NOTE(review): both communicators then refer to the same topology object, and the
// destroy path in this file does `delete topo_` - confirm this cannot double-free.
84 MPI_Group cp = new Group(this->group());
85 (*newcomm) = new Comm(cp, this->topo());
86 int ret = MPI_SUCCESS;
// Attributes are only processed when this communicator has an attribute dictionary.
88 if(attributes_ !=nullptr){
89 (*newcomm)->attributes_ = xbt_dict_new_homogeneous(nullptr);
90 xbt_dict_cursor_t cursor = nullptr;
95 xbt_dict_foreach (attributes_, cursor, key, value_in) {
// Look up the registered key element (copy/delete callbacks) for this attribute key.
96 smpi_comm_key_elem elem =
97 static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null_ext(smpi_comm_keyvals, key, sizeof(int)));
// Only keys that are registered and have a real copy callback are handled here.
98 if (elem != nullptr && elem->copy_fn != MPI_NULL_COPY_FN) {
// NOTE(review): attr_put() stores keys as the raw bytes of an int, so atoi(key)
// presumably does not recover the original keyval - confirm.
99 ret = elem->copy_fn(this, atoi(key), nullptr, value_in, &value_out, &flag);
// On callback failure: drop the half-built communicator and release the cursor.
100 if (ret != MPI_SUCCESS) {
101 (*newcomm)->destroy();
102 *newcomm = MPI_COMM_NULL;
103 xbt_dict_cursor_free(&cursor);
// Store the callback's output value under the same key in the new communicator.
// (Any condition guarding this store is not visible in this excerpt.)
107 xbt_dict_set_ext((*newcomm)->attributes_, key, sizeof(int), value_out, nullptr);
// Returns the group of this communicator. For the MPI_COMM_UNINITIALIZED
// placeholder, the world communicator's group is returned instead.
// (The non-placeholder return statement is not visible in this excerpt.)
114 MPI_Group Comm::group()
116 if (this == MPI_COMM_UNINITIALIZED)
117 return smpi_process_comm_world()->group();
// Topology accessor; the body is not visible in this excerpt.
121 MPI_Topology Comm::topo() {
// NOTE(review): the function header is not visible in this excerpt. Given the
// forwarded ->size() call, this is presumably Comm::size() - confirm.
// Placeholder communicator: forward to the world communicator.
127 if (this == MPI_COMM_UNINITIALIZED)
128 return smpi_process_comm_world()->size();
// Otherwise the size is that of the underlying group.
129 return group_->size();
// NOTE(review): the function header is not visible in this excerpt. Given the
// forwarded ->rank() call, this is presumably Comm::rank() - confirm.
// Placeholder communicator: forward to the world communicator.
134 if (this == MPI_COMM_UNINITIALIZED)
135 return smpi_process_comm_world()->rank();
// Otherwise ask the group for the rank associated with the calling process's index.
136 return group_->rank(smpi_process_index());
// Writes a name for this communicator into `name` and its length into `*len`.
//   name: caller-provided output buffer.
//   len:  out-parameter for the length.
139 void Comm::get_name (char* name, int* len)
// Placeholder communicator: let the world communicator fill in the name.
141 if (this == MPI_COMM_UNINITIALIZED){
142 smpi_process_comm_world()->get_name(name, len);
145 if(this == MPI_COMM_WORLD) {
// strncpy with a count of 5 copies the five letters of "WORLD" and writes no
// terminating NUL.
// NOTE(review): unless `name` is terminated elsewhere (not visible here), reading it
// as a C string may run past the copied characters - confirm.
146 strncpy(name, "WORLD",5);
// Formats this communicator's address with %p (bounded by MPI_MAX_NAME_STRING) and
// stores snprintf's return value in *len. The branch structure around this line is
// not visible in this excerpt.
149 *len = snprintf(name, MPI_MAX_NAME_STRING, "%p", this);
// Records `leaders` as this communicator's leaders communicator.
// On the MPI_COMM_UNINITIALIZED placeholder, the call is forwarded to the world
// communicator.
153 void Comm::set_leaders_comm(MPI_Comm leaders){
154 if (this == MPI_COMM_UNINITIALIZED){
155 smpi_process_comm_world()->set_leaders_comm(leaders);
158 leaders_comm_=leaders;
// Setter for the intra communicator; the body is not visible in this excerpt.
// NOTE(review): the parameter is named `leaders` although this is the intra-comm
// setter - presumably a copy/paste leftover; confirm.
161 void Comm::set_intra_comm(MPI_Comm leaders){
// Returns non_uniform_map_ (may be nullptr; it is only assigned in init_smp()).
// The MPI_COMM_UNINITIALIZED placeholder forwards to the world communicator.
165 int* Comm::get_non_uniform_map(){
166 if (this == MPI_COMM_UNINITIALIZED)
167 return smpi_process_comm_world()->get_non_uniform_map();
168 return non_uniform_map_;
// Accessor for the leaders map. The MPI_COMM_UNINITIALIZED placeholder forwards to
// the world communicator; the non-placeholder return is not visible in this excerpt.
171 int* Comm::get_leaders_map(){
172 if (this == MPI_COMM_UNINITIALIZED)
173 return smpi_process_comm_world()->get_leaders_map();
// Returns leaders_comm_ (MPI_COMM_NULL until set_leaders_comm() has been called).
// The MPI_COMM_UNINITIALIZED placeholder forwards to the world communicator.
177 MPI_Comm Comm::get_leaders_comm(){
178 if (this == MPI_COMM_UNINITIALIZED)
179 return smpi_process_comm_world()->get_leaders_comm();
180 return leaders_comm_;
// Returns the intra communicator.
// For the placeholder and for MPI_COMM_WORLD the value comes from
// smpi_process_get_comm_intra() (init_smp() stores it through
// smpi_process_set_comm_intra()); any other communicator returns its own
// intra_comm_ member.
183 MPI_Comm Comm::get_intra_comm(){
184 if (this == MPI_COMM_UNINITIALIZED || this==MPI_COMM_WORLD)
185 return smpi_process_get_comm_intra();
186 else return intra_comm_;
// Accessor for the "uniform" flag. The MPI_COMM_UNINITIALIZED placeholder forwards
// to the world communicator; the non-placeholder return is not visible in this
// excerpt.
189 int Comm::is_uniform(){
190 if (this == MPI_COMM_UNINITIALIZED)
191 return smpi_process_comm_world()->is_uniform();
// Accessor for the "blocked" flag. The MPI_COMM_UNINITIALIZED placeholder forwards
// to the world communicator; the non-placeholder return is not visible in this
// excerpt.
195 int Comm::is_blocked(){
196 if (this == MPI_COMM_UNINITIALIZED)
197 return smpi_process_comm_world()->is_blocked();
// Splits this communicator by color.
//   color: partition identifier; callers passing MPI_UNDEFINED receive no group
//          (see the final recv/return below).
//   key:   ordering hint; not referenced in the visible lines - presumably packed
//          into sendbuf next to color, TODO confirm.
// Returns a new Comm built on the resulting group, or MPI_COMM_NULL when no group
// was obtained.
// Several declarations and branch lines are missing from this excerpt (recvbuf,
// count, reqs, the rank tests), so the comments below describe the visible
// statements only.
201 MPI_Comm Comm::split(int color, int key)
// Placeholder communicator: forward to the world communicator.
203 if (this == MPI_COMM_UNINITIALIZED)
204 return smpi_process_comm_world()->split(color, key);
// Tag used for the point-to-point messages that distribute the groups below.
205 int system_tag = 123;
208 MPI_Group group_root = nullptr;
209 MPI_Group group_out = nullptr;
210 MPI_Group group = this->group();
211 int rank = this->rank();
212 int size = this->size();
213 /* Gather all colors and keys on rank 0 */
// Two ints are contributed per rank; the receive buffer holds 2 * size ints.
214 int* sendbuf = xbt_new(int, 2);
218 recvbuf = xbt_new(int, 2 * size);
222 smpi_mpi_gather(sendbuf, 2, MPI_INT, recvbuf, 2, MPI_INT, 0, this);
224 /* Do the actual job */
// group_snd keeps the per-destination Group copies; rankmap holds int pairs.
226 MPI_Group* group_snd = xbt_new(MPI_Group, size);
227 int* rankmap = xbt_new(int, 2 * size);
// For every rank i whose first gathered value is still defined, collect all later
// ranks j that carry the same first value.
228 for (int i = 0; i < size; i++) {
229 if (recvbuf[2 * i] != MPI_UNDEFINED) {
231 for (int j = i + 1; j < size; j++) {
232 if(recvbuf[2 * i] == recvbuf[2 * j]) {
// Mark j as consumed and record the pair (rank j, second gathered value of j).
233 recvbuf[2 * j] = MPI_UNDEFINED;
234 rankmap[2 * count] = j;
235 rankmap[2 * count + 1] = recvbuf[2 * j + 1];
239 /* Add self in the group */
240 recvbuf[2 * i] = MPI_UNDEFINED;
241 rankmap[2 * count] = i;
242 rankmap[2 * count + 1] = recvbuf[2 * i + 1];
// Sort the collected pairs (each element is two ints wide) and build the group.
244 qsort(rankmap, count, 2 * sizeof(int), &smpi_compare_rankmap);
245 group_out = new Group(count);
247 group_root = group_out; /* Save root's group */
// Position j of the new group maps to the index that the old group associates
// with the j-th sorted rank.
249 for (int j = 0; j < count; j++) {
250 int index = group->index(rankmap[2 * j]);
251 group_out->set_mapping(index, j);
253 MPI_Request* requests = xbt_new(MPI_Request, count);
// Every member other than rank 0 is sent its own copy of the group; what travels is
// the Group pointer itself (datatype MPI_PTR).
255 for (int j = 0; j < count; j++) {
256 if(rankmap[2 * j] != 0) {
257 group_snd[reqs]=new Group(group_out);
258 requests[reqs] = Request::isend(&(group_snd[reqs]), 1, MPI_PTR, rankmap[2 * j], system_tag, this);
263 group_out->destroy();
265 Request::waitall(reqs, requests, MPI_STATUS_IGNORE);
272 group_out = group_root; /* exit with root's group */
// Receives the group pointer sent by rank 0 when a real color was given.
// (The enclosing branch that selects this path is not visible in this excerpt.)
274 if(color != MPI_UNDEFINED) {
275 Request::recv(&group_out, 1, MPI_PTR, 0, system_tag, this, MPI_STATUS_IGNORE);
276 } /* otherwise, exit with group_out == nullptr */
// The new communicator is created without a topology (nullptr).
278 return group_out!=nullptr ? new Comm(group_out, nullptr) : MPI_COMM_NULL;
// NOTE(review): the function header is not visible in this excerpt. Given the
// forwarded ->use() call, this is presumably Comm::use() - confirm.
// Placeholder communicator: forward to the world communicator. What happens for a
// regular communicator is not visible here.
282 if (this == MPI_COMM_UNINITIALIZED){
283 smpi_process_comm_world()->use();
// Runs the registered delete callback for every attribute attached to this
// communicator, then frees the attribute dictionary. Does nothing when no attribute
// dictionary exists.
290 void Comm::cleanup_attributes(){
291 if(attributes_ !=nullptr){
292 xbt_dict_cursor_t cursor = nullptr;
296 xbt_dict_foreach (attributes_, cursor, key, value) {
// NOTE(review): this lookup uses xbt_dict_get_or_null (no explicit key length),
// whereas the other lookups in this file use the _ext variant with sizeof(int) on
// raw int bytes - confirm this finds the same entries.
297 smpi_comm_key_elem elem = static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null(smpi_comm_keyvals, key));
// NOTE(review): the callback is compared against nullptr here but against
// MPI_NULL_DELETE_FN in attr_delete()/attr_put(); atoi(key) is applied to a key that
// attr_put() stores as raw int bytes - confirm both are intended.
298 if (elem != nullptr && elem->delete_fn != nullptr)
299 elem->delete_fn(this, atoi(key), value, &flag);
301 xbt_dict_free(&attributes_);
// Releases the SMP-related state of this communicator: calls unuse() on the intra
// and leaders communicators when they are set, and frees the two maps when they
// are non-null.
305 void Comm::cleanup_smp(){
306 if (intra_comm_ != MPI_COMM_NULL)
307 intra_comm_->unuse();
308 if (leaders_comm_ != MPI_COMM_NULL)
309 leaders_comm_->unuse();
310 if (non_uniform_map_ != nullptr)
311 xbt_free(non_uniform_map_);
312 if (leaders_map_ != nullptr)
313 xbt_free(leaders_map_);
// NOTE(review): the function header is not visible in this excerpt. Given the
// forwarded ->unuse() call, this is presumably Comm::unuse() - confirm.
// Placeholder communicator: forward to the world communicator.
317 if (this == MPI_COMM_UNINITIALIZED){
318 smpi_process_comm_world()->unuse();
// Runs the attribute delete callbacks and frees the attribute dictionary. The
// condition under which this line is reached is not visible in this excerpt.
326 this->cleanup_attributes();
// qsort() comparator for plain ints (used on leader_list in Comm::init_smp()).
// Returns 1 when *a > *b, -1 when *a < *b and 0 when they are equal.
331 static int compare_ints (const void *a, const void *b)
333 const int *da = static_cast<const int *>(a);
334 const int *db = static_cast<const int *>(b);
// Difference of two 0/1 comparisons; avoids the overflow a plain subtraction of the
// two values could cause.
336 return static_cast<int>(*da > *db) - static_cast<int>(*da < *db);
// Sets up the SMP-related state of this communicator: an intra communicator for the
// processes sharing the caller's host, a leaders communicator, the leaders map and
// the is_uniform_ / is_blocked_ flags.
// Many declarations and single statements (i, j, leader, is_uniform, is_blocked,
// global_blocked, loop bodies) are missing from this excerpt; the comments below
// describe the visible statements only.
339 void Comm::init_smp(){
// NOTE(review): unlike the other placeholder checks in this file, no `return` is
// visible after this delegation. If none exists, the rest of the function also runs
// on the placeholder object - confirm.
342 if (this == MPI_COMM_UNINITIALIZED)
343 smpi_process_comm_world()->init_smp();
345 int comm_size = this->size();
347 // If we are in replay - perform an ugly hack
348 // tell SimGrid we are not in replay for a while, because we need the buffers to be copied for the following calls
349 bool replaying = false; //cache data to set it back again after
350 if(smpi_process_get_replaying()){
352 smpi_process_set_replaying(false);
355 if(smpi_privatize_global_variables){ //we need to switch as the called function may silently touch global variables
356 smpi_switch_data_segment(smpi_process_index());
358 //identify neighbours in comm
359 //get the indexes of all processes sharing the same simix host
360 xbt_swag_t process_list = SIMIX_host_self()->processes();
361 int intra_comm_size = 0;
363 int min_index=INT_MAX;//the minimum index will be the leader
364 smx_actor_t process = nullptr;
// First pass over the host's processes: a process index is taken to be pid - 1;
// only processes that have a rank in this communicator's group are considered.
365 xbt_swag_foreach(process, process_list) {
366 int index = process->pid -1;
368 if(this->group()->rank(index)!=MPI_UNDEFINED){
370 //the process is in the comm
371 if(index < min_index)
376 XBT_DEBUG("number of processes deployed on my node : %d", intra_comm_size);
377 MPI_Group group_intra = new Group(intra_comm_size);
// Second pass: register each local member of this communicator in the intra group.
380 xbt_swag_foreach(process, process_list) {
381 int index = process->pid -1;
382 if(this->group()->rank(index)!=MPI_UNDEFINED){
383 group_intra->set_mapping(index, i);
388 MPI_Comm comm_intra = new Comm(group_intra, nullptr);
// Both arrays are zero-initialised and sized for the whole communicator.
391 int * leaders_map= static_cast<int*>(xbt_malloc0(sizeof(int)*comm_size));
392 int * leader_list= static_cast<int*>(xbt_malloc0(sizeof(int)*comm_size));
393 for(i=0; i<comm_size; i++){
// Every rank contributes one int (`leader`); afterwards leaders_map holds one entry
// per rank of this communicator.
397 smpi_coll_tuned_allgather_mpich(&leader, 1, MPI_INT , leaders_map, 1, MPI_INT, this);
399 if(smpi_privatize_global_variables){ //we need to switch as the called function may silently touch global variables
400 smpi_switch_data_segment(smpi_process_index());
// Keep the freshly gathered map only if none was stored yet; otherwise discard it.
403 if(leaders_map_==nullptr){
404 leaders_map_= leaders_map;
406 xbt_free(leaders_map);
// Build leader_list from leaders_map_; the inner loop compares each entry against
// the leaders collected so far.
409 int leader_group_size = 0;
410 for(i=0; i<comm_size; i++){
412 for(j=0;j<leader_group_size; j++){
413 if(leaders_map_[i]==leader_list[j]){
418 leader_list[leader_group_size]=leaders_map_[i];
422 qsort(leader_list, leader_group_size, sizeof(int),compare_ints);
424 MPI_Group leaders_group = new Group(leader_group_size);
426 MPI_Comm leader_comm = MPI_COMM_NULL;
// Case 1: MPI_COMM_WORLD is already initialised and this is another communicator.
// The leaders and intra communicators are stored on this object.
427 if(MPI_COMM_WORLD!=MPI_COMM_UNINITIALIZED && this!=MPI_COMM_WORLD){
428 //create leader_communicator
429 for (i=0; i< leader_group_size;i++)
430 leaders_group->set_mapping(leader_list[i], i);
431 leader_comm = new Comm(leaders_group, nullptr);
432 this->set_leaders_comm(leader_comm);
433 this->set_intra_comm(comm_intra);
435 //create intracommunicator
// Case 2 (the other branch; its `else` line is not visible in this excerpt): the
// intra communicator is stored per process via smpi_process_set_comm_intra().
437 for (i=0; i< leader_group_size;i++)
438 leaders_group->set_mapping(leader_list[i], i);
// Reuse an existing leaders communicator when there is one; in that case the newly
// built leaders group is released again.
440 if(this->get_leaders_comm()==MPI_COMM_NULL){
441 leader_comm = new Comm(leaders_group, nullptr);
442 this->set_leaders_comm(leader_comm);
444 leader_comm=this->get_leaders_comm();
445 leaders_group->unuse();
447 smpi_process_set_comm_intra(comm_intra);
452 // Are the nodes uniform ? = same number of process/node
453 int my_local_size=comm_intra->size();
// Only local rank 0 of each intra communicator exchanges local sizes over the
// leaders communicator.
454 if(comm_intra->rank()==0) {
455 int* non_uniform_map = xbt_new0(int,leader_group_size);
456 smpi_coll_tuned_allgather_mpich(&my_local_size, 1, MPI_INT,
457 non_uniform_map, 1, MPI_INT, leader_comm);
// Compare every gathered size with the first one.
458 for(i=0; i < leader_group_size; i++) {
459 if(non_uniform_map[0] != non_uniform_map[i]) {
// Keep the gathered sizes only when the freshly computed flag is 0 while the stored
// flag is still non-zero; otherwise free them.
464 if(is_uniform==0 && this->is_uniform()!=0){
465 non_uniform_map_= non_uniform_map;
467 xbt_free(non_uniform_map);
469 is_uniform_=is_uniform;
// Local rank 0 then broadcasts is_uniform_ to the other members of comm_intra.
471 smpi_coll_tuned_bcast_mpich(&(is_uniform_),1, MPI_INT, 0, comm_intra );
473 if(smpi_privatize_global_variables){ //we need to switch as the called function may silently touch global variables
474 smpi_switch_data_segment(smpi_process_index());
476 // Are the ranks blocked ? = allocated contiguously on the SMP nodes
// Walk the local processes in intra-communicator order and look at their ranks in
// this communicator; the comparison of `that` with `prev` is not visible here.
478 int prev=this->group()->rank(comm_intra->group()->index(0));
479 for (i=1; i<my_local_size; i++){
480 int that=this->group()->rank(comm_intra->group()->index(i));
// Logical AND of the per-process flag across the whole communicator.
489 smpi_mpi_allreduce(&is_blocked, &(global_blocked), 1, MPI_INT, MPI_LAND, this);
// Both visible branches store the same value in is_blocked_; any other statements in
// these branches are not visible in this excerpt.
491 if(MPI_COMM_WORLD==MPI_COMM_UNINITIALIZED || this==MPI_COMM_WORLD){
493 is_blocked_=global_blocked;
496 is_blocked_=global_blocked;
498 xbt_free(leader_list);
// Re-enable replay mode; presumably guarded by the cached `replaying` flag (the
// guard line is not visible in this excerpt).
501 smpi_process_set_replaying(true);
// Removes the attribute stored under `keyval` from this communicator, calling the
// registered delete callback first when there is one.
//   keyval: attribute key; its raw int bytes are used as the dictionary key.
// The return statements and error checks are not visible in this excerpt.
504 int Comm::attr_delete(int keyval){
// Look up the registered callbacks for this keyval.
505 smpi_comm_key_elem elem =
506 static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(&keyval), sizeof(int)));
// NOTE(review): elem is dereferenced here; a null check is presumably on one of the
// lines missing from this excerpt - confirm.
509 if(elem->delete_fn!=MPI_NULL_DELETE_FN){
510 void* value = nullptr;
// Fetch the current value so it can be handed to the delete callback.
512 if(this->attr_get(keyval, &value, &flag)==MPI_SUCCESS){
513 int ret = elem->delete_fn(this, keyval, value, &flag);
518 if(attributes_==nullptr)
521 xbt_dict_remove_ext(attributes_, reinterpret_cast<const char*>(&keyval), sizeof(int));
// Reads the attribute stored under `keyval`.
//   keyval:     attribute key; its raw int bytes are used as the dictionary key.
//   attr_value: treated as a void** - the stored pointer is written through it.
//   flag:       out-parameter; how it is set is not visible in this excerpt.
// The return statements and error handling are not visible in this excerpt.
525 int Comm::attr_get(int keyval, void* attr_value, int* flag){
526 smpi_comm_key_elem elem =
527 static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(&keyval), sizeof(int)));
// Special-cases a communicator that has no attribute dictionary yet.
530 if(attributes_==nullptr){
535 *static_cast<void**>(attr_value) =
536 xbt_dict_get_ext(attributes_, reinterpret_cast<const char*>(&keyval), sizeof(int));
// Stores `attr_value` under `keyval` on this communicator. When a value is already
// present (flag != 0 after attr_get) and a delete callback is registered, that
// callback is run on the old value first.
//   keyval:     attribute key; its raw int bytes are used as the dictionary key.
//   attr_value: pointer stored as-is (no free function is registered for it).
// The return statements and error checks are not visible in this excerpt.
545 int Comm::attr_put(int keyval, void* attr_value){
// Create the global keyval registry on first use.
546 if(smpi_comm_keyvals==nullptr)
547 smpi_comm_keyvals = xbt_dict_new_homogeneous(nullptr);
548 smpi_comm_key_elem elem =
549 static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(&keyval), sizeof(int)));
553 void* value = nullptr;
554 this->attr_get(keyval, &value, &flag);
555 if(flag!=0 && elem->delete_fn!=MPI_NULL_DELETE_FN){
556 int ret = elem->delete_fn(this, keyval, value, &flag);
// Create this communicator's attribute dictionary on first use.
560 if(attributes_==nullptr)
561 attributes_ = xbt_dict_new_homogeneous(nullptr);
563 xbt_dict_set_ext(attributes_, reinterpret_cast<const char*>(&keyval), sizeof(int), attr_value, nullptr);
// Registers a new communicator keyval with its copy and delete callbacks.
//   copy_fn / delete_fn: callbacks stored in the new key element.
//   keyval: out-parameter; receives the current value of comm_keyval_id.
// The rest of the parameter list, the update of comm_keyval_id and the return
// statement are not visible in this excerpt.
570 int smpi_comm_keyval_create(MPI_Comm_copy_attr_function* copy_fn, MPI_Comm_delete_attr_function* delete_fn, int* keyval,
// Create the global keyval registry on first use.
572 if(smpi_comm_keyvals==nullptr)
573 smpi_comm_keyvals = xbt_dict_new_homogeneous(nullptr);
// Zero-initialised key element that holds the two callbacks.
575 smpi_comm_key_elem value = static_cast<smpi_comm_key_elem>(xbt_new0(s_smpi_mpi_comm_key_elem_t,1));
577 value->copy_fn=copy_fn;
578 value->delete_fn=delete_fn;
// The element is stored under the raw bytes of *keyval.
580 *keyval = comm_keyval_id;
581 xbt_dict_set_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(keyval), sizeof(int),static_cast<void*>(value), nullptr);
// Unregisters the keyval pointed to by `keyval` from the global registry.
//   keyval: pointer to the key; its raw int bytes are used as the dictionary key.
// What is done with `elem` between the lookup and the removal, and the return
// statement, are not visible in this excerpt.
586 int smpi_comm_keyval_free(int* keyval){
587 smpi_comm_key_elem elem =
588 static_cast<smpi_comm_key_elem>(xbt_dict_get_or_null_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(keyval), sizeof(int)));
591 xbt_dict_remove_ext(smpi_comm_keyvals, reinterpret_cast<const char*>(keyval), sizeof(int));