/* smpi_coll.c -- various optimized routines for collectives                */

/* Copyright (c) 2009, 2010. The SimGrid Team.
 * All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include <stdio.h>              /* printf */
#include <string.h>             /* strcmp, strdup, strcat */

#include "private.h"            /* SMPI internals: smpi_comm_rank(), smpi_isend_init(), ... */
#include "colls/colls.h"
#include "simgrid/sg_config.h"
s_mpi_coll_description_t mpi_coll_allgather_description[] = {
  {"default",
   "allgather default collective",
   smpi_mpi_allgather},
  COLL_ALLGATHERS(COLL_DESCRIPTION, COLL_COMMA),
  {NULL, NULL, NULL}      /* this array must be NULL terminated */
};
s_mpi_coll_description_t mpi_coll_allgatherv_description[] = {
  {"default",
   "allgatherv default collective",
   smpi_mpi_allgatherv},
  COLL_ALLGATHERVS(COLL_DESCRIPTION, COLL_COMMA),
  {NULL, NULL, NULL}      /* this array must be NULL terminated */
};
s_mpi_coll_description_t mpi_coll_allreduce_description[] = {
  {"default",
   "allreduce default collective",
   smpi_mpi_allreduce},
  COLL_ALLREDUCES(COLL_DESCRIPTION, COLL_COMMA),
  {NULL, NULL, NULL}      /* this array must be NULL terminated */
};
s_mpi_coll_description_t mpi_coll_alltoall_description[] = {
  {"default",
   "Ompi alltoall default collective",
   smpi_coll_tuned_alltoall_ompi2},
  COLL_ALLTOALLS(COLL_DESCRIPTION, COLL_COMMA),
  {"bruck",
   "Alltoall Bruck (SG) collective",
   smpi_coll_tuned_alltoall_bruck},
  {"basic_linear",
   "Alltoall basic linear (SG) collective",
   smpi_coll_tuned_alltoall_basic_linear},
  {NULL, NULL, NULL}      /* this array must be NULL terminated */
};
s_mpi_coll_description_t mpi_coll_alltoallv_description[] = {
  {"default",
   "Ompi alltoallv default collective",
   smpi_coll_basic_alltoallv},
  COLL_ALLTOALLVS(COLL_DESCRIPTION, COLL_COMMA),
  {NULL, NULL, NULL}      /* this array must be NULL terminated */
};
s_mpi_coll_description_t mpi_coll_bcast_description[] = {
  {"default",
   "bcast default collective",
   smpi_mpi_bcast},
  COLL_BCASTS(COLL_DESCRIPTION, COLL_COMMA),
  {NULL, NULL, NULL}      /* this array must be NULL terminated */
};
s_mpi_coll_description_t mpi_coll_reduce_description[] = {
  {"default",
   "reduce default collective",
   smpi_mpi_reduce},
  COLL_REDUCES(COLL_DESCRIPTION, COLL_COMMA),
  {NULL, NULL, NULL}      /* this array must be NULL terminated */
};
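
/* The COLL_ALLGATHERS / COLL_ALLTOALLS / ... macros used above are X-macro
 * lists from colls/colls.h: applying COLL_DESCRIPTION to every registered
 * algorithm expands to one table entry per algorithm.  Assumed shape of one
 * expanded entry (the exact macro definition lives in colls.h and may
 * differ):
 *
 *   {"<name>", "<category> <name> collective", smpi_coll_tuned_<category>_<name>},
 */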
/** Displays the long description of all registered collective models */
void coll_help(const char *category, s_mpi_coll_description_t * table)
{
  int i;

  printf("Long description of the %s models accepted by this simulator:\n",
         category);
  for (i = 0; table[i].name; i++)
    printf("  %s: %s\n", table[i].name, table[i].description);
}
/** Returns the index of the named model in the table, dying if none matches */
int find_coll_description(s_mpi_coll_description_t * table, char *name)
{
  int i;
  char *name_list = NULL;

  if (name == NULL) {   /* no argument provided: use the active selector's algorithm */
    name = (char *) sg_cfg_get_string("smpi/coll_selector");
  }
  for (i = 0; table[i].name; i++)
    if (!strcmp(name, table[i].name)) {
      return i;
    }
  /* the collective seems not to be handled by the active selector: try the default one */
  name = (char *) "default";
  for (i = 0; table[i].name; i++)
    if (!strcmp(name, table[i].name)) {
      return i;
    }
  /* nothing matched: build the list of valid names for the error message */
  name_list = strdup(table[0].name);
  for (i = 1; table[i].name; i++) {
    name_list = xbt_realloc(name_list,
                            strlen(name_list) + strlen(table[i].name) + 3);
    strcat(name_list, ", ");
    strcat(name_list, table[i].name);
  }
  xbt_die("Model '%s' is invalid! Valid models are: %s.", name, name_list);
}
XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_coll, smpi,
                                "Logging specific to SMPI (coll)");
int (*mpi_coll_allgather_fun)(void *, int, MPI_Datatype, void *, int, MPI_Datatype, MPI_Comm);
int (*mpi_coll_allgatherv_fun)(void *, int, MPI_Datatype, void *, int *, int *, MPI_Datatype, MPI_Comm);
int (*mpi_coll_allreduce_fun)(void *sbuf, void *rbuf, int rcount, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm);
int (*mpi_coll_alltoall_fun)(void *, int, MPI_Datatype, void *, int, MPI_Datatype, MPI_Comm);
int (*mpi_coll_alltoallv_fun)(void *, int *, int *, MPI_Datatype, void *, int *, int *, MPI_Datatype, MPI_Comm);
int (*mpi_coll_bcast_fun)(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm com);
int (*mpi_coll_reduce_fun)(void *buf, void *rbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm);
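
/* Illustrative sketch (kept out of the build): how a description table and
 * find_coll_description() above are combined to bind one of the function
 * pointers above.  The "smpi/alltoall" config key and the `.coll` field name
 * are assumptions modeled on the rest of SMPI; the actual binding is done
 * elsewhere (e.g. in smpi_global.c).
 */
#if 0
static void example_bind_alltoall(void)
{
  int id = find_coll_description(mpi_coll_alltoall_description,
                                 (char *) sg_cfg_get_string("smpi/alltoall"));
  mpi_coll_alltoall_fun =
      (int (*)(void *, int, MPI_Datatype, void *, int, MPI_Datatype,
               MPI_Comm)) mpi_coll_alltoall_description[id].coll;
}
#endif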
/* Implicit n-ary tree used by the tree-based collectives below */
struct s_proc_tree {
  int PROCTREE_A;               /* arity of the tree */
  int numChildren;
  int *child;                   /* children ranks; -1 for unused slots */
  int parent;
  int me;
  int root;
  int isRoot;
};
typedef struct s_proc_tree *proc_tree_t;

/* alloc and init */
static proc_tree_t alloc_tree(int arity)
{
  proc_tree_t tree = xbt_new(struct s_proc_tree, 1);
  int i;

  tree->PROCTREE_A = arity;
  tree->isRoot = 0;
  tree->numChildren = 0;
  tree->child = xbt_new(int, arity);
  for (i = 0; i < arity; i++) {
    tree->child[i] = -1;
  }
  tree->root = -1;
  tree->parent = -1;
  return tree;
}

/* free */
static void free_tree(proc_tree_t tree)
{
  xbt_free(tree->child);
  xbt_free(tree);
}
/**
 * Build the tree depending on a process rank (index) and the group size (extent)
 * @param root the rank of the tree root
 * @param rank the rank of the calling process
 * @param size the total number of processes
 **/
static void build_tree(int root, int rank, int size, proc_tree_t * tree)
{
  int index = (rank - root + size) % size;
  int firstChildIdx = index * (*tree)->PROCTREE_A + 1;
  int i;

  (*tree)->me = rank;
  (*tree)->root = root;

  for (i = 0; i < (*tree)->PROCTREE_A && firstChildIdx + i < size; i++) {
    (*tree)->child[i] = (firstChildIdx + i + root) % size;
    (*tree)->numChildren++;
  }
  if (rank == root) {
    (*tree)->isRoot = 1;
  } else {
    (*tree)->isRoot = 0;
    (*tree)->parent = (((index - 1) / (*tree)->PROCTREE_A) + root) % size;
  }
}
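
/* Worked example: arity = 2, root = 0, size = 7.  Then index = rank and
 * firstChildIdx = 2 * rank + 1, i.e. the classic implicit binary-heap layout:
 *
 *            0
 *          /   \
 *         1     2
 *        / \   / \
 *       3   4 5   6
 *
 * Rank 2 gets children {5, 6} and parent (2 - 1) / 2 = 0; ranks 3..6 are
 * leaves (numChildren stays 0).  A non-zero root shifts every rank by root
 * modulo size.
 */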
/**
 * Broadcast along the tree: receive from the parent, then forward to each child
 **/
static void tree_bcast(void *buf, int count, MPI_Datatype datatype,
                       MPI_Comm comm, proc_tree_t tree)
{
  int system_tag = 999;  // used negative int but smpi_create_request() declares this illegal (to be checked)
  int rank, i;
  MPI_Request *requests;

  rank = smpi_comm_rank(comm);
  /* wait for data from my parent in the tree */
  if (!tree->isRoot) {
    XBT_DEBUG("<%d> tree_bcast(): i am not root: recv from %d, tag=%d",
              rank, tree->parent, system_tag + rank);
    smpi_mpi_recv(buf, count, datatype, tree->parent, system_tag + rank,
                  comm, MPI_STATUS_IGNORE);
  }
  requests = xbt_new(MPI_Request, tree->numChildren);
  XBT_DEBUG("<%d> creates %d requests (1 per child)", rank,
            tree->numChildren);
  /* initiate sends to ranks lower in the tree */
  for (i = 0; i < tree->numChildren; i++) {
    if (tree->child[i] == -1) {
      requests[i] = MPI_REQUEST_NULL;
    } else {
      XBT_DEBUG("<%d> send to <%d>, tag=%d", rank, tree->child[i],
                system_tag + tree->child[i]);
      requests[i] =
          smpi_isend_init(buf, count, datatype, tree->child[i],
                          system_tag + tree->child[i], comm);
    }
  }
  smpi_mpi_startall(tree->numChildren, requests);
  smpi_mpi_waitall(tree->numChildren, requests, MPI_STATUS_IGNORE);
  xbt_free(requests);
}
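
/* Note on tags: both tree functions offset the base tag by a rank so that
 * concurrent messages stay distinguishable -- tree_bcast above tags each
 * message with system_tag + (receiver rank), while tree_antibcast below tags
 * with system_tag + (sender rank).
 */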
/**
 * Anti-broadcast: flows up the tree (each process sends to its parent, then
 * receives from all of its children)
 **/
static void tree_antibcast(void *buf, int count, MPI_Datatype datatype,
                           MPI_Comm comm, proc_tree_t tree)
{
  int system_tag = 999;  // used negative int but smpi_create_request() declares this illegal (to be checked)
  int rank, i;
  MPI_Request *requests;

  rank = smpi_comm_rank(comm);
  /* everyone sends to its parent, except root */
  if (!tree->isRoot) {
    XBT_DEBUG("<%d> tree_antibcast(): i am not root: send to %d, tag=%d",
              rank, tree->parent, system_tag + rank);
    smpi_mpi_send(buf, count, datatype, tree->parent, system_tag + rank,
                  comm);
  }
  /* everyone receives as many messages as it has children */
  requests = xbt_new(MPI_Request, tree->numChildren);
  XBT_DEBUG("<%d> creates %d requests (1 per child)", rank,
            tree->numChildren);
  for (i = 0; i < tree->numChildren; i++) {
    if (tree->child[i] == -1) {
      requests[i] = MPI_REQUEST_NULL;
    } else {
      XBT_DEBUG("<%d> recv from <%d>, tag=%d", rank, tree->child[i],
                system_tag + tree->child[i]);
      requests[i] =
          smpi_irecv_init(buf, count, datatype, tree->child[i],
                          system_tag + tree->child[i], comm);
    }
  }
  smpi_mpi_startall(tree->numChildren, requests);
  smpi_mpi_waitall(tree->numChildren, requests, MPI_STATUS_IGNORE);
  xbt_free(requests);
}
/**
 * Broadcast over an n-ary tree of the given arity (binary, ternary, ...)
 **/
void nary_tree_bcast(void *buf, int count, MPI_Datatype datatype, int root,
                     MPI_Comm comm, int arity)
{
  proc_tree_t tree = alloc_tree(arity);
  int rank, size;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  build_tree(root, rank, size, &tree);
  tree_bcast(buf, count, datatype, comm, tree);
  free_tree(tree);
}
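
/* Typical use, e.g. from a default bcast implementation (the arity value of 4
 * here is only an assumption about the caller, nothing in this file fixes it):
 *
 *   nary_tree_bcast(buf, count, datatype, root, comm, 4);
 */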
/**
 * Barrier over an n-ary tree of the given arity (binary, ternary, ...)
 **/
void nary_tree_barrier(MPI_Comm comm, int arity)
{
  proc_tree_t tree = alloc_tree(arity);
  int rank, size;
  char dummy = '$';

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  build_tree(0, rank, size, &tree);
  tree_antibcast(&dummy, 1, MPI_CHAR, comm, tree);
  tree_bcast(&dummy, 1, MPI_CHAR, comm, tree);
  free_tree(tree);
}
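
/* Why this implements a barrier: tree_antibcast() flows up the tree, so the
 * root only gets past it once every other process has entered the barrier;
 * tree_bcast() then flows back down, releasing each process only after the
 * root (and hence everyone) has arrived.
 */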
int smpi_coll_tuned_alltoall_ompi2(void *sendbuf, int sendcount,
                                   MPI_Datatype sendtype, void *recvbuf,
                                   int recvcount, MPI_Datatype recvtype,
                                   MPI_Comm comm)
{
  int size, sendsize;

  size = smpi_comm_size(comm);
  sendsize = smpi_datatype_size(sendtype) * sendcount;
  if (sendsize < 200 && size > 12) {
    return
        smpi_coll_tuned_alltoall_bruck(sendbuf, sendcount, sendtype,
                                       recvbuf, recvcount, recvtype,
                                       comm);
  } else if (sendsize < 3000) {
    return
        smpi_coll_tuned_alltoall_basic_linear(sendbuf, sendcount,
                                              sendtype, recvbuf,
                                              recvcount, recvtype, comm);
  } else {
    return
        smpi_coll_tuned_alltoall_ring(sendbuf, sendcount, sendtype,
                                      recvbuf, recvcount, recvtype,
                                      comm);
  }
}
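
/* Selection summary, with s = sendcount * smpi_datatype_size(sendtype) the
 * per-destination payload:
 *   s < 200 bytes and more than 12 ranks -> bruck
 *   s < 3000 bytes                       -> basic_linear
 *   otherwise                            -> ring
 */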
/**
 * Alltoall Bruck
 *
 * Openmpi calls this routine when the message size sent to each rank is < 2000 bytes and size < 12.
 * FIXME: uh, check smpi_pmpi again, but this routine seems to be called for size > 12, not
 * size < 12 as stated above.
 **/
int smpi_coll_tuned_alltoall_bruck(void *sendbuf, int sendcount,
                                   MPI_Datatype sendtype, void *recvbuf,
                                   int recvcount, MPI_Datatype recvtype,
                                   MPI_Comm comm)
{
  int system_tag = 777;
  int i, rank, size, err, count;
  MPI_Aint lb;
  MPI_Aint sendext = 0;
  MPI_Aint recvext = 0;
  MPI_Request *requests;

  // FIXME: check implementation
  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  XBT_DEBUG("<%d> algorithm alltoall_bruck() called.", rank);
  err = smpi_datatype_extent(sendtype, &lb, &sendext);
  err = smpi_datatype_extent(recvtype, &lb, &recvext);
  /* Local copy from self */
  err =
      smpi_datatype_copy((char *)sendbuf + rank * sendcount * sendext,
                         sendcount, sendtype,
                         (char *)recvbuf + rank * recvcount * recvext,
                         recvcount, recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */
    requests = xbt_new(MPI_Request, 2 * (size - 1));
    count = 0;
    /* Create all receives that will be posted first */
    for (i = 0; i < size; ++i) {
      if (i == rank) {
        XBT_DEBUG("<%d> skip request creation [src = %d, recvcount = %d]",
                  rank, i, recvcount);
        continue;
      }
      requests[count] =
          smpi_irecv_init((char *)recvbuf + i * recvcount * recvext, recvcount,
                          recvtype, i, system_tag, comm);
      count++;
    }
    /* Now create all sends */
    for (i = 0; i < size; ++i) {
      if (i == rank) {
        XBT_DEBUG("<%d> skip request creation [dst = %d, sendcount = %d]",
                  rank, i, sendcount);
        continue;
      }
      requests[count] =
          smpi_isend_init((char *)sendbuf + i * sendcount * sendext, sendcount,
                          sendtype, i, system_tag, comm);
      count++;
    }
    /* Wait for them all. */
    smpi_mpi_startall(count, requests);
    XBT_DEBUG("<%d> wait for %d requests", rank, count);
    smpi_mpi_waitall(count, requests, MPI_STATUS_IGNORE);
    xbt_free(requests);
  }
  return err;
}
/**
 * Alltoall basic_linear (STARMPI:alltoall-simple)
 **/
int smpi_coll_tuned_alltoall_basic_linear(void *sendbuf, int sendcount,
                                          MPI_Datatype sendtype,
                                          void *recvbuf, int recvcount,
                                          MPI_Datatype recvtype,
                                          MPI_Comm comm)
{
  int system_tag = 888;
  int i, rank, size, err, count;
  MPI_Aint lb = 0, sendext = 0, recvext = 0;
  MPI_Request *requests;

  /* Initialize. */
  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  XBT_DEBUG("<%d> algorithm alltoall_basic_linear() called.", rank);
  err = smpi_datatype_extent(sendtype, &lb, &sendext);
  err = smpi_datatype_extent(recvtype, &lb, &recvext);
  /* simple optimization: copy the local block directly */
  err = smpi_datatype_copy((char *)sendbuf + rank * sendcount * sendext,
                           sendcount, sendtype,
                           (char *)recvbuf + rank * recvcount * recvext,
                           recvcount, recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */
    requests = xbt_new(MPI_Request, 2 * (size - 1));
    count = 0;
    /* Post all receives first -- a simple optimization */
    for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
      requests[count] =
          smpi_irecv_init((char *)recvbuf + i * recvcount * recvext, recvcount,
                          recvtype, i, system_tag, comm);
      count++;
    }
    /* Now post all sends in reverse order
     *   - We would like to minimize the search time through message queue
     *     when messages actually arrive in the order in which they were posted.
     * TODO: check the previous assertion
     */
    for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size) {
      requests[count] =
          smpi_isend_init((char *)sendbuf + i * sendcount * sendext, sendcount,
                          sendtype, i, system_tag, comm);
      count++;
    }
    /* Wait for them all. */
    smpi_mpi_startall(count, requests);
    XBT_DEBUG("<%d> wait for %d requests", rank, count);
    smpi_mpi_waitall(count, requests, MPI_STATUS_IGNORE);
    xbt_free(requests);
  }
  return err;
}
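
/* Posting-order illustration for rank 1 out of 4 ranks: receives are posted
 * from peers 2, 3, 0 (walking up from rank+1) while sends go to 0, 3, 2
 * (walking down from rank-1), so a peer's incoming message tends to find its
 * matching receive already posted.
 */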
int smpi_coll_basic_alltoallv(void *sendbuf, int *sendcounts,
                              int *senddisps, MPI_Datatype sendtype,
                              void *recvbuf, int *recvcounts,
                              int *recvdisps, MPI_Datatype recvtype,
                              MPI_Comm comm)
{
  int system_tag = 889;
  int i, rank, size, err, count;
  MPI_Aint lb = 0, sendext = 0, recvext = 0;
  MPI_Request *requests;

  /* Initialize. */
  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  XBT_DEBUG("<%d> algorithm basic_alltoallv() called.", rank);
  err = smpi_datatype_extent(sendtype, &lb, &sendext);
  err = smpi_datatype_extent(recvtype, &lb, &recvext);
  /* Local copy from self */
  err =
      smpi_datatype_copy((char *)sendbuf + senddisps[rank] * sendext,
                         sendcounts[rank], sendtype,
                         (char *)recvbuf + recvdisps[rank] * recvext,
                         recvcounts[rank], recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */
    requests = xbt_new(MPI_Request, 2 * (size - 1));
    count = 0;
    /* Create all receives that will be posted first */
    for (i = 0; i < size; ++i) {
      if (i == rank || recvcounts[i] == 0) {
        XBT_DEBUG
            ("<%d> skip request creation [src = %d, recvcounts[src] = %d]",
             rank, i, recvcounts[i]);
        continue;
      }
      requests[count] =
          smpi_irecv_init((char *)recvbuf + recvdisps[i] * recvext,
                          recvcounts[i], recvtype, i, system_tag, comm);
      count++;
    }
    /* Now create all sends */
    for (i = 0; i < size; ++i) {
      if (i == rank || sendcounts[i] == 0) {
        XBT_DEBUG
            ("<%d> skip request creation [dst = %d, sendcounts[dst] = %d]",
             rank, i, sendcounts[i]);
        continue;
      }
      requests[count] =
          smpi_isend_init((char *)sendbuf + senddisps[i] * sendext,
                          sendcounts[i], sendtype, i, system_tag, comm);
      count++;
    }
    /* Wait for them all. */
    smpi_mpi_startall(count, requests);
    XBT_DEBUG("<%d> wait for %d requests", rank, count);
    smpi_mpi_waitall(count, requests, MPI_STATUS_IGNORE);
    xbt_free(requests);
  }
  return err;
}