/* smpi_coll.c -- various optimized routines for collectives */

/* Copyright (c) 2009 Stephane Genaud. */
/* All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */

#include "smpi_coll_private.h"
/* proc_tree taken and translated from P2P-MPI */

/**
 * Structure to hold the relation tree that an MPI process has
 * to the other processes when a tree-based algorithm is used.
 **/
struct proc_tree {
        int PROCTREE_A;   /* the tree arity */
        int numChildren;
        int *child;       /* ranks of the children, -1 if unset */
        int parent;       /* rank of the parent, -1 for the root */
        int me;
        int root;
};
typedef struct proc_tree * proc_tree_t;
void build_tree( int index, int extent, proc_tree_t *tree);
proc_tree_t alloc_tree( int arity );
void free_tree( proc_tree_t tree);
void print_tree(proc_tree_t tree);
proc_tree_t alloc_tree( int arity ) {
        proc_tree_t tree = malloc(1*sizeof(struct proc_tree));
        int i;

        tree->PROCTREE_A = arity;
        tree->numChildren = 0;
        tree->child = malloc(arity*sizeof(int));
        for (i=0; i < arity; i++) {
                tree->child[i] = -1;
        }
        tree->parent = -1;
        return tree;
}
void free_tree( proc_tree_t tree) {
        free( tree->child );
        free( tree );
}
/**
 * Build the tree depending on a process rank (index) and the group size (extent)
 * @param index the rank of the calling process
 * @param extent the total number of processes
 **/
void build_tree( int index, int extent, proc_tree_t *tree) {
        int places = (*tree)->PROCTREE_A * index;
        int i, ch, pr;

        (*tree)->me = index;
        (*tree)->root = 0;

        for (i = 1; i <= (*tree)->PROCTREE_A; i++) {
                ++places;
                ch = (*tree)->PROCTREE_A * index + i + (*tree)->root;
                //printf("places %d\n",places);
                if (places < extent) {
                        //printf("ch <%d> = <%d>\n",i,ch);
                        //printf("adding to the tree at index <%d>\n\n",i-1);
                        (*tree)->child[i - 1] = ch;
                        (*tree)->numChildren++;
                }
                else {
                        //fprintf(stderr,"not adding to the tree\n");
                }
        }
        //fprintf(stderr,"procTree.numChildren <%d>\n",(*tree)->numChildren);

        if (index == (*tree)->root) {
                (*tree)->parent = -1; /* the root has no parent */
        }
        else {
                pr = (index - 1) / (*tree)->PROCTREE_A;
                (*tree)->parent = pr;
        }
}
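/*
 * For example: with arity 2 and extent 7, each rank r gets the children
 * 2*r+1 and 2*r+2 when those ranks exist, and the parent (r-1)/2, which
 * is the usual complete binary tree:
 *
 *                0
 *              /   \
 *             1     2
 *            / \   / \
 *           3   4 5   6
 */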
/**
 * Prints the tree as a graphical representation
 **/
void print_tree(proc_tree_t tree) {
        int i;
        char *spacer;

        if (-1 != tree->parent ) {
                printf("[%d]\n   +---",tree->parent);
                spacer = strdup("     ");
        }
        else {
                spacer = strdup("");
        }
        printf("<%d>\n",tree->me);
        for (i=0;i < tree->numChildren; i++) {
                printf("%s   +--- %d\n", spacer,tree->child[i]);
        }
        free(spacer);
}
/**
 * bcast
 **/
int tree_bcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm, proc_tree_t tree);
int tree_bcast( void *buf, int count, MPI_Datatype datatype, int root,
                MPI_Comm comm, proc_tree_t tree)
{
        int system_tag = 999;  // we used a negative int but smpi_create_request() declares this illegal (to be checked)
        int rank;
        int retval = MPI_SUCCESS;
        int i;
        smpi_mpi_request_t request;
        smpi_mpi_request_t * requests;

        rank = smpi_mpi_comm_rank(comm);

        /* wait for data from my parent in the tree */
        if (-1 != tree->parent) {
                printf("[%d] recv(%d from %d, tag=%d)\n",rank,rank, tree->parent, system_tag+rank);
                retval = smpi_create_request(buf, count, datatype,
                                tree->parent, rank,
                                system_tag + rank,
                                comm, &request);
                if (MPI_SUCCESS != retval) {
                        printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
                                        rank,retval,__FILE__,__LINE__);
                }
                smpi_mpi_irecv(request);
                printf("[%d] waiting on irecv from %d\n",rank , tree->parent);
                smpi_mpi_wait(request, MPI_STATUS_IGNORE);
                xbt_mallocator_release(smpi_global->request_mallocator, request);
        }

        requests = xbt_malloc( tree->numChildren * sizeof(smpi_mpi_request_t));
        printf("[%d] creates %d requests\n",rank,tree->numChildren);

        /* initiate sends to ranks lower in the tree */
        for (i=0; i < tree->numChildren; i++) {
                if (tree->child[i] != -1) {
                        printf("[%d] send(%d->%d, tag=%d)\n",rank,rank, tree->child[i], system_tag+tree->child[i]);
                        retval = smpi_create_request(buf, count, datatype,
                                        rank, tree->child[i],
                                        system_tag + tree->child[i],
                                        comm, &(requests[i]));
                        printf("[%d] after create req[%d]=%p req->(src=%d,dst=%d)\n",rank, i, requests[i], requests[i]->src, requests[i]->dst);
                        if (MPI_SUCCESS != retval) {
                                printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
                                                rank,retval,__FILE__,__LINE__);
                        }
                        smpi_mpi_isend(requests[i]);
                        /* FIXME : we should not wait immediately here. See next FIXME. */
                        smpi_mpi_wait( requests[i], MPI_STATUS_IGNORE);
                        xbt_mallocator_release(smpi_global->request_mallocator, requests[i]);
                }
        }
        /* FIXME : normally, we should wait only once all isends have been issued,
         * which is the commented code below. It deadlocks, probably because
         * of a bug in the sender process. */

        /* wait for completion of sends */
        /* printf("[%d] wait for %d send completions\n",rank,tree->numChildren);
           smpi_mpi_waitall( tree->numChildren, requests, MPI_STATUS_IGNORE);
           printf("[%d] reqs completed\n",rank);
         */
        xbt_free(requests);
        return retval;
}
/* checked ok with valgrind --leak-check=full */
/**
 * anti-bcast: the same communications as tree_bcast, but in the
 * opposite direction (leaves to root)
 **/
int tree_antibcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm, proc_tree_t tree);
int tree_antibcast( void *buf, int count, MPI_Datatype datatype, int root,
                MPI_Comm comm, proc_tree_t tree)
{
        int system_tag = 999;  // we used a negative int but smpi_create_request() declares this illegal (to be checked)
        int rank;
        int retval = MPI_SUCCESS;
        int i;
        smpi_mpi_request_t request;
        smpi_mpi_request_t * requests;

        rank = smpi_mpi_comm_rank(comm);

        //------------------anti-bcast-------------------

        // everyone sends to its parent, except root.
        if (-1 != tree->parent) {
                retval = smpi_create_request(buf, count, datatype,
                                rank, tree->parent,
                                system_tag + rank,
                                comm, &request);
                if (MPI_SUCCESS != retval) {
                        printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
                                        rank,retval,__FILE__,__LINE__);
                }
                smpi_mpi_isend(request);
                smpi_mpi_wait(request, MPI_STATUS_IGNORE);
                xbt_mallocator_release(smpi_global->request_mallocator, request);
        }

        // everyone receives as many messages as it has children
        requests = xbt_malloc( tree->numChildren * sizeof(smpi_mpi_request_t));
        for (i=0; i < tree->numChildren; i++) {
                if (tree->child[i] != -1) {
                        retval = smpi_create_request(buf, count, datatype,
                                        tree->child[i], rank,
                                        system_tag + tree->child[i],
                                        comm, &(requests[i]));
                        if (MPI_SUCCESS != retval) {
                                printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
                                                rank,retval,__FILE__,__LINE__);
                        }
                        smpi_mpi_irecv(requests[i]);
                        /* FIXME : we should not wait immediately here, see the FIXME in tree_bcast(). */
                        smpi_mpi_wait( requests[i], MPI_STATUS_IGNORE);
                        xbt_mallocator_release(smpi_global->request_mallocator, requests[i]);
                }
        }
        xbt_free(requests);
        return retval;
}
/* checked ok with valgrind --leak-check=full */
/**
 * bcast with a binary, ternary, or whatever tree ..
 **/
int nary_tree_bcast(void *buf, int count, MPI_Datatype datatype, int root,
                MPI_Comm comm, int arity)
{
        int rank;
        int retval;

        rank = smpi_mpi_comm_rank( comm );
        // arity=2: a binary tree; arity=4 seems to be a good setting (see P2P-MPI)
        proc_tree_t tree = alloc_tree( arity );
        build_tree( rank, comm->size, &tree );

        retval = tree_bcast( buf, count, datatype, root, comm, tree );

        free_tree( tree );
        return retval;
}
/**
 * barrier with a binary, ternary, or whatever tree ..
 **/
int nary_tree_barrier( MPI_Comm comm , int arity)
{
        int rank;
        int retval = MPI_SUCCESS;
        char dummy = '$';

        rank = smpi_mpi_comm_rank(comm);
        // arity=2: a binary tree; arity=4 seems to be a good setting (see P2P-MPI)
        proc_tree_t tree = alloc_tree( arity );
        build_tree( rank, comm->size, &tree );

        retval = tree_antibcast( &dummy, 1, MPI_CHAR, 0, comm, tree);
        if (MPI_SUCCESS != retval) {
                printf("[%s:%d] ** Error: tree_antibcast() returned retval=%d\n",__FILE__,__LINE__,retval);
        }
        else {
                retval = tree_bcast(&dummy, 1, MPI_CHAR, 0, comm, tree);
        }

        free_tree( tree );
        return retval;
}
/* checked ok with valgrind --leak-check=full */
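/*
 * Usage sketch: a 4-ary tree barrier over a communicator comm is simply
 *
 *     nary_tree_barrier( comm, 4 );
 *
 * Every rank first signals its parent (tree_antibcast); once the root has
 * heard from its whole subtree, every process has entered the barrier, and
 * the release then flows back down the tree (tree_bcast).
 */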
/**
 * Alltoall pairwise: the exchange runs in `size` steps; at step s, each
 * rank sends its block for (rank+s)%size and receives from (rank-s+size)%size.
 **/
int smpi_coll_tuned_alltoall_pairwise (void *sendbuf, int sendcount, MPI_Datatype datatype,
                void* recvbuf, int recvcount, MPI_Datatype recvdatatype, MPI_Comm comm)
{
        int retval = MPI_SUCCESS;
        int rank;
        int size = comm->size;
        int step;
        int sendto, recvfrom;
        int tag_alltoall = 999;
        void * tmpsend, *tmprecv;

        rank = smpi_mpi_comm_rank(comm);

        /* Perform pairwise exchange - starting from 1 so the local copy is last */
        for (step = 1; step < size+1; step++) {

                /* who do we talk to in this step? */
                sendto = (rank+step)%size;
                recvfrom = (rank+size-step)%size;

                /* which block do we send, and where do we store what we receive? */
                tmpsend = (char*)sendbuf + sendto*datatype->size*sendcount;
                tmprecv = (char*)recvbuf + recvfrom*recvdatatype->size*recvcount;

                /* send and receive */
                /* in OpenMPI, they use:
                   err = ompi_coll_tuned_sendrecv( tmpsend, scount, sdtype, sendto, MCA_COLL_BASE_TAG_ALLTOALL,
                                   tmprecv, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL,
                                   comm, MPI_STATUS_IGNORE, rank);
                 */
                retval = MPI_Sendrecv(tmpsend, sendcount, datatype, sendto, tag_alltoall,
                                tmprecv, recvcount, recvdatatype, recvfrom, tag_alltoall,
                                comm, MPI_STATUS_IGNORE);
        }
        return retval;
}
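/*
 * Worked example: with size=4, rank 1 talks to
 *   step 1: sendto=2, recvfrom=0
 *   step 2: sendto=3, recvfrom=3
 *   step 3: sendto=0, recvfrom=2
 *   step 4: sendto=1, recvfrom=1   (the local copy, performed last)
 * At every step the send/receive targets form a permutation of the ranks,
 * so each process is paired with exactly one sender and one receiver.
 */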
/**
 * Alltoall Bruck
 * (adapted from OpenMPI's tuned alltoall; still a work in progress:
 * most of the communication steps below are commented out)
 **/
int smpi_coll_tuned_alltoall_bruck(void *sendbuf, int sendcount, MPI_Datatype sdtype,
                void* recvbuf, int recvcount, MPI_Datatype rdtype, MPI_Comm comm)
{
        int size, rank;
        int i, k;
        int line = -1;
        int sendto, recvfrom, distance, *displs=NULL, *blen=NULL;
        int maxpacksize, packsize, position;
        char * tmpbuf=NULL, *packbuf=NULL;
        ptrdiff_t lb, sext, rext;
        int err = 0;

        size = ompi_comm_size(comm);
        rank = ompi_comm_rank(comm);

        OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_bruck rank %d", rank));

        err = ompi_ddt_get_extent (sdtype, &lb, &sext);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        err = ompi_ddt_get_extent (rdtype, &lb, &rext);
        if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

        displs = (int *) malloc(size*sizeof(int));
        if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
        blen = (int *) malloc(size*sizeof(int));
        if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }

        /* Prepare for packing data */
        /* err = MPI_Pack_size( scount*size, sdtype, comm, &maxpacksize );
           if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } */

        /* pack buffer allocation */
        /* packbuf = (char*) malloc((unsigned) maxpacksize);
           if (packbuf == NULL) { line = __LINE__; err = -1; goto err_hndl; } */

        /* tmp buffer allocation for message data */
        /* tmpbuf = (char *) malloc(scount*size*sext);
           if (tmpbuf == NULL) { line = __LINE__; err = -1; goto err_hndl; } */

        /* Step 1 - local rotation - shift up by rank */
        /* err = ompi_ddt_copy_content_same_ddt (sdtype, (int32_t) ((size-rank)*scount),
                           tmpbuf, ((char*)sbuf)+rank*scount*sext);
           if (err<0) {
                   line = __LINE__; err = -1; goto err_hndl;
           }
           if (rank != 0) {
                   err = ompi_ddt_copy_content_same_ddt (sdtype, (int32_t) (rank*scount),
                                   tmpbuf+(size-rank)*scount*sext, (char*)sbuf);
                   if (err<0) {
                           line = __LINE__; err = -1; goto err_hndl;
                   }
           } */

        /* Step 2 - perform communication step */
        // for (distance = 1; distance < size; distance<<=1) {
        //
        //         /* send data to "sendto" */
        //         sendto = (rank+distance)%size;
        //         recvfrom = (rank-distance+size)%size;
        //         k = 0;
        //
        //         /* create indexed datatype */
        //         for (i = 1; i < size; i++) {
        //                 if ((i&distance) == distance) {
        //                         displs[k] = i*scount; blen[k] = scount;
        //                         k++;
        //                 }
        //         }
        //         /* Set indexes and displacements */
        //         err = MPI_Type_indexed(k, blen, displs, sdtype, &iddt);
        //         if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        //         /* Commit the new datatype */
        //         err = MPI_Type_commit(&iddt);
        //         if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        //
        //         /* have the new distribution ddt, pack and exchange data */
        //         err = MPI_Pack(tmpbuf, 1, iddt, packbuf, maxpacksize, &packsize, comm);
        //         if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        //
        //         /* Sendreceive */
        //         err = ompi_coll_tuned_sendrecv ( packbuf, packsize, MPI_PACKED, sendto,
        //                         MCA_COLL_BASE_TAG_ALLTOALL,
        //                         rbuf, packsize, MPI_PACKED, recvfrom,
        //                         MCA_COLL_BASE_TAG_ALLTOALL,
        //                         comm, MPI_STATUS_IGNORE, rank);
        //         if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        //
        //         /* Unpack data from rbuf to tmpbuf */
        //         position = 0;
        //         err = MPI_Unpack(rbuf, packsize, &position,
        //                         tmpbuf, 1, iddt, comm);
        //         if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        //
        //         /* free ddt */
        //         err = MPI_Type_free(&iddt);
        //         if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
        // } /* end of for (distance = 1... */

        /* Step 3 - local rotation - */
        // for (i = 0; i < size; i++) {
        //         err = ompi_ddt_copy_content_same_ddt (rdtype, (int32_t) rcount,
        //                         ((char*)rbuf)+(((rank-i+size)%size)*rcount*rext),
        //                         tmpbuf+i*rcount*rext);
        //         if (err<0) {
        //                 line = __LINE__; err = -1; goto err_hndl;
        //         }
        // }

        /* Step 4 - clean up */
        /* if (tmpbuf != NULL) free(tmpbuf);
           if (packbuf != NULL) free(packbuf); */
        if (displs != NULL) free(displs);
        if (blen != NULL) free(blen);
        return err;

err_hndl:
        OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
        if (tmpbuf != NULL) free(tmpbuf);
        if (packbuf != NULL) free(packbuf);
        if (displs != NULL) free(displs);
        if (blen != NULL) free(blen);
        return err;
}
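/*
 * Illustration of the commented-out Bruck exchange above: with size=8,
 * the distance loop runs for distance = 1, 2, 4. At distance d, each
 * rank r packs every block whose index i has bit d set ((i&d)==d) and
 * sends the whole packet to (r+d)%size, so the alltoall completes in
 * log2(size) rounds instead of the size-1 pairwise ones.
 */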
/**
 * -----------------------------------------------------------------------------------------------------
 * example/test code for the tree routines
 * -----------------------------------------------------------------------------------------------------
 **/
/* assumes int rank, int size and proc_tree_t tree are declared by the enclosing test driver */
for (rank=0;rank<size;rank++) {
        printf("--------------tree for rank %d ----------\n",rank);
        tree = alloc_tree( 2 );
        build_tree( rank, size, &tree );
        print_tree( tree );
        free_tree( tree );
}

printf("-------------- bcast ----------\n");
for (rank=0;rank<size;rank++) {