/* smpi_coll.c -- various optimized routines for collectives */

/* Copyright (c) 2009 Stephane Genaud. */
/* All rights reserved. */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */
#include "smpi_coll_private.h"

XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_coll, smpi,
                                "Logging specific to SMPI (coll)");
/* proc_tree taken and translated from P2P-MPI */

typedef struct proc_tree * proc_tree_t;

void build_tree( int index, int extent, proc_tree_t *tree);
proc_tree_t alloc_tree( int arity );
void free_tree( proc_tree_t tree);
void print_tree(proc_tree_t tree);
proc_tree_t alloc_tree( int arity ) {
  proc_tree_t tree = malloc(1*sizeof(struct proc_tree));

  tree->PROCTREE_A = arity;
  tree->numChildren = 0;
  tree->child = malloc(arity*sizeof(int));
  for (i=0; i < arity; i++) {
void free_tree( proc_tree_t tree) {

/**
 * Build the tree depending on a process rank (index) and the group size (extent)
 * @param index the rank of the calling process
 * @param extent the total number of processes
 */
void build_tree( int index, int extent, proc_tree_t *tree) {
  int places = (*tree)->PROCTREE_A * index;

  for (i = 1; i <= (*tree)->PROCTREE_A; i++) {
    ch = (*tree)->PROCTREE_A * index + i + (*tree)->root;
    //printf("places %d\n",places);
    if (places < extent) {
      //printf("ch <%d> = <%d>\n",i,ch);
      //printf("adding to the tree at index <%d>\n\n",i-1);
      (*tree)->child[i - 1] = ch;
      (*tree)->numChildren++;
    //fprintf(stderr,"not adding to the tree\n");
  //fprintf(stderr,"procTree.numChildren <%d>\n",(*tree)->numChildren);

  if (index == (*tree)->root) {
    pr = (index - 1) / (*tree)->PROCTREE_A;
    (*tree)->parent = pr;
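/* Worked example (added for illustration, assuming the tree root is rank 0):
 * with arity 2 and extent 7, rank r gets children 2*r+1 and 2*r+2 (when those
 * ranks still fall inside the group) and, for r > 0, parent (r-1)/2, i.e. the
 * binary tree
 *
 *            0
 *          /   \
 *         1     2
 *        / \   / \
 *       3   4 5   6
 *
 * nary_tree_bcast() and nary_tree_barrier() below build such a tree with
 * alloc_tree()/build_tree() and walk it with tree_bcast()/tree_antibcast(). */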
/**
 * prints the tree as a graphical representation
 */
void print_tree(proc_tree_t tree) {

  if (-1 != tree->parent ) {
    printf("[%d]\n +---",tree->parent);
  printf("<%d>\n",tree->me);
  for (i=0;i < tree->numChildren; i++) {
    printf("%s +--- %d\n", spacer,tree->child[i]);
int tree_bcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm, proc_tree_t tree);
int tree_bcast( void *buf, int count, MPI_Datatype datatype, int root,
                MPI_Comm comm, proc_tree_t tree)
  int system_tag = 999; // we used a negative int, but smpi_create_request() declares this illegal (to be checked)
  int retval = MPI_SUCCESS;
  smpi_mpi_request_t request;
  smpi_mpi_request_t * requests;

  rank = smpi_mpi_comm_rank(comm);

  /* wait for data from my parent in the tree */
  DEBUG3("<%d> tree_bcast(): i am not root: recv from %d, tag=%d",rank,tree->parent,system_tag+rank);
  retval = smpi_create_request(buf, count, datatype,
  if (MPI_SUCCESS != retval) {
    printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
           rank,retval,__FILE__,__LINE__);
  smpi_mpi_irecv(request);
  DEBUG2("<%d> tree_bcast(): waiting on irecv from %d",rank, tree->parent);
  smpi_mpi_wait(request, MPI_STATUS_IGNORE);
  xbt_mallocator_release(smpi_global->request_mallocator, request);
  requests = xbt_malloc( tree->numChildren * sizeof(smpi_mpi_request_t));
  DEBUG2("<%d> creates %d requests (1 per child)\n",rank,tree->numChildren);

  /* initiate sends to the ranks lower in the tree */
  for (i=0; i < tree->numChildren; i++) {
    if (tree->child[i] != -1) {
      DEBUG3("<%d> send to <%d>,tag=%d",rank,tree->child[i], system_tag+tree->child[i]);
      retval = smpi_create_request(buf, count, datatype,
                                   rank, tree->child[i],
                                   system_tag + tree->child[i],
                                   comm, &(requests[i]));
      if (MPI_SUCCESS != retval) {
        printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
               rank,retval,__FILE__,__LINE__);
      smpi_mpi_isend(requests[i]);
      /* FIXME : we should not wait immediately here. See next FIXME. */
      smpi_mpi_wait( requests[i], MPI_STATUS_IGNORE);
      xbt_mallocator_release(smpi_global->request_mallocator, requests[i]);

  /* FIXME : normally, we should wait only once all isends have been issued:
   * this is the following commented code. It deadlocks, probably because
   * of a bug in the sender process */

  /* wait for completion of sends */
  /* printf("[%d] wait for %d send completions\n",rank,tree->numChildren);
     smpi_mpi_waitall( tree->numChildren, requests, MPI_STATUS_IGNORE);
     printf("[%d] reqs completed\n",rank); */

/* checked ok with valgrind --leak-check=full */
int tree_antibcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm, proc_tree_t tree);
int tree_antibcast( void *buf, int count, MPI_Datatype datatype, int root,
                    MPI_Comm comm, proc_tree_t tree)
  int system_tag = 999; // we used a negative int, but smpi_create_request() declares this illegal (to be checked)
  int retval = MPI_SUCCESS;
  smpi_mpi_request_t request;
  smpi_mpi_request_t * requests;

  rank = smpi_mpi_comm_rank(comm);
  //------------------anti-bcast-------------------

  // everyone sends to its parent, except root.
  retval = smpi_create_request(buf, count, datatype,
  if (MPI_SUCCESS != retval) {
    ERROR4("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
           rank,retval,__FILE__,__LINE__);
  smpi_mpi_isend(request);
  smpi_mpi_wait(request, MPI_STATUS_IGNORE);
  xbt_mallocator_release(smpi_global->request_mallocator, request);
  // everyone receives as many messages as it has children
  requests = xbt_malloc( tree->numChildren * sizeof(smpi_mpi_request_t));
  for (i=0; i < tree->numChildren; i++) {
    if (tree->child[i] != -1) {
      retval = smpi_create_request(buf, count, datatype,
                                   tree->child[i], rank,
                                   system_tag + tree->child[i],
                                   comm, &(requests[i]));
      if (MPI_SUCCESS != retval) {
        printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
               rank,retval,__FILE__,__LINE__);
      smpi_mpi_irecv(requests[i]);
      /* FIXME : we should not wait immediately here. See next FIXME. */
      smpi_mpi_wait( requests[i], MPI_STATUS_IGNORE);
      xbt_mallocator_release(smpi_global->request_mallocator, requests[i]);

/* checked ok with valgrind --leak-check=full */
/**
 * bcast with a binary, ternary, or whatever tree ..
 */
int nary_tree_bcast(void *buf, int count, MPI_Datatype datatype, int root,
                    MPI_Comm comm, int arity)

  rank = smpi_mpi_comm_rank( comm );
  DEBUG2("<%d> entered nary_tree_bcast(), arity=%d",rank,arity);
  // arity=2 is a binary tree; arity=4 seems to be a good setting (see P2P-MPI)
  proc_tree_t tree = alloc_tree( arity );
  build_tree( rank, comm->size, &tree );

  retval = tree_bcast( buf, count, datatype, root, comm, tree );
int nary_tree_barrier( MPI_Comm comm, int arity )
  int retval = MPI_SUCCESS;

  rank = smpi_mpi_comm_rank(comm);
  // arity=2 is a binary tree; arity=4 seems to be a good setting (see P2P-MPI)
  proc_tree_t tree = alloc_tree( arity );

  build_tree( rank, comm->size, &tree );
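  /* The barrier is built from the two tree routines above: tree_antibcast() makes
   * every non-root process send a one-byte token to its parent and collect one
   * token per child, then tree_bcast() propagates the token from the root back
   * down the tree. */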
  retval = tree_antibcast( &dummy, 1, MPI_CHAR, 0, comm, tree);
  if (MPI_SUCCESS != retval) {
    printf("[%s:%d] ** Error: tree_antibcast() returned retval=%d\n",__FILE__,__LINE__,retval);

  retval = tree_bcast(&dummy, 1, MPI_CHAR, 0, comm, tree);

/* checked ok with valgrind --leak-check=full */
/**
 * this algorithm performs size steps (1<=s<=size) and
 * at each step s, a process p sends to and receives from a unique distinct remote process
 * size=5 : s=1:  4->0->1, 0->1->2, 1->2->3, ...
 *          s=2:  3->0->2, 4->1->3, 0->2->4, 1->3->0, 2->4->1
 *
 * Openmpi calls this routine when the message size sent to each rank is greater than 3000 bytes
 */
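/* Reading the schedule above: "a->p->b" at step s means that process p receives its
 * block from a = (p-s+size)%size and sends its block to b = (p+s)%size, which is
 * exactly the sendto/recvfrom computation in the loop below. For instance, for
 * size=5, rank 2, step 2: recvfrom = (2-2+5)%5 = 0 and sendto = (2+2)%5 = 4,
 * matching the "0->2->4" entry above. */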
int smpi_coll_tuned_alltoall_pairwise (void *sendbuf, int sendcount, MPI_Datatype datatype,
                                       void* recvbuf, int recvcount, MPI_Datatype recvdatatype, MPI_Comm comm)
  int retval = MPI_SUCCESS;
  int size = comm->size;
  int sendto, recvfrom;
  int tag_alltoall=999;
  void * tmpsend, *tmprecv;

  rank = smpi_mpi_comm_rank(comm);
  INFO1("<%d> algorithm alltoall_pairwise() called.\n",rank);
  /* Perform pairwise exchange - starting from 1 so the local copy is last */
  for (step = 1; step < size+1; step++) {

    /* who do we talk to in this step? */
    sendto   = (rank+step)%size;
    recvfrom = (rank+size-step)%size;

    /* where are we sending from, and where are we receiving into? */
    tmpsend = (char*)sendbuf+sendto*datatype->size*sendcount;
    tmprecv = (char*)recvbuf+recvfrom*recvdatatype->size*recvcount;

    /* send and receive */
    /* in OpenMPI, they use:
       err = ompi_coll_tuned_sendrecv( tmpsend, scount, sdtype, sendto, MCA_COLL_BASE_TAG_ALLTOALL,
                                       tmprecv, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL,
                                       comm, MPI_STATUS_IGNORE, rank); */
    retval = MPI_Sendrecv(tmpsend, sendcount, datatype, sendto, tag_alltoall,
                          tmprecv, recvcount, recvdatatype, recvfrom, tag_alltoall,
                          comm, MPI_STATUS_IGNORE);
/**
 * helper: copy a buffer described by one datatype into another (in the simple case dt1=dt2)
 */
int copy_dt( void *sbuf, int scount, const MPI_Datatype sdtype, void *rbuf, int rcount, const MPI_Datatype rdtype);
int copy_dt( void *sbuf, int scount, const MPI_Datatype sdtype,
             void *rbuf, int rcount, const MPI_Datatype rdtype)
  /* First check if we really have something to do */
    return ((0 == scount) ? MPI_SUCCESS : MPI_ERR_TRUNCATE);

  /* If same datatypes used, just copy. */
  if (sdtype == rdtype) {
    int count = ( scount < rcount ? scount : rcount );
    memcpy( rbuf, sbuf, sdtype->size*count);
    return ((scount > rcount) ? MPI_ERR_TRUNCATE : MPI_SUCCESS);

  /* - if the receive is packed:
   *   to be treated once we have the MPI_Pack things ... */
  return( MPI_SUCCESS );
/**
 * Alltoall basic_linear
 */
int smpi_coll_tuned_alltoall_basic_linear(void *sbuf, int scount, MPI_Datatype sdtype,
                                          void* rbuf, int rcount, MPI_Datatype rdtype, MPI_Comm comm)
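/* Sketch of the linear algorithm as implemented below: each rank first copies its
 * own block locally with copy_dt(), then creates size-1 receive requests and
 * size-1 send requests (the sends in reverse rank order, to ease matching in the
 * message queues), starts all the irecvs and isends, and finally waits for and
 * releases the 2*(size-1) requests. */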
  int system_alltoall_tag = 888;
  int size = comm->size;

  rank = smpi_mpi_comm_rank(comm);
  DEBUG1("<%d> algorithm alltoall_basic_linear() called.",rank);

  err = smpi_mpi_type_get_extent(sdtype, &lb, &sndinc);
  err = smpi_mpi_type_get_extent(rdtype, &lb, &rcvinc);

  /* simple optimization */
  psnd = ((char *) sbuf) + (rank * sndinc);
  prcv = ((char *) rbuf) + (rank * rcvinc);

  err = copy_dt( psnd, scount, sdtype, prcv, rcount, rdtype );
  if (MPI_SUCCESS != err) {

  /* If only one process, we're done. */

  /* Initiate all send/recv to/from others. */
  reqs = xbt_malloc(2*(size-1) * sizeof(smpi_mpi_request_t));

  prcv = (char *) rbuf;
  psnd = (char *) sbuf;

  /* Post all receives first -- a simple optimization */
  for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
    err = smpi_create_request( prcv + (i * rcvinc), rcount, rdtype,
                               comm, &(reqs[nreq]));
    if (MPI_SUCCESS != err) {
      DEBUG2("<%d> failed to create request for rank %d",rank,i);
      for (i=0;i< nreq;i++)
        xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
  /* Now post all sends in reverse order
   * - we would like to minimize the search time through the message queue
   *   when messages actually arrive in the order in which they were posted. */
  for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size ) {
    err = smpi_create_request (psnd + (i * sndinc), scount, sdtype,
                               comm, &(reqs[nreq]));
    if (MPI_SUCCESS != err) {
      DEBUG2("<%d> failed to create request for rank %d\n",rank,i);
      for (i=0;i< nreq;i++)
        xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
  /* Start your engines. This will never return an error. */
  for ( i=0; i< nreq/2; i++ ) {
    DEBUG3("<%d> issued irecv request reqs[%d]=%p",rank,i,reqs[i]);
    smpi_mpi_irecv( reqs[i] );
  for ( i= nreq/2; i<nreq; i++ ) {
    DEBUG3("<%d> issued isend request reqs[%d]=%p",rank,i,reqs[i]);
    smpi_mpi_isend( reqs[i] );
  /* Wait for them all. If there's an error, note that we don't
   * care what the error was -- just that there *was* an error. The
   * PML will finish all requests, even if one or more of them fail.
   * i.e., by the end of this call, all the requests are free-able.
   * So free them anyway -- even if there was an error, and return
   * the error after we free everything. */

  DEBUG2("<%d> wait for %d requests",rank,nreq);
  // waitall is buggy: use a loop instead for the moment
  // err = smpi_mpi_waitall(nreq, reqs, MPI_STATUS_IGNORE);
  for (i=0;i<nreq;i++) {
    err = smpi_mpi_wait( reqs[i], MPI_STATUS_IGNORE);

  assert( nreq == 2*(size-1) );
  for (i=0;i< nreq;i++) {
    xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
/**
 * Openmpi calls this routine when the message size sent to each rank is less than 2000 bytes and the communicator size is less than 12
 */
int smpi_coll_tuned_alltoall_bruck(void *sendbuf, int sendcount, MPI_Datatype sdtype,
                                   void* recvbuf, int recvcount, MPI_Datatype rdtype, MPI_Comm comm)
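/* Outline of the intended Bruck algorithm, reconstructed from the commented-out
 * OpenMPI code below (the routine itself is not implemented yet):
 *   1. local rotation: shift the send buffer up by `rank` blocks into a temporary buffer;
 *   2. log2(size) communication rounds: for each distance d = 1, 2, 4, ..., pack the blocks
 *      whose index has bit d set, send them to (rank+d)%size and receive from (rank-d+size)%size;
 *   3. a final local rotation puts the received blocks back in the right order;
 *   4. clean up the temporary buffers. */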
  /* int size = comm->size;
  int sendto, recvfrom, distance, *displs=NULL, *blen=NULL;
  int maxpacksize, packsize, position;
  char * tmpbuf=NULL, *packbuf=NULL;
  ptrdiff_t lb, sext, rext;

  rank = smpi_mpi_comm_rank(comm);

  INFO0("coll:tuned:alltoall_intra_bruck ** NOT IMPLEMENTED YET**");

  displs = xbt_malloc(size*sizeof(int));
  blen = xbt_malloc(size*sizeof(int));

  /* Prepare for packing data */
  err = MPI_Pack_size( scount*size, sdtype, comm, &maxpacksize );
  if (err != MPI_SUCCESS) { }

  /* pack buffer allocation */
  /* packbuf = (char*) malloc((unsigned) maxpacksize);
  if (packbuf == NULL) { }

  /* tmp buffer allocation for message data */
  /* tmpbuf = xbt_malloc(scount*size*sext);
  if (tmpbuf == NULL) { }
  /* Step 1 - local rotation - shift up by rank */
  /* err = ompi_ddt_copy_content_same_ddt (sdtype, (int32_t) ((size-rank)*scount),
                                           tmpbuf, ((char*)sbuf)+rank*scount*sext);
     line = __LINE__; err = -1; goto err_hndl;

     err = ompi_ddt_copy_content_same_ddt (sdtype, (int32_t) (rank*scount),
                                           tmpbuf+(size-rank)*scount*sext, (char*)sbuf);
     line = __LINE__; err = -1; goto err_hndl;

  /* perform communication step */
  /* for (distance = 1; distance < size; distance<<=1) {

     /* send data to "sendto" */
  /* sendto = (rank+distance)%size;
     recvfrom = (rank-distance+size)%size;

     /* create indexed datatype */
  // for (i = 1; i < size; i++) {
  //   if ((i&distance) == distance) {
  //     displs[k] = i*scount; blen[k] = scount;

  /* Set indexes and displacements */
  // err = MPI_Type_indexed(k, blen, displs, sdtype, &iddt);
  // if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
  // /* Commit the new datatype */
  // err = MPI_Type_commit(&iddt);
  // if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

  /* have the new distribution ddt, pack and exchange data */
  // err = MPI_Pack(tmpbuf, 1, iddt, packbuf, maxpacksize, &packsize, comm);
  // if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

  // err = ompi_coll_tuned_sendrecv ( packbuf, packsize, MPI_PACKED, sendto,
  //                                  MCA_COLL_BASE_TAG_ALLTOALL,
  //                                  rbuf, packsize, MPI_PACKED, recvfrom,
  //                                  MCA_COLL_BASE_TAG_ALLTOALL,
  //                                  comm, MPI_STATUS_IGNORE, rank);
  // if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

  /* Unpack data from rbuf to tmpbuf */
  // err = MPI_Unpack(rbuf, packsize, &position,
  //                  tmpbuf, 1, iddt, comm);
  // if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

  // err = MPI_Type_free(&iddt);
  // if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
  // } /* end of for (distance = 1... */
  /* Step 3 - local rotation - */
  // for (i = 0; i < size; i++) {
  //   err = ompi_ddt_copy_content_same_ddt (rdtype, (int32_t) rcount,
  //                                         ((char*)rbuf)+(((rank-i+size)%size)*rcount*rext),
  //                                         tmpbuf+i*rcount*rext);
  //   line = __LINE__; err = -1; goto err_hndl;

  /* Step 4 - clean up */
  /* if (tmpbuf != NULL) free(tmpbuf);
     if (packbuf != NULL) free(packbuf);
     if (displs != NULL) free(displs);
     if (blen != NULL) free(blen);

     OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
     if (tmpbuf != NULL) free(tmpbuf);
     if (packbuf != NULL) free(packbuf);
     if (displs != NULL) free(displs);
     if (blen != NULL) free(blen);

  return -1; /* FIXME: to be changed*/
static void print_buffer_int(void *buf, int len, char *msg, int rank);
static void print_buffer_int(void *buf, int len, char *msg, int rank)

  fprintf(stderr,"**<%d> %s (#%d): ", rank, msg,len);
  for (tmp = 0; tmp < len; tmp++) {
    fprintf(stderr,"[%d (%p)]", v[tmp],v+tmp);
  fprintf(stderr,"\n");
int smpi_coll_basic_alltoallv(void *sbuf, int *scounts, int *sdisps, MPI_Datatype sdtype,
                              void *rbuf, int *rcounts, int *rdisps, MPI_Datatype rdtype,

  int system_alltoallv_tag = 889;
  int size = comm->size;

  rank = smpi_mpi_comm_rank(comm);
  DEBUG1("<%d> algorithm basic_alltoallv() called.",rank);

  err = smpi_mpi_type_get_extent(sdtype, &lb, &sndextent);
  err = smpi_mpi_type_get_extent(rdtype, &lb, &rcvextent);
  //print_buffer_int(psnd,size*size,xbt_strdup("sbuff"),rank);

  /* copy the local block of sbuf into rbuf directly (when i == rank) */
  psnd = ((char *) sbuf) + (sdisps[rank] * sndextent);
  prcv = ((char *) rbuf) + (rdisps[rank] * rcvextent);

  if (0 != scounts[rank]) {
    err = copy_dt( psnd, scounts[rank], sdtype, prcv, rcounts[rank], rdtype );
    if (MPI_SUCCESS != err) {

  /* If only one process, we're done. */
  /* Initiate all send/recv to/from others. */
  reqs = xbt_malloc(2*(size-1) * sizeof(smpi_mpi_request_t));

  /* Create all receives that will be posted first */
  for (i = 0; i < size; ++i) {
    if (i == rank || 0 == rcounts[i]) {
      DEBUG3("<%d> skip req creation i=%d,rcounts[i]=%d",rank,i, rcounts[i]);

    prcv = ((char *) rbuf) + (rdisps[i] * rcvextent);

    err = smpi_create_request( prcv, rcounts[i], rdtype,
                               system_alltoallv_tag,
                               comm, &(reqs[rreq]));
    if (MPI_SUCCESS != err) {
      DEBUG2("<%d> failed to create request for rank %d",rank,i);
      for (i=0;i< rreq;i++)
        xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);

  DEBUG2("<%d> %d irecv reqs created",rank,rreq);
  /* Now create all sends */
  for (i = 0; i < size; ++i) {
    if (i == rank || 0 == scounts[i]) {
      DEBUG3("<%d> skip req creation i=%d,scounts[i]=%d",rank,i, scounts[i]);

    psnd = ((char *) sbuf) + (sdisps[i] * sndextent);

    //fprintf(stderr,"<%d> send %d elems to <%d>\n",rank,scounts[i],i);
    //print_buffer_int(psnd,scounts[i],xbt_strdup("sbuff part"),rank);
    err = smpi_create_request (psnd, scounts[i], sdtype,
                               system_alltoallv_tag,
                               comm, &(reqs[rreq+sreq]));
    if (MPI_SUCCESS != err) {
      DEBUG2("<%d> failed to create request for rank %d\n",rank,i);
      for (i=0;i< rreq+sreq;i++)
        xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);

  DEBUG2("<%d> %d isend reqs created",rank,sreq);
  /* Start your engines. This will never return an error. */
  for ( i=0; i< rreq; i++ ) {
    DEBUG3("<%d> issued irecv request reqs[%d]=%p",rank,i,reqs[i]);
    smpi_mpi_irecv( reqs[i] );

  DEBUG3("<%d> for (i=%d;i<%d)",rank,rreq,rreq+sreq);
  for ( i=rreq; i< rreq+sreq; i++ ) {
    DEBUG3("<%d> issued isend request reqs[%d]=%p",rank,i,reqs[i]);
    smpi_mpi_isend( reqs[i] );
  /* Wait for them all. If there's an error, note that we don't
   * care what the error was -- just that there *was* an error. The
   * PML will finish all requests, even if one or more of them fail.
   * i.e., by the end of this call, all the requests are free-able.
   * So free them anyway -- even if there was an error, and return
   * the error after we free everything. */

  DEBUG2("<%d> wait for %d requests",rank,rreq+sreq);
  // waitall is buggy: use a loop instead for the moment
  // err = smpi_mpi_waitall(nreq, reqs, MPI_STATUS_IGNORE);
  for (i=0;i< rreq+sreq;i++) {
    err = smpi_mpi_wait( reqs[i], MPI_STATUS_IGNORE);

  /* rreq+sreq might be < 2*(size-1) since some request creations are skipped */
  for (i=0;i< rreq+sreq;i++) {
    xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
/**
 * -----------------------------------------------------------------------------------------------------
 * -----------------------------------------------------------------------------------------------------
 */
  for (rank=0;rank<size;rank++) {
    printf("--------------tree for rank %d ----------\n",rank);
    tree = alloc_tree( 2 );
    build_tree( rank, size, &tree );

  printf("-------------- bcast ----------\n");
  for (rank=0;rank<size;rank++) {