do not call smpi_bench_* from SMPI_MPI_Allreduce since it calls SMPI_MPI_* function...
[simgrid.git] / src / smpi / smpi_coll.c
/* $Id$tag */

/* smpi_coll.c -- various optimized routines for collectives                  */

/* Copyright (c) 2009 Stephane Genaud.                                        */
/* All rights reserved.                                                       */

/* This program is free software; you can redistribute it and/or modify it
 * under the terms of the license (GNU LGPL) which comes with this package. */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "private.h"
#include "smpi_coll_private.h"


/* proc_tree taken and translated from P2P-MPI */

struct proc_tree {
        int PROCTREE_A;        /* arity of the tree */
        int numChildren;       /* number of children actually attached to this node */
        int * child;           /* ranks of the children (array of size PROCTREE_A) */
        int parent;            /* rank of the parent (-1 for the root) */
        int me;                /* rank of the calling process */
        int root;              /* rank of the tree root (0 after build_tree) */
        int isRoot;            /* 1 if this process is the root, 0 otherwise */
};
typedef struct proc_tree * proc_tree_t;


/* prototypes */
void build_tree( int index, int extent, proc_tree_t *tree);
proc_tree_t alloc_tree( int arity );
void free_tree( proc_tree_t tree);
void print_tree(proc_tree_t tree);


/**
 * alloc and init
 **/
proc_tree_t alloc_tree( int arity ) {
        proc_tree_t tree = malloc(1*sizeof(struct proc_tree));
        int i;

        tree->PROCTREE_A = arity;
        tree->isRoot = 0;
        tree->numChildren = 0;
        tree->child = malloc(arity*sizeof(int));
        for (i=0; i < arity; i++) {
                tree->child[i] = -1;
        }
        tree->root = -1;
        tree->parent = -1;
        return( tree );
}

/**
 * free
 **/
void free_tree( proc_tree_t tree) {
        free (tree->child );
        free(tree);
}


/**
 * Build the tree depending on a process rank (index) and the group size (extent)
 * @param index the rank of the calling process
 * @param extent the total number of processes
 **/
void build_tree( int index, int extent, proc_tree_t *tree) {
        int places = (*tree)->PROCTREE_A * index;
        int i;
        int ch;
        int pr;

        (*tree)->me = index;
        (*tree)->root = 0 ;

        for (i = 1; i <= (*tree)->PROCTREE_A; i++) {
                ++places;
                ch = (*tree)->PROCTREE_A * index + i + (*tree)->root;
                //printf("places %d\n",places);
                ch %= extent;
                if (places < extent) {
                         //printf("ch <%d> = <%d>\n",i,ch);
                         //printf("adding to the tree at index <%d>\n\n",i-1);
                        (*tree)->child[i - 1] = ch;
                        (*tree)->numChildren++;
                }
                else {
                       //fprintf(stderr,"not adding to the tree\n");
                }
        }
        //fprintf(stderr,"procTree.numChildren <%d>\n",(*tree)->numChildren);

        if (index == (*tree)->root) {
                (*tree)->isRoot = 1;
        }
        else {
                (*tree)->isRoot = 0;
                pr = (index - 1) / (*tree)->PROCTREE_A;
                (*tree)->parent = pr;
        }
}
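
/* Illustrative example: with arity 2 and 7 processes, build_tree() lays the
 * ranks out as a binary heap rooted at 0: rank r gets the children 2*r+1 and
 * 2*r+2 (when they are below the group size) and the parent (r-1)/2.
 * Rank 1, for instance, ends up with parent 0, children 3 and 4, and
 * numChildren = 2. */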

/**
 * prints the tree as a graphical representation
 **/
void print_tree(proc_tree_t tree) {
      int i;
      char *spacer;
      if (-1 != tree->parent ) {
            printf("[%d]\n   +---",tree->parent);
            spacer= strdup("     ");
      }
      else {
            spacer=strdup("");
      }
      printf("<%d>\n",tree->me);
      for (i=0;i < tree->numChildren; i++) {
              printf("%s   +--- %d\n", spacer,tree->child[i]);
      }
      free(spacer);
}
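
/* Sample output (illustrative): for the rank-1 node of the arity-2, 7-process
 * tree described above (parent 0, children 3 and 4), print_tree() prints:
 *
 *   [0]
 *      +---<1>
 *           +--- 3
 *           +--- 4
 */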

/**
 * bcast
 **/
int tree_bcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm, proc_tree_t tree);
int tree_bcast( void *buf, int count, MPI_Datatype datatype, int root,
                MPI_Comm comm, proc_tree_t tree)
{
        int system_tag = 999;  // a negative int was used before, but smpi_create_request() declares that illegal (to be checked)
        int rank;
        int retval = MPI_SUCCESS;
        int i;
        smpi_mpi_request_t request;
        smpi_mpi_request_t * requests;

        rank = smpi_mpi_comm_rank(comm);

        /* wait for data from my parent in the tree */
        if (!tree->isRoot) {
#ifdef DEBUG_STEPH
                printf("[%d] recv(%d  from %d, tag=%d)\n",rank,rank, tree->parent, system_tag+rank);
#endif
                retval = smpi_create_request(buf, count, datatype,
                                tree->parent, rank,
                                system_tag + rank,
                                comm, &request);
                if (MPI_SUCCESS != retval) {
                        printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
                                        rank,retval,__FILE__,__LINE__);
                }
                smpi_mpi_irecv(request);
#ifdef DEBUG_STEPH
                printf("[%d] waiting on irecv from %d\n",rank , tree->parent);
#endif
                smpi_mpi_wait(request, MPI_STATUS_IGNORE);
                xbt_mallocator_release(smpi_global->request_mallocator, request);
        }

        requests = xbt_malloc( tree->numChildren * sizeof(smpi_mpi_request_t));
#ifdef DEBUG_STEPH
        printf("[%d] creates %d requests\n",rank,tree->numChildren);
#endif

        /* initiate sends to the children lower in the tree */
        for (i=0; i < tree->numChildren; i++) {
                if (tree->child[i] != -1) {
#ifdef DEBUG_STEPH
                        printf("[%d] send(%d->%d, tag=%d)\n",rank,rank, tree->child[i], system_tag+tree->child[i]);
#endif
                        retval = smpi_create_request(buf, count, datatype,
                                        rank, tree->child[i],
                                        system_tag + tree->child[i],
                                        comm, &(requests[i]));
#ifdef DEBUG_STEPH
                        printf("[%d] after create req[%d]=%p req->(src=%d,dst=%d)\n",rank , i, requests[i],requests[i]->src,requests[i]->dst );
#endif
                        if (MPI_SUCCESS != retval) {
                              printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
                                              rank,retval,__FILE__,__LINE__);
                        }
                        smpi_mpi_isend(requests[i]);
                        /* FIXME : we should not wait immediately here. See next FIXME. */
                        smpi_mpi_wait( requests[i], MPI_STATUS_IGNORE);
                        xbt_mallocator_release(smpi_global->request_mallocator, requests[i]);
                }
        }
        /* FIXME : normally, we should wait only once all isends have been issued:
         * this is the commented code below. It deadlocks, probably because
         * of a bug in the sender process */

        /* wait for completion of sends */
        /* printf("[%d] wait for %d send completions\n",rank,tree->numChildren);
          smpi_mpi_waitall( tree->numChildren, requests, MPI_STATUS_IGNORE);
         printf("[%d] reqs completed\n)",rank);
           */

        xbt_free(requests);
        return(retval);
        /* checked ok with valgrind --leak-check=full*/
}


/**
 * anti-bcast
 **/
int tree_antibcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm, proc_tree_t tree);
int tree_antibcast( void *buf, int count, MPI_Datatype datatype, int root,
                MPI_Comm comm, proc_tree_t tree)
{
        int system_tag = 999;  // a negative int was used before, but smpi_create_request() declares that illegal (to be checked)
        int rank;
        int retval = MPI_SUCCESS;
        int i;
        smpi_mpi_request_t request;
        smpi_mpi_request_t * requests;

        rank = smpi_mpi_comm_rank(comm);

        //------------------anti-bcast-------------------

        // everyone sends to its parent, except the root.
        if (!tree->isRoot) {
                retval = smpi_create_request(buf, count, datatype,
                                rank,tree->parent,
                                system_tag + rank,
                                comm, &request);
                if (MPI_SUCCESS != retval) {
                        printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
                                        rank,retval,__FILE__,__LINE__);
                }
                smpi_mpi_isend(request);
                smpi_mpi_wait(request, MPI_STATUS_IGNORE);
                xbt_mallocator_release(smpi_global->request_mallocator, request);
        }

        // everyone receives as many messages as it has children
        requests = xbt_malloc( tree->numChildren * sizeof(smpi_mpi_request_t));
        for (i=0; i < tree->numChildren; i++) {
                if (tree->child[i] != -1) {
                        retval = smpi_create_request(buf, count, datatype,
                                        tree->child[i], rank,
                                        system_tag + tree->child[i],
                                        comm, &(requests[i]));
                        if (MPI_SUCCESS != retval) {
                                printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
                                                rank,retval,__FILE__,__LINE__);
                        }
                        smpi_mpi_irecv(requests[i]);
                        /* FIXME : we should not wait immediately here. See next FIXME. */
                        smpi_mpi_wait( requests[i], MPI_STATUS_IGNORE);
                        xbt_mallocator_release(smpi_global->request_mallocator, requests[i]);
                }
        }
        xbt_free(requests);
        return(retval);

        /* checked ok with valgrind --leak-check=full*/
}

/**
 * bcast with a binary, ternary, or whatever tree ..
 **/
int nary_tree_bcast(void *buf, int count, MPI_Datatype datatype, int root,
                MPI_Comm comm, int arity)
{
        int rank;
        int retval;

        rank = smpi_mpi_comm_rank( comm );
        // arity=2 gives a binary tree; arity=4 seems to be a good setting (see P2P-MPI)
        proc_tree_t tree = alloc_tree( arity );
        build_tree( rank, comm->size, &tree );

        retval = tree_bcast( buf, count, datatype, root, comm, tree );

        free_tree( tree );
        return( retval );
}
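
/* Minimal usage sketch (the wrapper name example_bcast is hypothetical): a
 * blocking broadcast collective only has to pick an arity and delegate to
 * nary_tree_bcast(), e.g.
 *
 *   int example_bcast(void *buf, int count, MPI_Datatype datatype,
 *                     int root, MPI_Comm comm) {
 *           return nary_tree_bcast(buf, count, datatype, root, comm, 4);
 *   }
 */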


/**
 * Barrier
 **/

int nary_tree_barrier( MPI_Comm comm , int arity)
{
        int rank;
        int retval = MPI_SUCCESS;
        char dummy='$';

        rank = smpi_mpi_comm_rank(comm);
        // arity=2 gives a binary tree; arity=4 seems to be a good setting (see P2P-MPI)
        proc_tree_t tree = alloc_tree( arity );

        build_tree( rank, comm->size, &tree );

        retval = tree_antibcast( &dummy, 1, MPI_CHAR, 0, comm, tree);
        if (MPI_SUCCESS != retval) {
                printf("[%s:%d] ** Error: tree_antibcast() returned retval=%d\n",__FILE__,__LINE__,retval);
        }
        else {
            retval = tree_bcast(&dummy,  1, MPI_CHAR, 0, comm, tree);
        }

        free_tree( tree );
        return(retval);

        /* checked ok with valgrind --leak-check=full*/
}
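
/* Note (illustrative): the barrier above is an up-sweep followed by a
 * down-sweep over the same n-ary tree: tree_antibcast() sends a one-byte
 * dummy payload from the children towards the root, and tree_bcast() then
 * pushes it back down, so each rank is released only after the exchange with
 * its tree neighbours has completed. */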


int smpi_coll_tuned_alltoall_pairwise (void *sendbuf, int sendcount, MPI_Datatype datatype,
                    void* recvbuf, int recvcount, MPI_Datatype recvdatatype, MPI_Comm comm)
{
          int retval = MPI_SUCCESS;
          int rank;
          int size = comm->size;
          int step;
          int sendto, recvfrom;
          int tag_alltoall=999;
          void * tmpsend, *tmprecv;

          rank = smpi_mpi_comm_rank(comm);
          /* Perform pairwise exchange - starting from 1 so the local copy is last */
          for (step = 1; step < size+1; step++) {

                    /* who do we talk to in this step? */
                    sendto  = (rank+step)%size;
                    recvfrom = (rank+size-step)%size;

                    /* where in sendbuf do we send from, and where in recvbuf do we receive into? */
                    tmpsend = (char*)sendbuf+sendto*datatype->size*sendcount;
                    tmprecv = (char*)recvbuf+recvfrom*recvdatatype->size*recvcount;

                    /* send and receive */
                    /* in OpenMPI, they use :
                         err = ompi_coll_tuned_sendrecv( tmpsend, scount, sdtype, sendto, MCA_COLL_BASE_TAG_ALLTOALL,
                         tmprecv, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL,
                         comm, MPI_STATUS_IGNORE, rank);
                     */
                    retval = MPI_Sendrecv(tmpsend, sendcount, datatype, sendto, tag_alltoall,
                                                tmprecv, recvcount, recvdatatype, recvfrom, tag_alltoall,
                                                  comm, MPI_STATUS_IGNORE);
          }
          return(retval);
}
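
/* Illustrative trace: with 4 processes, rank 0 walks through the schedule
 *   step 1: send to 1, receive from 3
 *   step 2: send to 2, receive from 2
 *   step 3: send to 3, receive from 1
 *   step 4: send to 0, receive from 0   (the local copy, done last)
 * In every step each rank's send is matched by the symmetric receive on the
 * peer, so all pairs exchange without any rank sitting idle. */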

int smpi_coll_tuned_alltoall_bruck(void *sendbuf, int sendcount, MPI_Datatype sdtype,
                    void* recvbuf, int recvcount, MPI_Datatype rdtype, MPI_Comm comm)
{
          /*
          int i, k, line = -1;
          int rank, size;
          int sendto, recvfrom, distance, *displs=NULL, *blen=NULL;
          int maxpacksize, packsize, position;
          char * tmpbuf=NULL, *packbuf=NULL;
          ptrdiff_t lb, sext, rext;
          int err = 0;
          int weallocated = 0;
          MPI_Datatype iddt;

          size = ompi_comm_size(comm);
          rank = ompi_comm_rank(comm);

          OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:alltoall_intra_bruck rank %d", rank));

          err = ompi_ddt_get_extent (sdtype, &lb, &sext);
          if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

          err = ompi_ddt_get_extent (rdtype, &lb, &rext);
          if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }


          displs = (int *) malloc(size*sizeof(int));
          if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; }
          blen = (int *) malloc(size*sizeof(int));
          if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; }
          weallocated = 1;
*/
          /* Prepare for packing data */
          /*err = MPI_Pack_size( scount*size, sdtype, comm, &maxpacksize );
          if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
*/
          /* pack buffer allocation */
/*        packbuf = (char*) malloc((unsigned) maxpacksize);
          if (packbuf == NULL) { line = __LINE__; err = -1; goto err_hndl; }
*/
          /* tmp buffer allocation for message data */
/*        tmpbuf = (char *) malloc(scount*size*sext);
          if (tmpbuf == NULL) { line = __LINE__; err = -1; goto err_hndl; }
*/

          /* Step 1 - local rotation - shift up by rank */
/*        err = ompi_ddt_copy_content_same_ddt (sdtype, (int32_t) ((size-rank)*scount),
                                tmpbuf, ((char*)sbuf)+rank*scount*sext);
          if (err<0) {
                    line = __LINE__; err = -1; goto err_hndl;
          }

          if (rank != 0) {
                    err = ompi_ddt_copy_content_same_ddt (sdtype, (int32_t) (rank*scount),
                                          tmpbuf+(size-rank)*scount*sext, (char*)sbuf);
                    if (err<0) {
                                line = __LINE__; err = -1; goto err_hndl;
                    }
          }
*/
          /* perform communication step */
/*        for (distance = 1; distance < size; distance<<=1) {
*/
                    /* send data to "sendto" */
/*                  sendto = (rank+distance)%size;
                    recvfrom = (rank-distance+size)%size;
                    packsize = 0;
                    k = 0;
*/
                    /* create indexed datatype */
//                  for (i = 1; i < size; i++) {
//                              if ((i&distance) == distance) {
//                                        displs[k] = i*scount; blen[k] = scount;
//                                        k++;
//                              }
//                  }
                    /* Set indexes and displacements */
//                  err = MPI_Type_indexed(k, blen, displs, sdtype, &iddt);
//                  if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
//                  /* Commit the new datatype */
///                 err = MPI_Type_commit(&iddt);
//                  if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }

                    /* have the new distribution ddt, pack and exchange data */
//                  err = MPI_Pack(tmpbuf, 1, iddt, packbuf, maxpacksize, &packsize, comm);
//                  if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }

                    /* Sendreceive */
//                  err = ompi_coll_tuned_sendrecv ( packbuf, packsize, MPI_PACKED, sendto,
//                                        MCA_COLL_BASE_TAG_ALLTOALL,
//                                        rbuf, packsize, MPI_PACKED, recvfrom,
//                                        MCA_COLL_BASE_TAG_ALLTOALL,
//                                        comm, MPI_STATUS_IGNORE, rank);
//                  if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

                    /* Unpack data from rbuf to tmpbuf */
//                  position = 0;
//          err = MPI_Unpack(rbuf, packsize, &position,
//                                        tmpbuf, 1, iddt, comm);
//                  if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }

                    /* free ddt */
//                  err = MPI_Type_free(&iddt);
//                  if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
//        } /* end of for (distance = 1... */

          /* Step 3 - local rotation - */
//        for (i = 0; i < size; i++) {

//                  err = ompi_ddt_copy_content_same_ddt (rdtype, (int32_t) rcount,
//                                        ((char*)rbuf)+(((rank-i+size)%size)*rcount*rext),
//                                        tmpbuf+i*rcount*rext);
//
//        if (err<0) {
//                              line = __LINE__; err = -1; goto err_hndl;
//                  }
//        }

          /* Step 4 - clean up */
/*        if (tmpbuf != NULL) free(tmpbuf);
          if (packbuf != NULL) free(packbuf);
          if (weallocated) {
                    if (displs != NULL) free(displs);
                    if (blen != NULL) free(blen);
          }
          return OMPI_SUCCESS;

err_hndl:
          OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
          if (tmpbuf != NULL) free(tmpbuf);
          if (packbuf != NULL) free(packbuf);
          if (weallocated) {
                    if (displs != NULL) free(displs);
                    if (blen != NULL) free(blen);
          }
          return err;
          */
          int NOTYET=1;
          return NOTYET;
}


/**
 * -----------------------------------------------------------------------------------------------------
 * example usage
 * -----------------------------------------------------------------------------------------------------
 **/
/*
 * int main() {

      int rank;
      int size=12;

      proc_tree_t tree;
      for (rank=0;rank<size;rank++) {
            printf("--------------tree for rank %d ----------\n",rank);
            tree = alloc_tree( 2 );
            build_tree( rank, size, &tree );
            print_tree( tree );
            free_tree( tree );

      }
      printf("-------------- bcast ----------\n");
      for (rank=0;rank<size;rank++) {
              bcast( rank, size );
      }


}
*/