Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
57a02e3f6b32e8a963a699671fed3beba2d75157
[simgrid.git] / src / smpi / smpi_coll.c
1 /* $Id$tag */
2
3 /* smpi_coll.c -- various optimized routing for collectives                   */
4
5 /* Copyright (c) 2009 Stephane Genaud.                                        */
6 /* All rights reserved.                                                       */
7
8 /* This program is free software; you can redistribute it and/or modify it
9  *  * under the terms of the license (GNU LGPL) which comes with this package. */
10
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15
16 #include "private.h"
17 #include "smpi_coll_private.h"
18
19
20 /* proc_tree taken and translated from P2P-MPI */
21
22 struct proc_tree {
23         int PROCTREE_A;
24         int numChildren;
25         int * child;
26         int parent;
27         int me;
28         int root;
29         int isRoot;
30 };
31 typedef struct proc_tree * proc_tree_t;
32
33
34
35 /* prototypes */
36 void build_tree( int index, int extent, proc_tree_t *tree);
37 proc_tree_t alloc_tree( int arity );
38 void free_tree( proc_tree_t tree);
39 void print_tree(proc_tree_t tree);
40
41
42
43
44 /**
45  * alloc and init
46  **/
47 proc_tree_t alloc_tree( int arity ) {
48         proc_tree_t tree = malloc(1*sizeof(struct proc_tree));
49         int i;
50
51         tree->PROCTREE_A = arity;
52         tree->isRoot = 0; 
53         tree->numChildren = 0;
54         tree->child = malloc(arity*sizeof(int));
55         for (i=0; i < arity; i++) {
56                 tree->child[i] = -1;
57         }
58         tree->root = -1;
59         tree->parent = -1;
60         return( tree );
61 }
62
63 /**
64  * free
65  **/
66 void free_tree( proc_tree_t tree) {
67         free (tree->child );
68         free(tree);
69 }
70
71
72
73 /**
74  * Build the tree depending on a process rank (index) and the group size (extent)
75  * @param index the rank of the calling process
76  * @param extent the total number of processes
77  **/
78 void build_tree( int index, int extent, proc_tree_t *tree) {
79         int places = (*tree)->PROCTREE_A * index;
80         int i;
81         int ch;
82         int pr;
83
84         (*tree)->me = index;
85         (*tree)->root = 0 ;
86
87         for (i = 1; i <= (*tree)->PROCTREE_A; i++) {
88                 ++places;
89                 ch = (*tree)->PROCTREE_A * index + i + (*tree)->root;
90                 //printf("places %d\n",places);
91                 ch %= extent;
92                 if (places < extent) {
93                          //printf("ch <%d> = <%d>\n",i,ch);
94                          //printf("adding to the tree at index <%d>\n\n",i-1);
95                         (*tree)->child[i - 1] = ch;
96                         (*tree)->numChildren++;
97                 }
98                 else {
99                        //fprintf(stderr,"not adding to the tree\n");
100                 }
101         }
102         //fprintf(stderr,"procTree.numChildren <%d>\n",(*tree)->numChildren);
103
104         if (index == (*tree)->root) {
105                 (*tree)->isRoot = 1;
106         }
107         else {
108                 (*tree)->isRoot = 0;
109                 pr = (index - 1) / (*tree)->PROCTREE_A;
110                 (*tree)->parent = pr;
111         }
112 }
113
114 /**
115  * prints the tree as a graphical representation
116  **/
117 void print_tree(proc_tree_t tree) {
118       int i;
119       char *spacer;
120       if (-1 != tree->parent ) {
121             printf("[%d]\n   +---",tree->parent);
122             spacer= strdup("     ");
123       }
124       else {
125             spacer=strdup("");
126       }
127       printf("<%d>\n",tree->me);
128       for (i=0;i < tree->numChildren; i++) {
129               printf("%s   +--- %d\n", spacer,tree->child[i]);
130       }
131       free(spacer);
132 }
133       
134 /**
135  * bcast
136  **/
137 int tree_bcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm, proc_tree_t tree);
138 int tree_bcast( void *buf, int count, MPI_Datatype datatype, int root,
139                 MPI_Comm comm, proc_tree_t tree) 
140 {
141         int system_tag = 999;  // used negative int but smpi_create_request() declares this illegal (to be checked)
142         int rank;
143         int retval = MPI_SUCCESS;
144         int i;
145         smpi_mpi_request_t request;
146         smpi_mpi_request_t * requests;
147
148         rank = smpi_mpi_comm_rank(comm);
149
150         /* wait for data from my parent in the tree */
151         if (!tree->isRoot) {
152 #ifdef DEBUG_STEPH
153                 printf("[%d] recv(%d  from %d, tag=%d)\n",rank,rank, tree->parent, system_tag+rank);
154 #endif
155                 retval = smpi_create_request(buf, count, datatype, 
156                                 tree->parent, rank, 
157                                 system_tag + rank, 
158                                 comm, &request);
159                 if (MPI_SUCCESS != retval) {
160                         printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
161                                         rank,retval,__FILE__,__LINE__);
162                 }
163                 smpi_mpi_irecv(request);
164 #ifdef DEBUG_STEPH
165                 printf("[%d] waiting on irecv from %d\n",rank , tree->parent);
166 #endif
167                 smpi_mpi_wait(request, MPI_STATUS_IGNORE);
168                 xbt_mallocator_release(smpi_global->request_mallocator, request);
169         }
170
171         requests = xbt_malloc( tree->numChildren * sizeof(smpi_mpi_request_t));
172 #ifdef DEBUG_STEPH
173         printf("[%d] creates %d requests\n",rank,tree->numChildren);
174 #endif
175
176         /* iniates sends to ranks lower in the tree */
177         for (i=0; i < tree->numChildren; i++) {
178                 if (tree->child[i] != -1) {
179 #ifdef DEBUG_STEPH
180                         printf("[%d] send(%d->%d, tag=%d)\n",rank,rank, tree->child[i], system_tag+tree->child[i]);
181 #endif
182                         retval = smpi_create_request(buf, count, datatype, 
183                                         rank, tree->child[i], 
184                                         system_tag + tree->child[i], 
185                                         comm, &(requests[i]));
186 #ifdef DEBUG_STEPH
187                         printf("[%d] after create req[%d]=%p req->(src=%d,dst=%d)\n",rank , i, requests[i],requests[i]->src,requests[i]->dst );
188 #endif
189                         if (MPI_SUCCESS != retval) {
190                               printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
191                                               rank,retval,__FILE__,__LINE__);
192                         }
193                         smpi_mpi_isend(requests[i]);
194                         /* FIXME : we should not wait immediately here. See next FIXME. */
195                         smpi_mpi_wait( requests[i], MPI_STATUS_IGNORE);
196                         xbt_mallocator_release(smpi_global->request_mallocator, requests[i]);
197                 }
198         }
199         /* FIXME : normally, we sould wait only once all isend have been issued:
200          * this is the following commented code. It deadlocks, probably because
201          * of a bug in the sender process */
202         
203         /* wait for completion of sends */
204         /* printf("[%d] wait for %d send completions\n",rank,tree->numChildren);
205           smpi_mpi_waitall( tree->numChildren, requests, MPI_STATUS_IGNORE);
206          printf("[%d] reqs completed\n)",rank);
207            */
208         
209         xbt_free(requests);
210         return(retval);
211         /* checked ok with valgrind --leak-check=full*/
212 }
213
214
215 /**
216  * anti-bcast
217  **/
218 int tree_antibcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm, proc_tree_t tree);
219 int tree_antibcast( void *buf, int count, MPI_Datatype datatype, int root,
220                 MPI_Comm comm, proc_tree_t tree) 
221 {
222         int system_tag = 999;  // used negative int but smpi_create_request() declares this illegal (to be checked)
223         int rank;
224         int retval = MPI_SUCCESS;
225         int i;
226         smpi_mpi_request_t request;
227         smpi_mpi_request_t * requests;
228
229         rank = smpi_mpi_comm_rank(comm);
230
231           //------------------anti-bcast-------------------
232         
233         // everyone sends to its parent, except root.
234         if (!tree->isRoot) {
235                 retval = smpi_create_request(buf, count, datatype,
236                                 rank,tree->parent,  
237                                 system_tag + rank, 
238                                 comm, &request);
239                 if (MPI_SUCCESS != retval) {
240                         printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
241                                         rank,retval,__FILE__,__LINE__);
242                 }
243                 smpi_mpi_isend(request);
244                 smpi_mpi_wait(request, MPI_STATUS_IGNORE);
245                 xbt_mallocator_release(smpi_global->request_mallocator, request);
246         }
247
248         //every one receives as many messages as it has children
249         requests = xbt_malloc( tree->numChildren * sizeof(smpi_mpi_request_t));
250         for (i=0; i < tree->numChildren; i++) {
251                 if (tree->child[i] != -1) {
252                         retval = smpi_create_request(buf, count, datatype, 
253                                         tree->child[i], rank, 
254                                         system_tag + tree->child[i], 
255                                         comm, &(requests[i]));
256                         if (MPI_SUCCESS != retval) {
257                                 printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
258                                                 rank,retval,__FILE__,__LINE__);
259                         }
260                         smpi_mpi_irecv(requests[i]);
261                         /* FIXME : we should not wait immediately here. See next FIXME. */
262                         smpi_mpi_wait( requests[i], MPI_STATUS_IGNORE);
263                         xbt_mallocator_release(smpi_global->request_mallocator, requests[i]);
264                 }
265         }
266         xbt_free(requests);
267         return(retval);
268
269         /* checked ok with valgrind --leak-check=full*/
270
271
272 /**
273  * bcast with a binary, ternary, or whatever tree .. 
274  **/
275 int nary_tree_bcast(void *buf, int count, MPI_Datatype datatype, int root,
276                 MPI_Comm comm, int arity)
277 {
278 int rank;
279 int retval;
280
281         rank = smpi_mpi_comm_rank( comm );
282         // arity=2: a binary tree, arity=4 seem to be a good setting (see P2P-MPI))
283         proc_tree_t tree = alloc_tree( arity ); 
284         build_tree( rank, comm->size, &tree );
285
286         retval = tree_bcast( buf, count, datatype, root, comm, tree );
287
288         free_tree( tree );
289         return( retval );
290 }
291
292
293 /**
294  * Barrier
295  **/
296
297 int nary_tree_barrier( MPI_Comm comm , int arity)
298 {
299         int rank;
300         int retval = MPI_SUCCESS;
301         char dummy='$';
302
303         rank = smpi_mpi_comm_rank(comm);
304         // arity=2: a binary tree, arity=4 seem to be a good setting (see P2P-MPI))
305         proc_tree_t tree = alloc_tree( arity ); 
306
307         build_tree( rank, comm->size, &tree );
308
309         retval = tree_antibcast( &dummy, 1, MPI_CHAR, 0, comm, tree);
310         if (MPI_SUCCESS != retval) {
311                 printf("[%s:%d] ** Error: tree_antibcast() returned retval=%d\n",__FILE__,__LINE__,retval);
312         }
313         else {
314             retval = tree_bcast(&dummy,  1, MPI_CHAR, 0, comm, tree);
315         }
316
317         free_tree( tree );
318         return(retval);
319
320         /* checked ok with valgrind --leak-check=full*/
321 }
322
323
324
325
326
327
328
329 /**
330  * -----------------------------------------------------------------------------------------------------
331  * example usage
332  * -----------------------------------------------------------------------------------------------------
333  **/
334 /*
335  * int main() {
336
337       int rank; 
338       int size=12;
339
340       proc_tree_t tree;
341       for (rank=0;rank<size;rank++) {
342             printf("--------------tree for rank %d ----------\n",rank);
343             tree = alloc_tree( 2 );
344             build_tree( rank, size, &tree );
345             print_tree( tree );
346             free_tree( tree );
347    
348       }
349       printf("-------------- bcast ----------\n");
350       for (rank=0;rank<size;rank++) {
351               bcast( rank, size );
352       }
353
354
355 }
356 */
357
358                 
359