Added flag for printing more debug info
[simgrid.git] / src / smpi / smpi_coll.c
index e5a1ebd..4221710 100644 (file)
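
The patch below replaces the old compile-time "#ifdef DEBUG_STEPH" printf blocks with SimGrid's XBT logging macros (DEBUG2, DEBUG3, INFO1, ERROR4, ..., where the digit is the number of format arguments), so the extra output can be switched on at run time instead of by recompiling. As a rough, hypothetical sketch of that mechanism (the category name my_coll_debug, the example() function and the --log syntax shown are assumptions for illustration, not part of this commit):

#include "xbt.h"
#include "xbt/log.h"

/* Hypothetical sketch, not part of this commit: declare a log category
 * once per module; messages then go through the DEBUGn/INFOn/ERRORn
 * macros and are filtered by the category's runtime threshold. */
XBT_LOG_NEW_DEFAULT_CATEGORY(my_coll_debug, "Example category for illustration");

static void example(int rank, int nreq)
{
        /* no trailing \n is needed: the logging layer appends one */
        DEBUG2("<%d> wait for %d requests", rank, nreq);
}

int main(int argc, char **argv)
{
        xbt_init(&argc, argv);      /* parses the --log=... options */
        example(0, 4);
        return 0;
}

Verbosity is then chosen when launching the program, e.g. with something like --log=my_coll_debug.thres:debug (assumed syntax), rather than rebuilding with -DDEBUG_STEPH.
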
@@ -12,6 +12,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <assert.h>
 
 #include "private.h"
 #include "smpi_coll_private.h"
@@ -151,9 +152,7 @@ int tree_bcast( void *buf, int count, MPI_Datatype datatype, int root,
 
         /* wait for data from my parent in the tree */
         if (!tree->isRoot) {
-#ifdef DEBUG_STEPH
-                printf("[%d] recv(%d  from %d, tag=%d)\n",rank,rank, tree->parent, system_tag+rank);
-#endif
+                DEBUG3("<%d> tree_bcast(): i am not root: recv from %d, tag=%d)",rank,tree->parent,system_tag+rank);
                 retval = smpi_create_request(buf, count, datatype, 
                                 tree->parent, rank, 
                                 system_tag + rank, 
@@ -163,31 +162,22 @@ int tree_bcast( void *buf, int count, MPI_Datatype datatype, int root,
                                         rank,retval,__FILE__,__LINE__);
                 }
                 smpi_mpi_irecv(request);
-#ifdef DEBUG_STEPH
-                printf("[%d] waiting on irecv from %d\n",rank , tree->parent);
-#endif
+                DEBUG2("<%d> tree_bcast(): waiting on irecv from %d",rank, tree->parent);
                 smpi_mpi_wait(request, MPI_STATUS_IGNORE);
                 xbt_mallocator_release(smpi_global->request_mallocator, request);
         }
 
         requests = xbt_malloc( tree->numChildren * sizeof(smpi_mpi_request_t));
-#ifdef DEBUG_STEPH
-        printf("[%d] creates %d requests\n",rank,tree->numChildren);
-#endif
+        DEBUG2("<%d> creates %d requests (1 per child)\n",rank,tree->numChildren);
 
         /* iniates sends to ranks lower in the tree */
         for (i=0; i < tree->numChildren; i++) {
                 if (tree->child[i] != -1) {
-#ifdef DEBUG_STEPH
-                        printf("[%d] send(%d->%d, tag=%d)\n",rank,rank, tree->child[i], system_tag+tree->child[i]);
-#endif
+                        DEBUG3("<%d> send to <%d>,tag=%d",rank,tree->child[i], system_tag+tree->child[i]);
                         retval = smpi_create_request(buf, count, datatype, 
                                         rank, tree->child[i], 
                                         system_tag + tree->child[i], 
                                         comm, &(requests[i]));
-#ifdef DEBUG_STEPH
-                        printf("[%d] after create req[%d]=%p req->(src=%d,dst=%d)\n",rank , i, requests[i],requests[i]->src,requests[i]->dst );
-#endif
                         if (MPI_SUCCESS != retval) {
                               printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
                                               rank,retval,__FILE__,__LINE__);
@@ -239,7 +229,7 @@ int tree_antibcast( void *buf, int count, MPI_Datatype datatype, int root,
                                 system_tag + rank, 
                                 comm, &request);
                 if (MPI_SUCCESS != retval) {
-                        printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
+                        ERROR4("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d",
                                         rank,retval,__FILE__,__LINE__);
                 }
                 smpi_mpi_isend(request);
@@ -281,6 +271,7 @@ int rank;
 int retval;
 
         rank = smpi_mpi_comm_rank( comm );
+        DEBUG2("<%d> entered nary_tree_bcast(), arity=%d",rank,arity);
         // arity=2: a binary tree, arity=4 seem to be a good setting (see P2P-MPI))
         proc_tree_t tree = alloc_tree( arity ); 
         build_tree( rank, comm->size, &tree );
@@ -345,6 +336,9 @@ int smpi_coll_tuned_alltoall_pairwise (void *sendbuf, int sendcount, MPI_Datatyp
          void * tmpsend, *tmprecv;
 
          rank = smpi_mpi_comm_rank(comm);
+        INFO1("<%d> algorithm alltoall_pairwise() called.",rank);
+
+
          /* Perform pairwise exchange - starting from 1 so the local copy is last */
          for (step = 1; step < size+1; step++) {
 
@@ -395,13 +389,13 @@ int copy_dt( void *sbuf, int scount, const MPI_Datatype sdtype,
 }
 
 /**
- *
+ * Alltoall basic_linear
  **/
 int smpi_coll_tuned_alltoall_basic_linear(void *sbuf, int scount, MPI_Datatype sdtype,
                    void* rbuf, int rcount, MPI_Datatype rdtype, MPI_Comm comm)
 {
          int i;
-        int system_tag = 999;
+        int system_alltoall_tag = 888;
          int rank;
          int size = comm->size;
          int err;
@@ -415,9 +409,12 @@ int smpi_coll_tuned_alltoall_basic_linear(void *sbuf, int scount, MPI_Datatype s
 
          /* Initialize. */
          rank = smpi_mpi_comm_rank(comm);
+        DEBUG1("<%d> algorithm alltoall_basic_linear() called.",rank);
 
         err = smpi_mpi_type_get_extent(sdtype, &lb, &sndinc);
         err = smpi_mpi_type_get_extent(rdtype, &lb, &rcvinc);
+        sndinc *= scount;
+        rcvinc *= rcount;
          /* simple optimization */
          psnd = ((char *) sbuf) + (rank * sndinc);
          prcv = ((char *) rbuf) + (rank * rcvinc);
@@ -442,10 +439,10 @@ int smpi_coll_tuned_alltoall_basic_linear(void *sbuf, int scount, MPI_Datatype s
          for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
                    err = smpi_create_request( prcv + (i * rcvinc), rcount, rdtype,
                                         i, rank,
-                                        system_tag + i,
+                                        system_alltoall_tag,
                                         comm, &(reqs[nreq]));
                 if (MPI_SUCCESS != err) {
-                        DEBUG2("[%d] failed to create request for rank %d\n",rank,i);
+                        DEBUG2("<%d> failed to create request for rank %d",rank,i);
                         for (i=0;i< nreq;i++) 
                                 xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
                         return err;
@@ -459,10 +456,10 @@ int smpi_coll_tuned_alltoall_basic_linear(void *sbuf, int scount, MPI_Datatype s
          for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size ) {
                    err = smpi_create_request (psnd + (i * sndinc), scount, sdtype, 
                                          rank, i,
-                                                    system_tag + i,
+                                                    system_alltoall_tag,
                                          comm, &(reqs[nreq]));
                 if (MPI_SUCCESS != err) {
-                        DEBUG2("[%d] failed to create request for rank %d\n",rank,i);
+                        DEBUG2("<%d> failed to create request for rank %d\n",rank,i);
                         for (i=0;i< nreq;i++) 
                                 xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
                         return err;
@@ -472,9 +469,11 @@ int smpi_coll_tuned_alltoall_basic_linear(void *sbuf, int scount, MPI_Datatype s
 
         /* Start your engines.  This will never return an error. */
         for ( i=0; i< nreq/2; i++ ) {
+            DEBUG3("<%d> issued irecv request reqs[%d]=%p",rank,i,reqs[i]);
             smpi_mpi_irecv( reqs[i] );
         }
         for ( i= nreq/2; i<nreq; i++ ) {
+            DEBUG3("<%d> issued isend request reqs[%d]=%p",rank,i,reqs[i]);
             smpi_mpi_isend( reqs[i] );
         }
 
@@ -486,12 +485,19 @@ int smpi_coll_tuned_alltoall_basic_linear(void *sbuf, int scount, MPI_Datatype s
          * So free them anyway -- even if there was an error, and return
          * the error after we free everything. */
 
-         err = smpi_mpi_waitall(nreq, reqs, MPI_STATUS_IGNORE);
+        DEBUG2("<%d> wait for %d requests",rank,nreq);
+        // waitall is buggy: use a loop instead for the moment
+        // err = smpi_mpi_waitall(nreq, reqs, MPI_STATUS_IGNORE);
+        for (i=0;i<nreq;i++) {
+                err = smpi_mpi_wait( reqs[i], MPI_STATUS_IGNORE);
+        }
 
          /* Free the reqs */
-        for (i=0;i< 2*(size-1);i++) {
+        assert( nreq == 2*(size-1) );
+        for (i=0;i< nreq;i++) {
             xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
         }
+        xbt_free( reqs );
          return err;
 }
 
@@ -634,6 +640,156 @@ err_hndl:
          return -1; /* FIXME: to be changed*/
 }
 
+static void print_buffer_int(void *buf, int len, char *msg, int rank);
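+/* Debug helper: dumps 'len' ints from 'buf' to stderr; note that it frees 'msg', so callers pass an xbt_strdup()'ed string. */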
+static void print_buffer_int(void *buf, int len, char *msg, int rank)
+{
+  int tmp, *v;
+  fprintf(stderr,"**<%d> %s (#%d): ", rank, msg,len);
+  for (tmp = 0; tmp < len; tmp++) {
+    v = buf;
+    fprintf(stderr,"[%d (%p)]", v[tmp],v+tmp);
+  }
+  fprintf(stderr,"\n");
+  free(msg);
+}
+
+
+
+/**
+ * alltoallv basic 
+ **/
+
+int smpi_coll_basic_alltoallv(void *sbuf, int *scounts, int *sdisps, MPI_Datatype sdtype, 
+                              void *rbuf, int *rcounts, int *rdisps, MPI_Datatype rdtype,
+                              MPI_Comm comm) {
+
+        int i;
+        int system_alltoallv_tag = 889;
+        int rank;
+        int size = comm->size;
+        int err;
+        char *psnd;
+        char *prcv;
+        //int nreq = 0;
+        int rreq = 0;
+        int sreq = 0;
+        MPI_Aint lb;
+        MPI_Aint sndextent;
+        MPI_Aint rcvextent;
+        MPI_Request *reqs;
+
+        /* Initialize. */
+        rank = smpi_mpi_comm_rank(comm);
+        DEBUG1("<%d> algorithm basic_alltoallv() called.",rank);
+
+        err = smpi_mpi_type_get_extent(sdtype, &lb, &sndextent);
+        err = smpi_mpi_type_get_extent(rdtype, &lb, &rcvextent);
+
+        psnd = (char *)sbuf;
+        //print_buffer_int(psnd,size*size,xbt_strdup("sbuff"),rank);
+
+        /* copy the local sbuf to rbuf when it's me */
+        psnd = ((char *) sbuf) + (sdisps[rank] * sndextent);
+        prcv = ((char *) rbuf) + (rdisps[rank] * rcvextent);
+
+        if (0 != scounts[rank]) {
+                err = copy_dt( psnd, scounts[rank], sdtype, prcv, rcounts[rank], rdtype );
+                if (MPI_SUCCESS != err) {
+                        return err;
+                }
+        }
+
+        /* If only one process, we're done. */
+        if (1 == size) {
+                return MPI_SUCCESS;
+        }
+
+        /* Initiate all send/recv to/from others. */
+        reqs =  xbt_malloc(2*(size-1) * sizeof(smpi_mpi_request_t));
+
+
+        /* Create all receives that will be posted first */
+        for (i = 0; i < size; ++i) {
+                if (i == rank || 0 == rcounts[i]) {
+                        DEBUG3("<%d> skip req creation i=%d,rcounts[i]=%d",rank,i, rcounts[i]);
+                        continue;
+                }
+                prcv = ((char *) rbuf) + (rdisps[i] * rcvextent);
+
+                err = smpi_create_request( prcv, rcounts[i], rdtype,
+                                i, rank,
+                                system_alltoallv_tag,
+                                comm, &(reqs[rreq]));
+                if (MPI_SUCCESS != err) {
+                        DEBUG2("<%d> failed to create request for rank %d",rank,i);
+                        for (i=0;i< rreq;i++) 
+                                xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
+                        return err;
+                }
+                rreq++;
+        }
+        DEBUG2("<%d> %d irecv reqs created",rank,rreq);
+        /* Now create all sends  */
+        for (i = 0; i < size; ++i) {
+                if (i == rank || 0 == scounts[i]) {
+                        DEBUG3("<%d> skip req creation i=%d,scounts[i]=%d",rank,i, scounts[i]);
+                        continue;
+                }
+                psnd = ((char *) sbuf) + (sdisps[i] * sndextent);
+
+                //fprintf(stderr,"<%d> send %d elems to <%d>\n",rank,scounts[i],i);
+                //print_buffer_int(psnd,scounts[i],xbt_strdup("sbuff part"),rank);
+                err = smpi_create_request (psnd, scounts[i], sdtype,
+                                rank, i,
+                                system_alltoallv_tag, 
+                                comm, &(reqs[rreq+sreq]));
+                if (MPI_SUCCESS != err) {
+                        DEBUG2("<%d> failed to create request for rank %d\n",rank,i);
+                        for (i=0;i< rreq+sreq;i++) 
+                                xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
+                        return err;
+                }
+                sreq++;
+        }
+        DEBUG2("<%d> %d isend reqs created",rank,sreq);
+
+        /* Start your engines.  This will never return an error. */
+        for ( i=0; i< rreq; i++ ) {
+                DEBUG3("<%d> issued irecv request reqs[%d]=%p",rank,i,reqs[i]);
+                smpi_mpi_irecv( reqs[i] );
+        }
+                DEBUG3("<%d> for (i=%d;i<%d)",rank,rreq,sreq);
+        for ( i=rreq; i< rreq+sreq; i++ ) {
+                DEBUG3("<%d> issued isend request reqs[%d]=%p",rank,i,reqs[i]);
+                smpi_mpi_isend( reqs[i] );
+        }
+
+
+        /* Wait for them all.  If there's an error, note that we don't
+         * care what the error was -- just that there *was* an error.  The
+         * PML will finish all requests, even if one or more of them fail.
+         * i.e., by the end of this call, all the requests are free-able.
+         * So free them anyway -- even if there was an error, and return
+         * the error after we free everything. */
+
+        DEBUG2("<%d> wait for %d requests",rank,rreq+sreq);
+        // waitall is buggy: use a loop instead for the moment
+        // err = smpi_mpi_waitall(nreq, reqs, MPI_STATUS_IGNORE);
+        for (i=0;i< rreq+sreq;i++) {
+                err = smpi_mpi_wait( reqs[i], MPI_STATUS_IGNORE);
+        }
+
+        /* Free the reqs */
+        /* rreq+sreq may be less than 2*(size-1) since some request creations are skipped */
+        for (i=0;i< rreq+sreq;i++) {
+                xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
+        }
+        xbt_free( reqs );
+        return err;
+}
+
+
+
 
 /**
  * -----------------------------------------------------------------------------------------------------