Fix dead assignments.
[simgrid.git] / src / smpi / smpi_coll.c
index 7296453..3bca49d 100644
-/* $Id$tag */
-
 /* smpi_coll.c -- various optimized routines for collectives                 */
 
-/* Copyright (c) 2009 Stephane Genaud.                                        */
-/* All rights reserved.                                                       */
+/* Copyright (c) 2009-2014. The SimGrid Team.
+ * All rights reserved.                                                     */
 
 /* This program is free software; you can redistribute it and/or modify it
- *  * under the terms of the license (GNU LGPL) which comes with this package. */
-
+ * under the terms of the license (GNU LGPL) which comes with this package. */
 
 #include <stdio.h>
-#include <stdlib.h>
 #include <string.h>
+#include <assert.h>
 
 #include "private.h"
-#include "smpi_coll_private.h"
+#include "colls/colls.h"
+#include "simgrid/sg_config.h"
+
+s_mpi_coll_description_t mpi_coll_gather_description[] = {
+  {"default",
+   "gather default collective",
+   smpi_mpi_gather},
+COLL_GATHERS(COLL_DESCRIPTION, COLL_COMMA),
+  {NULL, NULL, NULL}      /* this array must be NULL terminated */
+};
 
-XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_coll, smpi,
-                                "Logging specific to SMPI (coll)");
 
-/* proc_tree taken and translated from P2P-MPI */
+s_mpi_coll_description_t mpi_coll_allgather_description[] = {
+  {"default",
+   "allgather default collective",
+   smpi_mpi_allgather},
+COLL_ALLGATHERS(COLL_DESCRIPTION, COLL_COMMA),
+  {NULL, NULL, NULL}      /* this array must be NULL terminated */
+};
+
+s_mpi_coll_description_t mpi_coll_allgatherv_description[] = {
+  {"default",
+   "allgatherv default collective",
+   smpi_mpi_allgatherv},
+COLL_ALLGATHERVS(COLL_DESCRIPTION, COLL_COMMA),
+  {NULL, NULL, NULL}      /* this array must be NULL terminated */
+};
+
+s_mpi_coll_description_t mpi_coll_allreduce_description[] = {
+  {"default",
+   "allreduce default collective",
+   smpi_mpi_allreduce},
+COLL_ALLREDUCES(COLL_DESCRIPTION, COLL_COMMA),
+  {NULL, NULL, NULL}      /* this array must be NULL terminated */
+};
+
+s_mpi_coll_description_t mpi_coll_reduce_scatter_description[] = {
+  {"default",
+   "reduce_scatter default collective",
+   smpi_mpi_reduce_scatter},
+COLL_REDUCE_SCATTERS(COLL_DESCRIPTION, COLL_COMMA),
+  {NULL, NULL, NULL}      /* this array must be NULL terminated */
+};
 
-struct proc_tree {
-        int PROCTREE_A;
-        int numChildren;
-        int * child;
-        int parent;
-        int me;
-        int root;
-        int isRoot;
+s_mpi_coll_description_t mpi_coll_scatter_description[] = {
+  {"default",
+   "scatter default collective",
+   smpi_mpi_scatter},
+COLL_SCATTERS(COLL_DESCRIPTION, COLL_COMMA),
+  {NULL, NULL, NULL}      /* this array must be NULL terminated */
 };
-typedef struct proc_tree * proc_tree_t;
 
+s_mpi_coll_description_t mpi_coll_barrier_description[] = {
+  {"default",
+   "barrier default collective",
+   smpi_mpi_barrier},
+COLL_BARRIERS(COLL_DESCRIPTION, COLL_COMMA),
+  {NULL, NULL, NULL}      /* this array must be NULL terminated */
+};
+s_mpi_coll_description_t mpi_coll_alltoall_description[] = {
+  {"default",
+   "Ompi alltoall default collective",
+   smpi_coll_tuned_alltoall_ompi2},
+COLL_ALLTOALLS(COLL_DESCRIPTION, COLL_COMMA),
+  {"bruck",
+   "Alltoall Bruck (SG) collective",
+   smpi_coll_tuned_alltoall_bruck},
+  {"basic_linear",
+   "Alltoall basic linear (SG) collective",
+   smpi_coll_tuned_alltoall_basic_linear},
+  {NULL, NULL, NULL}      /* this array must be NULL terminated */
+};
 
+s_mpi_coll_description_t mpi_coll_alltoallv_description[] = {
+  {"default",
+   "Ompi alltoallv default collective",
+   smpi_coll_basic_alltoallv},
+COLL_ALLTOALLVS(COLL_DESCRIPTION, COLL_COMMA),
+  {NULL, NULL, NULL}      /* this array must be NULL terminated */
+};
 
-/* prototypes */
-void build_tree( int index, int extent, proc_tree_t *tree);
-proc_tree_t alloc_tree( int arity );
-void free_tree( proc_tree_t tree);
-void print_tree(proc_tree_t tree);
+s_mpi_coll_description_t mpi_coll_bcast_description[] = {
+  {"default",
+   "bcast default collective",
+   smpi_mpi_bcast},
+COLL_BCASTS(COLL_DESCRIPTION, COLL_COMMA),
+  {NULL, NULL, NULL}      /* this array must be NULL terminated */
+};
 
+s_mpi_coll_description_t mpi_coll_reduce_description[] = {
+  {"default",
+   "reduce default collective",
+   smpi_mpi_reduce},
+COLL_REDUCES(COLL_DESCRIPTION, COLL_COMMA),
+  {NULL, NULL, NULL}      /* this array must be NULL terminated */
+};
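
Each table above pairs a hand-written "default" entry with entries generated by an X-macro call such as COLL_GATHERS(COLL_DESCRIPTION, COLL_COMMA). The real macros live in colls/colls.h; the following is only a hedged, simplified sketch of the pattern (the macro arguments and the algorithm names "ompi" and "mvapich2" are illustrative placeholders, not the actual definitions):

/* Simplified illustration of the X-macro pattern, not the colls.h code. */
#define COLL_COMMA ,
#define COLL_DESCRIPTION(cat, name, sep) \
  { #name, #cat " " #name " collective", smpi_coll_tuned_##cat##_##name } sep
#define COLL_GATHERS(action, sep) \
  action(gather, ompi, sep)       \
  action(gather, mvapich2, sep)   /* ... one line per registered algorithm */
/* COLL_GATHERS(COLL_DESCRIPTION, COLL_COMMA) then expands to:
 *   {"ompi",     "gather ompi collective",     smpi_coll_tuned_gather_ompi},
 *   {"mvapich2", "gather mvapich2 collective", smpi_coll_tuned_gather_mvapich2},
 */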
 
 
 
-/**
- * alloc and init
- **/
-proc_tree_t alloc_tree( int arity ) {
-        proc_tree_t tree = malloc(1*sizeof(struct proc_tree));
-        int i;
-
-        tree->PROCTREE_A = arity;
-        tree->isRoot = 0; 
-        tree->numChildren = 0;
-        tree->child = malloc(arity*sizeof(int));
-        for (i=0; i < arity; i++) {
-                tree->child[i] = -1;
-        }
-        tree->root = -1;
-        tree->parent = -1;
-        return( tree );
+/** Displays the long description of all the collective algorithms registered in the given table */
+void coll_help(const char *category, s_mpi_coll_description_t * table)
+{
+  int i;
+  printf("Long description of the %s models accepted by this simulator:\n",
+         category);
+  for (i = 0; table[i].name; i++)
+    printf("  %s: %s\n", table[i].name, table[i].description);
 }
 
-/**
- * free
- **/
-void free_tree( proc_tree_t tree) {
-        free (tree->child );
-        free(tree);
+int find_coll_description(s_mpi_coll_description_t * table,
+                           char *name)
+{
+  int i;
+  char *name_list = NULL;
+  int selector_on=0;
+  if(name==NULL){//no argument provided, use active selector's algorithm
+    name=(char*)sg_cfg_get_string("smpi/coll_selector");
+    selector_on=1;
+  }
+  for (i = 0; table[i].name; i++)
+    if (!strcmp(name, table[i].name)) {
+      return i;
+    }
+
+  if(selector_on){
+    // the collective is not handled by the active selector; fall back to the default implementation
+    name=(char*)"default";
+    for (i = 0; table[i].name; i++)
+      if (!strcmp(name, table[i].name)) {
+        return i;
+    }
+  }
+  if (!table[0].name)
+    xbt_die("No collective is valid! This is a bug.");
+  name_list = xbt_strdup(table[0].name);
+  for (i = 1; table[i].name; i++) {
+    name_list =
+        xbt_realloc(name_list,
+                    strlen(name_list) + strlen(table[i].name) + 3);
+    strcat(name_list, ", ");
+    strcat(name_list, table[i].name);
+  }
+  xbt_die("Collective '%s' is invalid! Valid collectives are: %s.", name, name_list);
+  return -1;
 }
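
The index returned by find_coll_description() is used to install one of the mpi_coll_*_fun pointers declared just below. A minimal sketch of that binding, assuming the description struct exposes its callback through a `coll` field and that the algorithm name comes from a configuration key such as "smpi/reduce" (both are assumptions here; the actual wiring is done elsewhere in SMPI):

/* Hedged sketch only: look up and install the reduce implementation. */
int idx = find_coll_description(mpi_coll_reduce_description,
                                (char *) sg_cfg_get_string("smpi/reduce"));
mpi_coll_reduce_fun = (int (*)(void *, void *, int, MPI_Datatype,
                               MPI_Op, int, MPI_Comm))
                      mpi_coll_reduce_description[idx].coll;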
 
+XBT_LOG_NEW_DEFAULT_SUBCATEGORY(smpi_coll, smpi,
+                                "Logging specific to SMPI (coll)");
 
+int (*mpi_coll_gather_fun)(void *, int, MPI_Datatype, void*, int, MPI_Datatype, int root, MPI_Comm);
+int (*mpi_coll_allgather_fun)(void *, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm);
+int (*mpi_coll_allgatherv_fun)(void *, int, MPI_Datatype, void*, int*, int*, MPI_Datatype, MPI_Comm);
+int (*mpi_coll_allreduce_fun)(void *sbuf, void *rbuf, int rcount, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm);
+int (*mpi_coll_alltoall_fun)(void *, int, MPI_Datatype, void*, int, MPI_Datatype, MPI_Comm);
+int (*mpi_coll_alltoallv_fun)(void *, int*, int*, MPI_Datatype, void*, int*, int*, MPI_Datatype, MPI_Comm);
+int (*mpi_coll_bcast_fun)(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm com);
+int (*mpi_coll_reduce_fun)(void *buf, void *rbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm);
+int (*mpi_coll_reduce_scatter_fun)(void *sbuf, void *rbuf, int *rcounts,MPI_Datatype dtype,MPI_Op  op,MPI_Comm  comm);
+int (*mpi_coll_scatter_fun)(void *sendbuf, int sendcount, MPI_Datatype sendtype,void *recvbuf, int recvcount, MPI_Datatype recvtype,int root, MPI_Comm comm);
+int (*mpi_coll_barrier_fun)(MPI_Comm comm);
+struct s_proc_tree {
+  int PROCTREE_A;
+  int numChildren;
+  int *child;
+  int parent;
+  int me;
+  int root;
+  int isRoot;
+};
+typedef struct s_proc_tree *proc_tree_t;
 
 /**
- * Build the tree depending on a process rank (index) and the group size (extent)
- * @param index the rank of the calling process
- * @param extent the total number of processes
+ * alloc and init
  **/
-void build_tree( int index, int extent, proc_tree_t *tree) {
-        int places = (*tree)->PROCTREE_A * index;
-        int i;
-        int ch;
-        int pr;
-
-        (*tree)->me = index;
-        (*tree)->root = 0 ;
-
-        for (i = 1; i <= (*tree)->PROCTREE_A; i++) {
-                ++places;
-                ch = (*tree)->PROCTREE_A * index + i + (*tree)->root;
-                //printf("places %d\n",places);
-                ch %= extent;
-                if (places < extent) {
-                         //printf("ch <%d> = <%d>\n",i,ch);
-                         //printf("adding to the tree at index <%d>\n\n",i-1);
-                        (*tree)->child[i - 1] = ch;
-                        (*tree)->numChildren++;
-                }
-                else {
-                       //fprintf(stderr,"not adding to the tree\n");
-                }
-        }
-        //fprintf(stderr,"procTree.numChildren <%d>\n",(*tree)->numChildren);
-
-        if (index == (*tree)->root) {
-                (*tree)->isRoot = 1;
-        }
-        else {
-                (*tree)->isRoot = 0;
-                pr = (index - 1) / (*tree)->PROCTREE_A;
-                (*tree)->parent = pr;
-        }
+static proc_tree_t alloc_tree(int arity)
+{
+  proc_tree_t tree;
+  int i;
+
+  tree = xbt_new(struct s_proc_tree, 1);
+  tree->PROCTREE_A = arity;
+  tree->isRoot = 0;
+  tree->numChildren = 0;
+  tree->child = xbt_new(int, arity);
+  for (i = 0; i < arity; i++) {
+    tree->child[i] = -1;
+  }
+  tree->root = -1;
+  tree->parent = -1;
+  return tree;
 }
 
 /**
- * prints the tree as a graphical representation
+ * free
  **/
-void print_tree(proc_tree_t tree) {
-      int i;
-      char *spacer;
-      if (-1 != tree->parent ) {
-            printf("[%d]\n   +---",tree->parent);
-            spacer= strdup("     ");
-      }
-      else {
-            spacer=strdup("");
-      }
-      printf("<%d>\n",tree->me);
-      for (i=0;i < tree->numChildren; i++) {
-              printf("%s   +--- %d\n", spacer,tree->child[i]);
-      }
-      free(spacer);
+static void free_tree(proc_tree_t tree)
+{
+  xbt_free(tree->child);
+  xbt_free(tree);
 }
-      
+
 /**
- * bcast
+ * Build the tree depending on a process rank (index) and the group size (extent)
+ * @param root the rank of the tree root
+ * @param rank the rank of the calling process
+ * @param size the total number of processes
  **/
-int tree_bcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm, proc_tree_t tree);
-int tree_bcast( void *buf, int count, MPI_Datatype datatype, int root,
-                MPI_Comm comm, proc_tree_t tree) 
+static void build_tree(int root, int rank, int size, proc_tree_t * tree)
 {
-        int system_tag = 999;  // used negative int but smpi_create_request() declares this illegal (to be checked)
-        int rank;
-        int retval = MPI_SUCCESS;
-        int i;
-        smpi_mpi_request_t request;
-        smpi_mpi_request_t * requests;
-
-        rank = smpi_mpi_comm_rank(comm);
-
-        /* wait for data from my parent in the tree */
-        if (!tree->isRoot) {
-                DEBUG4("[%d] recv(%d  from %d, tag=%d)\n",rank,rank, tree->parent, system_tag+rank);
-                retval = smpi_create_request(buf, count, datatype, 
-                                tree->parent, rank, 
-                                system_tag + rank, 
-                                comm, &request);
-                if (MPI_SUCCESS != retval) {
-                        printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
-                                        rank,retval,__FILE__,__LINE__);
-                }
-                smpi_mpi_irecv(request);
-                DEBUG2("[%d] waiting on irecv from %d\n",rank , tree->parent);
-                smpi_mpi_wait(request, MPI_STATUS_IGNORE);
-                xbt_mallocator_release(smpi_global->request_mallocator, request);
-        }
-
-        requests = xbt_malloc( tree->numChildren * sizeof(smpi_mpi_request_t));
-        DEBUG2("[%d] creates %d requests\n",rank,tree->numChildren);
-
-        /* iniates sends to ranks lower in the tree */
-        for (i=0; i < tree->numChildren; i++) {
-                if (tree->child[i] != -1) {
-                        DEBUG4("[%d] send(%d->%d, tag=%d)\n",rank,rank, tree->child[i], system_tag+tree->child[i]);
-                        retval = smpi_create_request(buf, count, datatype, 
-                                        rank, tree->child[i], 
-                                        system_tag + tree->child[i], 
-                                        comm, &(requests[i]));
-                        DEBUG5("[%d] after create req[%d]=%p req->(src=%d,dst=%d)\n",rank,i,requests[i],requests[i]->src,requests[i]->dst );
-                        if (MPI_SUCCESS != retval) {
-                              printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
-                                              rank,retval,__FILE__,__LINE__);
-                        }
-                        smpi_mpi_isend(requests[i]);
-                        /* FIXME : we should not wait immediately here. See next FIXME. */
-                        smpi_mpi_wait( requests[i], MPI_STATUS_IGNORE);
-                        xbt_mallocator_release(smpi_global->request_mallocator, requests[i]);
-                }
-        }
-        /* FIXME : normally, we sould wait only once all isend have been issued:
-         * this is the following commented code. It deadlocks, probably because
-         * of a bug in the sender process */
-        
-        /* wait for completion of sends */
-        /* printf("[%d] wait for %d send completions\n",rank,tree->numChildren);
-          smpi_mpi_waitall( tree->numChildren, requests, MPI_STATUS_IGNORE);
-         printf("[%d] reqs completed\n)",rank);
-           */
-        
-        xbt_free(requests);
-        return(retval);
-        /* checked ok with valgrind --leak-check=full*/
+  int index = (rank - root + size) % size;
+  int firstChildIdx = index * (*tree)->PROCTREE_A + 1;
+  int i;
+
+  (*tree)->me = rank;
+  (*tree)->root = root;
+
+  for (i = 0; i < (*tree)->PROCTREE_A && firstChildIdx + i < size; i++) {
+    (*tree)->child[i] = (firstChildIdx + i + root) % size;
+    (*tree)->numChildren++;
+  }
+  if (rank == root) {
+    (*tree)->isRoot = 1;
+  } else {
+    (*tree)->isRoot = 0;
+    (*tree)->parent = (((index - 1) / (*tree)->PROCTREE_A) + root) % size;
+  }
 }
 
-
 /**
- * anti-bcast
+ * bcast
  **/
-int tree_antibcast(void *buf, int count, MPI_Datatype datatype, int root, MPI_Comm comm, proc_tree_t tree);
-int tree_antibcast( void *buf, int count, MPI_Datatype datatype, int root,
-                MPI_Comm comm, proc_tree_t tree) 
+static void tree_bcast(void *buf, int count, MPI_Datatype datatype,
+                       MPI_Comm comm, proc_tree_t tree)
 {
-        int system_tag = 999;  // used negative int but smpi_create_request() declares this illegal (to be checked)
-        int rank;
-        int retval = MPI_SUCCESS;
-        int i;
-        smpi_mpi_request_t request;
-        smpi_mpi_request_t * requests;
-
-        rank = smpi_mpi_comm_rank(comm);
-
-         //------------------anti-bcast-------------------
-        
-        // everyone sends to its parent, except root.
-        if (!tree->isRoot) {
-                retval = smpi_create_request(buf, count, datatype,
-                                rank,tree->parent,  
-                                system_tag + rank, 
-                                comm, &request);
-                if (MPI_SUCCESS != retval) {
-                        printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
-                                        rank,retval,__FILE__,__LINE__);
-                }
-                smpi_mpi_isend(request);
-                smpi_mpi_wait(request, MPI_STATUS_IGNORE);
-                xbt_mallocator_release(smpi_global->request_mallocator, request);
-        }
-
-        //every one receives as many messages as it has children
-        requests = xbt_malloc( tree->numChildren * sizeof(smpi_mpi_request_t));
-        for (i=0; i < tree->numChildren; i++) {
-                if (tree->child[i] != -1) {
-                        retval = smpi_create_request(buf, count, datatype, 
-                                        tree->child[i], rank, 
-                                        system_tag + tree->child[i], 
-                                        comm, &(requests[i]));
-                        if (MPI_SUCCESS != retval) {
-                                printf("** internal error: smpi_create_request() rank=%d returned retval=%d, %s:%d\n",
-                                                rank,retval,__FILE__,__LINE__);
-                        }
-                        smpi_mpi_irecv(requests[i]);
-                        /* FIXME : we should not wait immediately here. See next FIXME. */
-                        smpi_mpi_wait( requests[i], MPI_STATUS_IGNORE);
-                        xbt_mallocator_release(smpi_global->request_mallocator, requests[i]);
-                }
-        }
-        xbt_free(requests);
-        return(retval);
-
-        /* checked ok with valgrind --leak-check=full*/
-} 
+  int system_tag = COLL_TAG_BCAST;
+  int rank, i;
+  MPI_Request *requests;
+
+  rank = smpi_comm_rank(comm);
+  /* wait for data from my parent in the tree */
+  if (!tree->isRoot) {
+    XBT_DEBUG("<%d> tree_bcast(): I am not root: recv from %d, tag=%d",
+           rank, tree->parent, system_tag + rank);
+    smpi_mpi_recv(buf, count, datatype, tree->parent, system_tag + rank,
+                  comm, MPI_STATUS_IGNORE);
+  }
+  requests = xbt_new(MPI_Request, tree->numChildren);
+  XBT_DEBUG("<%d> creates %d requests (1 per child)", rank,
+         tree->numChildren);
+  /* initiates sends to ranks lower in the tree */
+  for (i = 0; i < tree->numChildren; i++) {
+    if (tree->child[i] == -1) {
+      requests[i] = MPI_REQUEST_NULL;
+    } else {
+      XBT_DEBUG("<%d> send to <%d>, tag=%d", rank, tree->child[i],
+             system_tag + tree->child[i]);
+      requests[i] =
+          smpi_isend_init(buf, count, datatype, tree->child[i],
+                          system_tag + tree->child[i], comm);
+    }
+  }
+  smpi_mpi_startall(tree->numChildren, requests);
+  smpi_mpi_waitall(tree->numChildren, requests, MPI_STATUS_IGNORE);
+  for(i = 0; i < tree->numChildren; i++) {
+    if(requests[i]!=MPI_REQUEST_NULL) smpi_mpi_request_free(&requests[i]);
+  }
+  xbt_free(requests);
+}
 
 /**
- * bcast with a binary, ternary, or whatever tree .. 
+ * anti-bcast
  **/
-int nary_tree_bcast(void *buf, int count, MPI_Datatype datatype, int root,
-                MPI_Comm comm, int arity)
+static void tree_antibcast(void *buf, int count, MPI_Datatype datatype,
+                           MPI_Comm comm, proc_tree_t tree)
 {
-int rank;
-int retval;
-
-        rank = smpi_mpi_comm_rank( comm );
-        // arity=2: a binary tree, arity=4 seem to be a good setting (see P2P-MPI))
-        proc_tree_t tree = alloc_tree( arity ); 
-        build_tree( rank, comm->size, &tree );
-
-        retval = tree_bcast( buf, count, datatype, root, comm, tree );
-
-        free_tree( tree );
-        return( retval );
+  int system_tag = COLL_TAG_BCAST;
+  int rank, i;
+  MPI_Request *requests;
+
+  rank = smpi_comm_rank(comm);
+  // everyone sends to its parent, except root.
+  if (!tree->isRoot) {
+    XBT_DEBUG("<%d> tree_antibcast(): I am not root: send to %d, tag=%d",
+           rank, tree->parent, system_tag + rank);
+    smpi_mpi_send(buf, count, datatype, tree->parent, system_tag + rank,
+                  comm);
+  }
+  //every one receives as many messages as it has children
+  requests = xbt_new(MPI_Request, tree->numChildren);
+  XBT_DEBUG("<%d> creates %d requests (1 per child)", rank,
+         tree->numChildren);
+  for (i = 0; i < tree->numChildren; i++) {
+    if (tree->child[i] == -1) {
+      requests[i] = MPI_REQUEST_NULL;
+    } else {
+      XBT_DEBUG("<%d> recv from <%d>, tag=%d", rank, tree->child[i],
+             system_tag + tree->child[i]);
+      requests[i] =
+          smpi_irecv_init(buf, count, datatype, tree->child[i],
+                          system_tag + tree->child[i], comm);
+    }
+  }
+  smpi_mpi_startall(tree->numChildren, requests);
+  smpi_mpi_waitall(tree->numChildren, requests, MPI_STATUS_IGNORE);
+  for(i = 0; i < tree->numChildren; i++) {
+    if(requests[i]!=MPI_REQUEST_NULL) smpi_mpi_request_free(&requests[i]);
+  }
+  xbt_free(requests);
 }
 
-
 /**
- * Barrier
+ * bcast with a binary, ternary, or whatever tree ..
  **/
-int nary_tree_barrier( MPI_Comm comm , int arity)
+void nary_tree_bcast(void *buf, int count, MPI_Datatype datatype, int root,
+                     MPI_Comm comm, int arity)
 {
-        int rank;
-        int retval = MPI_SUCCESS;
-        char dummy='$';
-
-        rank = smpi_mpi_comm_rank(comm);
-        // arity=2: a binary tree, arity=4 seem to be a good setting (see P2P-MPI))
-        proc_tree_t tree = alloc_tree( arity ); 
-
-        build_tree( rank, comm->size, &tree );
-
-        retval = tree_antibcast( &dummy, 1, MPI_CHAR, 0, comm, tree);
-        if (MPI_SUCCESS != retval) {
-                printf("[%s:%d] ** Error: tree_antibcast() returned retval=%d\n",__FILE__,__LINE__,retval);
-        }
-        else {
-            retval = tree_bcast(&dummy,  1, MPI_CHAR, 0, comm, tree);
-        }
-
-        free_tree( tree );
-        return(retval);
-
-        /* checked ok with valgrind --leak-check=full*/
+  proc_tree_t tree = alloc_tree(arity);
+  int rank, size;
+
+  rank = smpi_comm_rank(comm);
+  size = smpi_comm_size(comm);
+  build_tree(root, rank, size, &tree);
+  tree_bcast(buf, count, datatype, comm, tree);
+  free_tree(tree);
 }
 
-
 /**
- * Alltoall pairwise
- *
- * this algorithm performs size steps (1<=s<=size) and
- * at each step s, a process p sends iand receive to.from a unique distinct remote process
- * size=5 : s=1:  4->0->1, 0->1->2, 1->2->3, ... 
- *          s=2:  3->0->2, 4->1->3, 0->2->4, 1->3->0 , 2->4->1
- *          .... 
- * Openmpi calls this routine when the message size sent to each rank is greater than 3000 bytes
+ * barrier with a binary, ternary, or whatever tree ..
  **/
-
-int smpi_coll_tuned_alltoall_pairwise (void *sendbuf, int sendcount, MPI_Datatype datatype,
-                   void* recvbuf, int recvcount, MPI_Datatype recvdatatype, MPI_Comm comm)
+void nary_tree_barrier(MPI_Comm comm, int arity)
 {
-        int retval = MPI_SUCCESS;
-         int rank;
-         int size = comm->size;
-         int step;
-         int sendto, recvfrom;
-         int tag_alltoall=999;
-         void * tmpsend, *tmprecv;
-
-         rank = smpi_mpi_comm_rank(comm);
-        INFO1("[%d] algorithm alltoall_pairwise() called.\n",rank);
-
-
-         /* Perform pairwise exchange - starting from 1 so the local copy is last */
-         for (step = 1; step < size+1; step++) {
-
-                   /* who do we talk to in this step? */
-                   sendto  = (rank+step)%size;
-                   recvfrom = (rank+size-step)%size;
-
-                   /* where from are we sending and where from are we receiving actual data ? */
-                   tmpsend = (char*)sendbuf+sendto*datatype->size*sendcount;
-                   tmprecv = (char*)recvbuf+recvfrom*recvdatatype->size*recvcount;
-
-                   /* send and receive */
-                   /* in OpenMPI, they use :
-                        err = ompi_coll_tuned_sendrecv( tmpsend, scount, sdtype, sendto, MCA_COLL_BASE_TAG_ALLTOALL,
-                        tmprecv, rcount, rdtype, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL,
-                        comm, MPI_STATUS_IGNORE, rank);
-                    */
-                   retval = MPI_Sendrecv(tmpsend, sendcount, datatype, sendto, tag_alltoall,
-                                               tmprecv, recvcount, recvdatatype, recvfrom, tag_alltoall,
-                                                 comm, MPI_STATUS_IGNORE);
-         }
-         return(retval);
+  proc_tree_t tree = alloc_tree(arity);
+  int rank, size;
+  char dummy = '$';
+
+  rank = smpi_comm_rank(comm);
+  size = smpi_comm_size(comm);
+  build_tree(0, rank, size, &tree);
+  tree_antibcast(&dummy, 1, MPI_CHAR, comm, tree);
+  tree_bcast(&dummy, 1, MPI_CHAR, comm, tree);
+  free_tree(tree);
 }
 
-/**
- * helper: copy a datatype into another (in the simple case dt1=dt2) 
-**/
-int copy_dt( void *sbuf, int scount, const MPI_Datatype sdtype, void *rbuf, int rcount, const MPI_Datatype rdtype);
-int copy_dt( void *sbuf, int scount, const MPI_Datatype sdtype,
-             void *rbuf, int rcount, const MPI_Datatype rdtype)
+int smpi_coll_tuned_alltoall_ompi2(void *sendbuf, int sendcount,
+                                   MPI_Datatype sendtype, void *recvbuf,
+                                   int recvcount, MPI_Datatype recvtype,
+                                   MPI_Comm comm)
 {
-    /* First check if we really have something to do */
-    if (0 == rcount) {
-        return ((0 == scount) ? MPI_SUCCESS : MPI_ERR_TRUNCATE);
-    }
-   /* If same datatypes used, just copy. */
-   if (sdtype == rdtype) {
-      int count = ( scount < rcount ? scount : rcount );
-      memcpy( rbuf, sbuf, sdtype->size*count);
-      return ((scount > rcount) ? MPI_ERR_TRUNCATE : MPI_SUCCESS);
-   }
-   /* FIXME:  cases 
-    * - If receive packed. 
-    * - If send packed
-    * to be treated once we have the MPI_Pack things ...
-    **/
-     return( MPI_SUCCESS );
+  int size, sendsize;  
+  size = smpi_comm_size(comm); 
+  sendsize = smpi_datatype_size(sendtype) * sendcount; 
+  if (sendsize < 200 && size > 12) {
+    return
+        smpi_coll_tuned_alltoall_bruck(sendbuf, sendcount, sendtype,
+                                       recvbuf, recvcount, recvtype,
+                                       comm);
+  } else if (sendsize < 3000) {
+    return
+        smpi_coll_tuned_alltoall_basic_linear(sendbuf, sendcount,
+                                              sendtype, recvbuf,
+                                              recvcount, recvtype, comm);
+  } else {
+    return
+        smpi_coll_tuned_alltoall_ring(sendbuf, sendcount, sendtype,
+                                      recvbuf, recvcount, recvtype,
+                                      comm);
+  }
 }
 
 /**
+ * Alltoall Bruck
  *
+ * Open MPI calls this routine when the message size sent to each rank is < 2000 bytes and size < 12.
+ * FIXME: check smpi_pmpi again; the selector above actually picks this routine
+ * for size > 12, not less.
  **/
-int smpi_coll_tuned_alltoall_basic_linear(void *sbuf, int scount, MPI_Datatype sdtype,
-                   void* rbuf, int rcount, MPI_Datatype rdtype, MPI_Comm comm)
+int smpi_coll_tuned_alltoall_bruck(void *sendbuf, int sendcount,
+                                   MPI_Datatype sendtype, void *recvbuf,
+                                   int recvcount, MPI_Datatype recvtype,
+                                   MPI_Comm comm)
 {
-         int i;
-        int system_alltoall_tag = 888;
-         int rank;
-         int size = comm->size;
-         int err;
-         char *psnd;
-         char *prcv;
-        int nreq = 0;
-         MPI_Aint lb;
-         MPI_Aint sndinc;
-         MPI_Aint rcvinc;
-         MPI_Request *reqs;
-
-         /* Initialize. */
-         rank = smpi_mpi_comm_rank(comm);
-        INFO1("[%d] algorithm alltoall_basic_linear() called.\n",rank);
-
-        err = smpi_mpi_type_get_extent(sdtype, &lb, &sndinc);
-        err = smpi_mpi_type_get_extent(rdtype, &lb, &rcvinc);
-         /* simple optimization */
-         psnd = ((char *) sbuf) + (rank * sndinc);
-         prcv = ((char *) rbuf) + (rank * rcvinc);
-
-         err = copy_dt( psnd, scount, sdtype, prcv, rcount, rdtype );
-         if (MPI_SUCCESS != err) {
-                   return err;
-         }
-
-         /* If only one process, we're done. */
-         if (1 == size) {
-                   return MPI_SUCCESS;
-         }
-
-         /* Initiate all send/recv to/from others. */
-         reqs =  xbt_malloc(2*(size-1) * sizeof(smpi_mpi_request_t));
-
-         prcv = (char *) rbuf;
-         psnd = (char *) sbuf;
-
-         /* Post all receives first -- a simple optimization */
-         for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
-                   err = smpi_create_request( prcv + (i * rcvinc), rcount, rdtype,
-                                        i, rank,
-                                        system_alltoall_tag,
-                                        comm, &(reqs[nreq]));
-                if (MPI_SUCCESS != err) {
-                        DEBUG2("[%d] failed to create request for rank %d\n",rank,i);
-                        for (i=0;i< nreq;i++) 
-                                xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
-                        return err;
-                }
-                nreq++;
-         }
-         /* Now post all sends in reverse order 
-          *        - We would like to minimize the search time through message queue
-          *                 when messages actually arrive in the order in which they were posted.
-          *                      */
-         for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size ) {
-                   err = smpi_create_request (psnd + (i * sndinc), scount, sdtype, 
-                                         rank, i,
-                                                    system_alltoall_tag, 
-                                         comm, &(reqs[nreq]));
-                if (MPI_SUCCESS != err) {
-                        DEBUG2("[%d] failed to create request for rank %d\n",rank,i);
-                        for (i=0;i< nreq;i++) 
-                                xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
-                        return err;
-                }
-                nreq++;
-        }
-
-        /* Start your engines.  This will never return an error. */
-        for ( i=0; i< nreq/2; i++ ) {
-            smpi_mpi_irecv( reqs[i] );
-        }
-        for ( i= nreq/2; i<nreq; i++ ) {
-            smpi_mpi_isend( reqs[i] );
-        }
-
-
-        /* Wait for them all.  If there's an error, note that we don't
-         * care what the error was -- just that there *was* an error.  The
-         * PML will finish all requests, even if one or more of them fail.
-         * i.e., by the end of this call, all the requests are free-able.
-         * So free them anyway -- even if there was an error, and return
-         * the error after we free everything. */
-
-         err = smpi_mpi_waitall(nreq, reqs, MPI_STATUS_IGNORE);
-
-         /* Free the reqs */
-        for (i=0;i< 2*(size-1);i++) {
-            xbt_mallocator_release(smpi_global->request_mallocator, reqs[i]);
-        }
-         return err;
+  int system_tag = 777;
+  int i, rank, size, err, count;
+  MPI_Aint lb;
+  MPI_Aint sendext = 0;
+  MPI_Aint recvext = 0;
+  MPI_Request *requests;
+
+  // FIXME: check implementation
+  rank = smpi_comm_rank(comm);
+  size = smpi_comm_size(comm);
+  XBT_DEBUG("<%d> algorithm alltoall_bruck() called.", rank);
+  smpi_datatype_extent(sendtype, &lb, &sendext);
+  smpi_datatype_extent(recvtype, &lb, &recvext);
+  /* Local copy from self */
+  err =
+      smpi_datatype_copy((char *)sendbuf + rank * sendcount * sendext, 
+                         sendcount, sendtype, 
+                         (char *)recvbuf + rank * recvcount * recvext,
+                         recvcount, recvtype);
+  if (err == MPI_SUCCESS && size > 1) {
+    /* Initiate all send/recv to/from others. */
+    requests = xbt_new(MPI_Request, 2 * (size - 1));
+    count = 0;
+    /* Create all receives that will be posted first */
+    for (i = 0; i < size; ++i) {
+      if (i == rank) {
+        XBT_DEBUG("<%d> skip request creation [src = %d, recvcount = %d]",
+               rank, i, recvcount);
+        continue;
+      }
+      requests[count] =
+          smpi_irecv_init((char *)recvbuf + i * recvcount * recvext, recvcount,
+                          recvtype, i, system_tag, comm);
+      count++;
+    }
+    /* Now create all sends  */
+    for (i = 0; i < size; ++i) {
+      if (i == rank) {
+        XBT_DEBUG("<%d> skip request creation [dst = %d, sendcount = %d]",
+               rank, i, sendcount);
+        continue;
+      }
+      requests[count] =
+          smpi_isend_init((char *)sendbuf + i * sendcount * sendext, sendcount,
+                          sendtype, i, system_tag, comm);
+      count++;
+    }
+    /* Wait for them all. */
+    smpi_mpi_startall(count, requests);
+    XBT_DEBUG("<%d> wait for %d requests", rank, count);
+    smpi_mpi_waitall(count, requests, MPI_STATUS_IGNORE);
+    for(i = 0; i < count; i++) {
+      if(requests[i]!=MPI_REQUEST_NULL) smpi_mpi_request_free(&requests[i]);
+    }
+    xbt_free(requests);
+  }
+  return MPI_SUCCESS;
 }
 
-
 /**
- * Alltoall Bruck 
- *
- * Openmpi calls this routine when the message size sent to each rank < 2000 bytes and size < 12
+ * Alltoall basic_linear (STARMPI:alltoall-simple)
  **/
-
-
-int smpi_coll_tuned_alltoall_bruck(void *sendbuf, int sendcount, MPI_Datatype sdtype,
-                   void* recvbuf, int recvcount, MPI_Datatype rdtype, MPI_Comm comm)
+int smpi_coll_tuned_alltoall_basic_linear(void *sendbuf, int sendcount,
+                                          MPI_Datatype sendtype,
+                                          void *recvbuf, int recvcount,
+                                          MPI_Datatype recvtype,
+                                          MPI_Comm comm)
 {
-/*       int size = comm->size;
-         int i, k, line = -1;
-         int sendto, recvfrom, distance, *displs=NULL, *blen=NULL;
-         int maxpacksize, packsize, position;
-         char * tmpbuf=NULL, *packbuf=NULL;
-         ptrdiff_t lb, sext, rext;
-         int err = 0;
-         int weallocated = 0;
-         MPI_Datatype iddt;
-
-         rank = smpi_mpi_comm_rank(comm);
-*/
-         INFO0("coll:tuned:alltoall_intra_bruck ** NOT IMPLEMENTED YET**");
-/*
-         displs = xbt_malloc(size*sizeof(int));
-         blen =   xbt_malloc(size*sizeof(int));
-
-         weallocated = 1;
-*/
-         /* Prepare for packing data */
-/*
-         err = MPI_Pack_size( scount*size, sdtype, comm, &maxpacksize );
-         if (err != MPI_SUCCESS) {  }
-*/
-         /* pack buffer allocation */
-/*       packbuf = (char*) malloc((unsigned) maxpacksize);
-         if (packbuf == NULL) { }
-*/
-         /* tmp buffer allocation for message data */
-/*       tmpbuf = xbt_malloc(scount*size*sext);
-         if (tmpbuf == NULL) {  }
-*/
-
-         /* Step 1 - local rotation - shift up by rank */
-/*       err = ompi_ddt_copy_content_same_ddt (sdtype, (int32_t) ((size-rank)*scount),
-                               tmpbuf, ((char*)sbuf)+rank*scount*sext);
-         if (err<0) {
-                   line = __LINE__; err = -1; goto err_hndl;
-         }
-
-         if (rank != 0) {
-                   err = ompi_ddt_copy_content_same_ddt (sdtype, (int32_t) (rank*scount),
-                                         tmpbuf+(size-rank)*scount*sext, (char*)sbuf);
-                   if (err<0) {
-                               line = __LINE__; err = -1; goto err_hndl;
-                   }
-         }
-*/
-         /* perform communication step */
-/*       for (distance = 1; distance < size; distance<<=1) {
-*/
-                   /* send data to "sendto" */
-/*                 sendto = (rank+distance)%size;
-                   recvfrom = (rank-distance+size)%size;
-                   packsize = 0;
-                   k = 0;
-*/
-                   /* create indexed datatype */
-//                 for (i = 1; i < size; i++) {
-//                             if ((i&distance) == distance) {
-//                                       displs[k] = i*scount; blen[k] = scount;
-//                                       k++;
-//                             }
-//                 }
-                   /* Set indexes and displacements */
-//                 err = MPI_Type_indexed(k, blen, displs, sdtype, &iddt);
-//                 if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
-//                 /* Commit the new datatype */
-///                err = MPI_Type_commit(&iddt);
-//                 if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
-
-                   /* have the new distribution ddt, pack and exchange data */
-//                 err = MPI_Pack(tmpbuf, 1, iddt, packbuf, maxpacksize, &packsize, comm);
-//                 if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
-
-                   /* Sendreceive */
-//                 err = ompi_coll_tuned_sendrecv ( packbuf, packsize, MPI_PACKED, sendto,
-//                                       MCA_COLL_BASE_TAG_ALLTOALL,
-//                                       rbuf, packsize, MPI_PACKED, recvfrom,
-//                                       MCA_COLL_BASE_TAG_ALLTOALL,
-//                                       comm, MPI_STATUS_IGNORE, rank);
-//                 if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
-
-                   /* Unpack data from rbuf to tmpbuf */
-//                 position = 0;
-//         err = MPI_Unpack(rbuf, packsize, &position,
-//                                       tmpbuf, 1, iddt, comm);
-//                 if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; }
-
-                   /* free ddt */
-//                 err = MPI_Type_free(&iddt);
-//                 if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl;  }
-//       } /* end of for (distance = 1... */
-
-         /* Step 3 - local rotation - */
-//       for (i = 0; i < size; i++) {
-
-//                 err = ompi_ddt_copy_content_same_ddt (rdtype, (int32_t) rcount,
-//                                       ((char*)rbuf)+(((rank-i+size)%size)*rcount*rext),
-//                                       tmpbuf+i*rcount*rext);
-//
-//       if (err<0) {
-//                             line = __LINE__; err = -1; goto err_hndl;
-//                 }
-//       }
-
-         /* Step 4 - clean up */
-/*       if (tmpbuf != NULL) free(tmpbuf);
-         if (packbuf != NULL) free(packbuf);
-         if (weallocated) {
-                   if (displs != NULL) free(displs);
-                   if (blen != NULL) free(blen);
-         }
-         return OMPI_SUCCESS;
-
-err_hndl:
-         OPAL_OUTPUT((ompi_coll_tuned_stream,"%s:%4d\tError occurred %d, rank %2d", __FILE__,line,err,rank));
-         if (tmpbuf != NULL) free(tmpbuf);
-         if (packbuf != NULL) free(packbuf);
-         if (weallocated) {
-                   if (displs != NULL) free(displs);
-                   if (blen != NULL) free(blen);
-         }
-         return err;
-         */
-         return -1; /* FIXME: to be changed*/
+  int system_tag = 888;
+  int i, rank, size, err, count;
+  MPI_Aint lb = 0, sendext = 0, recvext = 0;
+  MPI_Request *requests;
+
+  /* Initialize. */
+  rank = smpi_comm_rank(comm);
+  size = smpi_comm_size(comm);
+  XBT_DEBUG("<%d> algorithm alltoall_basic_linear() called.", rank);
+  smpi_datatype_extent(sendtype, &lb, &sendext);
+  smpi_datatype_extent(recvtype, &lb, &recvext);
+  /* simple optimization */
+  err = smpi_datatype_copy((char *)sendbuf + rank * sendcount * sendext, 
+                           sendcount, sendtype, 
+                           (char *)recvbuf + rank * recvcount * recvext, 
+                           recvcount, recvtype);
+  if (err == MPI_SUCCESS && size > 1) {
+    /* Initiate all send/recv to/from others. */
+    requests = xbt_new(MPI_Request, 2 * (size - 1));
+    /* Post all receives first -- a simple optimization */
+    count = 0;
+    for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
+      requests[count] =
+          smpi_irecv_init((char *)recvbuf + i * recvcount * recvext, recvcount, 
+                          recvtype, i, system_tag, comm);
+      count++;
+    }
+    /* Now post all sends in reverse order
+     *   - We would like to minimize the search time through message queue
+     *     when messages actually arrive in the order in which they were posted.
+     * TODO: check the previous assertion
+     */
+    for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size) {
+      requests[count] =
+          smpi_isend_init((char *)sendbuf + i * sendcount * sendext, sendcount,
+                          sendtype, i, system_tag, comm);
+      count++;
+    }
+    /* Wait for them all. */
+    smpi_mpi_startall(count, requests);
+    XBT_DEBUG("<%d> wait for %d requests", rank, count);
+    smpi_mpi_waitall(count, requests, MPI_STATUS_IGNORE);
+    for(i = 0; i < count; i++) {
+      if(requests[i]!=MPI_REQUEST_NULL) smpi_mpi_request_free(&requests[i]);
+    }
+    xbt_free(requests);
+  }
+  return err;
 }
 
-
-/**
- * -----------------------------------------------------------------------------------------------------
- * example usage
- * -----------------------------------------------------------------------------------------------------
- **/
-/*
- * int main() {
-
-      int rank; 
-      int size=12;
-
-      proc_tree_t tree;
-      for (rank=0;rank<size;rank++) {
-            printf("--------------tree for rank %d ----------\n",rank);
-            tree = alloc_tree( 2 );
-            build_tree( rank, size, &tree );
-            print_tree( tree );
-            free_tree( tree );
-   
+int smpi_coll_basic_alltoallv(void *sendbuf, int *sendcounts,
+                              int *senddisps, MPI_Datatype sendtype,
+                              void *recvbuf, int *recvcounts,
+                              int *recvdisps, MPI_Datatype recvtype,
+                              MPI_Comm comm)
+{
+  int system_tag = 889;
+  int i, rank, size, err, count;
+  MPI_Aint lb = 0, sendext = 0, recvext = 0;
+  MPI_Request *requests;
+
+  /* Initialize. */
+  rank = smpi_comm_rank(comm);
+  size = smpi_comm_size(comm);
+  XBT_DEBUG("<%d> algorithm basic_alltoallv() called.", rank);
+  smpi_datatype_extent(sendtype, &lb, &sendext);
+  smpi_datatype_extent(recvtype, &lb, &recvext);
+  /* Local copy from self */
+  err =
+      smpi_datatype_copy((char *)sendbuf + senddisps[rank] * sendext, 
+                         sendcounts[rank], sendtype,
+                         (char *)recvbuf + recvdisps[rank] * recvext, 
+                         recvcounts[rank], recvtype);
+  if (err == MPI_SUCCESS && size > 1) {
+    /* Initiate all send/recv to/from others. */
+    requests = xbt_new(MPI_Request, 2 * (size - 1));
+    count = 0;
+    /* Create all receives that will be posted first */
+    for (i = 0; i < size; ++i) {
+      if (i == rank || recvcounts[i] == 0) {
+        XBT_DEBUG
+            ("<%d> skip request creation [src = %d, recvcounts[src] = %d]",
+             rank, i, recvcounts[i]);
+        continue;
       }
-      printf("-------------- bcast ----------\n");
-      for (rank=0;rank<size;rank++) {
-              bcast( rank, size );
+      requests[count] =
+          smpi_irecv_init((char *)recvbuf + recvdisps[i] * recvext, 
+                          recvcounts[i], recvtype, i, system_tag, comm);
+      count++;
+    }
+    /* Now create all sends  */
+    for (i = 0; i < size; ++i) {
+      if (i == rank || sendcounts[i] == 0) {
+        XBT_DEBUG
+            ("<%d> skip request creation [dst = %d, sendcounts[dst] = %d]",
+             rank, i, sendcounts[i]);
+        continue;
       }
-
-
+      requests[count] =
+          smpi_isend_init((char *)sendbuf + senddisps[i] * sendext, 
+                          sendcounts[i], sendtype, i, system_tag, comm);
+      count++;
+    }
+    /* Wait for them all. */
+    smpi_mpi_startall(count, requests);
+    XBT_DEBUG("<%d> wait for %d requests", rank, count);
+    smpi_mpi_waitall(count, requests, MPI_STATUS_IGNORE);
+    for(i = 0; i < count; i++) {
+      if(requests[i]!=MPI_REQUEST_NULL) smpi_mpi_request_free(&requests[i]);
+    }
+    xbt_free(requests);
+  }
+  return err;
 }
-*/
-
-                
-