+ // arity=2: a binary tree, arity=4 seem to be a good setting (see P2P-MPI))
+ nary_tree_barrier(comm, 4);
+}
+
+/**
+ * Linear gather: collect one fixed-size contribution per rank at `root`.
+ *
+ * Non-root ranks send `sendcount` elements of `sendtype` to `root`.
+ * The root copies its own contribution into slot `root` of `recvbuf` and
+ * receives the contribution of rank `src` into slot `src`; slot i starts at
+ * byte offset i * recvcount * recvext, where recvext is the value reported
+ * by smpi_datatype_extent() for `recvtype`.
+ *
+ * If the root passes MPI_IN_PLACE as `sendbuf`, the local copy is skipped:
+ * the root's contribution is taken to be already stored in its slot.
+ *
+ * `recvbuf`, `recvcount` and `recvtype` are only read on the root.
+ */
+void smpi_mpi_gather(void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                     void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                     int root, MPI_Comm comm)
+{
+  int system_tag = COLL_TAG_GATHER;
+  int rank, size, src, index;
+  MPI_Aint lb = 0, recvext = 0;
+  MPI_Request *requests;
+
+  rank = smpi_comm_rank(comm);
+  size = smpi_comm_size(comm);
+  if(rank != root) {
+    // Send buffer to root
+    smpi_mpi_send(sendbuf, sendcount, sendtype, root, system_tag, comm);
+  } else {
+    // FIXME: check for errors
+    smpi_datatype_extent(recvtype, &lb, &recvext);
+    // Local copy from root. MPI_IN_PLACE is a sentinel, not a readable
+    // buffer, so it must never be handed to the copy routine.
+    if(sendbuf != MPI_IN_PLACE) {
+      smpi_datatype_copy(sendbuf, sendcount, sendtype,
+                         (char *)recvbuf + root * recvcount * recvext,
+                         recvcount, recvtype);
+    }
+    // Receive buffers from senders: one request per rank other than root
+    requests = xbt_new(MPI_Request, size - 1);
+    index = 0;
+    for(src = 0; src < size; src++) {
+      if(src != root) {
+        requests[index] = smpi_irecv_init((char *)recvbuf + src * recvcount * recvext,
+                                          recvcount, recvtype,
+                                          src, system_tag, comm);
+        index++;
+      }
+    }
+    // Wait for completion of irecv's.
+    smpi_mpi_startall(size - 1, requests);
+    smpi_mpi_waitall(size - 1, requests, MPI_STATUS_IGNORE);
+    xbt_free(requests);
+  }
+}
+
+
+/**
+ * Reduce-scatter implemented as a reduce to rank 0 followed by a scatterv.
+ *
+ * `recvcounts` has one entry per rank of `comm`; the reduction runs over the
+ * sum of all entries, and rank i then receives `recvcounts[i]` elements taken
+ * at element displacement recvcounts[0] + ... + recvcounts[i-1] of the
+ * reduced vector.
+ *
+ * The temporary reduction buffer is allocated (and freed) on every rank.
+ */
+void smpi_mpi_reduce_scatter(void *sendbuf, void *recvbuf, int *recvcounts,
+                             MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+{
+  int i, size, count;
+  int *displs;
+  // recvcounts[] is indexed by rank within comm, so the communicator rank is
+  // needed here, not the global process index.
+  int rank = smpi_comm_rank(comm);
+  void *tmpbuf;
+
+  size = smpi_comm_size(comm);
+  // Prefix sums of recvcounts give each rank's displacement; the final sum is
+  // the length of the vector to reduce.
+  count = 0;
+  displs = xbt_new(int, size);
+  for (i = 0; i < size; i++) {
+    displs[i] = count;
+    count += recvcounts[i];
+  }
+  tmpbuf = (void *)xbt_malloc(count * smpi_datatype_get_extent(datatype));
+  /* arbitrarily choose root as rank 0 */
+  mpi_coll_reduce_fun(sendbuf, tmpbuf, count, datatype, op, 0, comm);
+  smpi_mpi_scatterv(tmpbuf, recvcounts, displs, datatype, recvbuf,
+                    recvcounts[rank], datatype, 0, comm);
+  xbt_free(displs);
+  xbt_free(tmpbuf);
+}
+
+/**
+ * Linear gather with per-rank counts and displacements.
+ *
+ * Non-root ranks send `sendcount` elements of `sendtype` to `root`.
+ * The root copies its own contribution to element displacement
+ * `displs[root]` of `recvbuf` and receives `recvcounts[src]` elements from
+ * each other rank `src` at element displacement `displs[src]`.
+ * Displacements are scaled by the extent that smpi_datatype_extent() reports
+ * for `recvtype`.
+ *
+ * If the root passes MPI_IN_PLACE as `sendbuf`, the local copy is skipped:
+ * the root's contribution is taken to be already stored at its displacement.
+ *
+ * `recvbuf`, `recvcounts`, `displs` and `recvtype` are only read on the root.
+ */
+void smpi_mpi_gatherv(void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                      void *recvbuf, int *recvcounts, int *displs,
+                      MPI_Datatype recvtype, int root, MPI_Comm comm)
+{
+  int system_tag = COLL_TAG_GATHERV;
+  int rank, size, src, index;
+  MPI_Aint lb = 0, recvext = 0;
+  MPI_Request *requests;
+
+  rank = smpi_comm_rank(comm);
+  size = smpi_comm_size(comm);
+  if(rank != root) {
+    // Send buffer to root
+    smpi_mpi_send(sendbuf, sendcount, sendtype, root, system_tag, comm);
+  } else {
+    // FIXME: check for errors
+    smpi_datatype_extent(recvtype, &lb, &recvext);
+    // Local copy from root. MPI_IN_PLACE is a sentinel, not a readable
+    // buffer, so it must never be handed to the copy routine.
+    if(sendbuf != MPI_IN_PLACE) {
+      smpi_datatype_copy(sendbuf, sendcount, sendtype,
+                         (char *)recvbuf + displs[root] * recvext,
+                         recvcounts[root], recvtype);
+    }
+    // Receive buffers from senders: one request per rank other than root
+    requests = xbt_new(MPI_Request, size - 1);
+    index = 0;
+    for(src = 0; src < size; src++) {
+      if(src != root) {
+        requests[index] =
+            smpi_irecv_init((char *)recvbuf + displs[src] * recvext,
+                            recvcounts[src], recvtype, src, system_tag, comm);
+        index++;
+      }
+    }
+    // Wait for completion of irecv's.
+    smpi_mpi_startall(size - 1, requests);
+    smpi_mpi_waitall(size - 1, requests, MPI_STATUS_IGNORE);
+    xbt_free(requests);
+  }
+}
+
+/**
+ * All-to-all linear allgather with a fixed count per rank.
+ *
+ * Each rank copies its own contribution into slot `rank` of `recvbuf`, then
+ * exchanges with every other rank: it sends `sendbuf` to the peer and
+ * receives the peer's contribution into the peer's slot. Slot i starts at
+ * byte offset i * recvcount * extent, with the extent taken from
+ * smpi_datatype_extent() on `recvtype`.
+ */
+void smpi_mpi_allgather(void *sendbuf, int sendcount,
+                        MPI_Datatype sendtype, void *recvbuf,
+                        int recvcount, MPI_Datatype recvtype,
+                        MPI_Comm comm)
+{
+  const int tag = COLL_TAG_ALLGATHER;
+  const int me = smpi_comm_rank(comm);
+  const int nprocs = smpi_comm_size(comm);
+  /* one send and one receive per peer */
+  const int nreqs = 2 * (nprocs - 1);
+  char *dest = (char *)recvbuf;
+  MPI_Aint lower = 0, extent = 0;
+  MPI_Request *reqs;
+  int peer;
+  int slot = 0;
+
+  // FIXME: check for errors
+  smpi_datatype_extent(recvtype, &lower, &extent);
+
+  /* Own contribution goes straight into our slot. */
+  smpi_datatype_copy(sendbuf, sendcount, sendtype,
+                     dest + me * recvcount * extent, recvcount, recvtype);
+
+  /* Prepare a send/receive pair for each peer, send first. */
+  reqs = xbt_new(MPI_Request, nreqs);
+  for(peer = 0; peer < nprocs; peer++) {
+    if(peer == me) {
+      continue;
+    }
+    reqs[slot] = smpi_isend_init(sendbuf, sendcount, sendtype, peer, tag, comm);
+    slot++;
+    reqs[slot] = smpi_irecv_init(dest + peer * recvcount * extent,
+                                 recvcount, recvtype, peer, tag, comm);
+    slot++;
+  }
+
+  /* Launch everything, then block until all transfers are done. */
+  smpi_mpi_startall(nreqs, reqs);
+  smpi_mpi_waitall(nreqs, reqs, MPI_STATUS_IGNORE);
+  xbt_free(reqs);
+}
+
+/**
+ * All-to-all linear allgather with per-rank counts and displacements.
+ *
+ * Each rank copies its own contribution to element displacement
+ * `displs[rank]` of `recvbuf`, then exchanges with every other rank: it
+ * sends `sendbuf` to the peer and receives `recvcounts[peer]` elements at
+ * element displacement `displs[peer]`. Displacements are scaled by the
+ * extent that smpi_datatype_extent() reports for `recvtype`.
+ */
+void smpi_mpi_allgatherv(void *sendbuf, int sendcount,
+                         MPI_Datatype sendtype, void *recvbuf,
+                         int *recvcounts, int *displs,
+                         MPI_Datatype recvtype, MPI_Comm comm)
+{
+  const int tag = COLL_TAG_ALLGATHERV;
+  const int me = smpi_comm_rank(comm);
+  const int nprocs = smpi_comm_size(comm);
+  /* one send and one receive per peer */
+  const int nreqs = 2 * (nprocs - 1);
+  char *dest = (char *)recvbuf;
+  MPI_Aint lower = 0, extent = 0;
+  MPI_Request *reqs;
+  int peer;
+  int slot = 0;
+
+  // FIXME: check for errors
+  smpi_datatype_extent(recvtype, &lower, &extent);
+
+  /* Own contribution goes straight to our displacement. */
+  smpi_datatype_copy(sendbuf, sendcount, sendtype,
+                     dest + displs[me] * extent,
+                     recvcounts[me], recvtype);
+
+  /* Prepare a send/receive pair for each peer, send first. */
+  reqs = xbt_new(MPI_Request, nreqs);
+  for(peer = 0; peer < nprocs; peer++) {
+    if(peer == me) {
+      continue;
+    }
+    reqs[slot] = smpi_isend_init(sendbuf, sendcount, sendtype, peer, tag, comm);
+    slot++;
+    reqs[slot] = smpi_irecv_init(dest + displs[peer] * extent, recvcounts[peer],
+                                 recvtype, peer, tag, comm);
+    slot++;
+  }
+
+  /* Launch everything, then block until all transfers are done. */
+  smpi_mpi_startall(nreqs, reqs);
+  smpi_mpi_waitall(nreqs, reqs, MPI_STATUS_IGNORE);
+  xbt_free(reqs);
+}
+
+/**
+ * Linear scatter: `root` hands one fixed-size slice of `sendbuf` to each rank.
+ *
+ * Non-root ranks receive `recvcount` elements of `recvtype` from `root`.
+ * The root copies slice `root` into its own `recvbuf` (unless `recvbuf` is
+ * MPI_IN_PLACE) and sends slice `dst` to every other rank `dst`; slice i
+ * starts at byte offset i * sendcount * extent, with the extent taken from
+ * smpi_datatype_extent() on `sendtype`.
+ */
+void smpi_mpi_scatter(void *sendbuf, int sendcount, MPI_Datatype sendtype,
+                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
+                      int root, MPI_Comm comm)
+{
+  const int tag = COLL_TAG_SCATTER;
+  const int me = smpi_comm_rank(comm);
+  const int nprocs = smpi_comm_size(comm);
+  const int nreqs = nprocs - 1;
+  char *source = (char *)sendbuf;
+  MPI_Aint lower = 0, extent = 0;
+  MPI_Request *reqs;
+  int peer;
+  int slot = 0;
+
+  /* Everyone but the root just waits for its slice. */
+  if(me != root) {
+    smpi_mpi_recv(recvbuf, recvcount, recvtype, root, tag, comm,
+                  MPI_STATUS_IGNORE);
+    return;
+  }
+
+  // FIXME: check for errors
+  smpi_datatype_extent(sendtype, &lower, &extent);
+
+  /* Root keeps its own slice, unless the caller asked for in-place. */
+  if(recvbuf != MPI_IN_PLACE) {
+    smpi_datatype_copy(source + root * sendcount * extent,
+                       sendcount, sendtype, recvbuf, recvcount, recvtype);
+  }
+
+  /* One send per rank other than the root, in rank order. */
+  reqs = xbt_new(MPI_Request, nreqs);
+  for(peer = 0; peer < nprocs; peer++) {
+    if(peer == root) {
+      continue;
+    }
+    reqs[slot] = smpi_isend_init(source + peer * sendcount * extent,
+                                 sendcount, sendtype, peer, tag, comm);
+    slot++;
+  }
+
+  /* Launch the sends, then block until all are done. */
+  smpi_mpi_startall(nreqs, reqs);
+  smpi_mpi_waitall(nreqs, reqs, MPI_STATUS_IGNORE);
+  xbt_free(reqs);
+}
+
+/**
+ * Linear scatter with per-rank counts and displacements.
+ *
+ * Non-root ranks receive `recvcount` elements of `recvtype` from `root`.
+ * The root copies `sendcounts[root]` elements found at element displacement
+ * `displs[root]` into its own `recvbuf` (unless `recvbuf` is MPI_IN_PLACE)
+ * and sends `sendcounts[dst]` elements from displacement `displs[dst]` to
+ * every other rank `dst`. Displacements are scaled by the extent that
+ * smpi_datatype_extent() reports for `sendtype`.
+ */
+void smpi_mpi_scatterv(void *sendbuf, int *sendcounts, int *displs,
+                       MPI_Datatype sendtype, void *recvbuf, int recvcount,
+                       MPI_Datatype recvtype, int root, MPI_Comm comm)
+{
+  const int tag = COLL_TAG_SCATTERV;
+  const int me = smpi_comm_rank(comm);
+  const int nprocs = smpi_comm_size(comm);
+  const int nreqs = nprocs - 1;
+  char *source = (char *)sendbuf;
+  MPI_Aint lower = 0, extent = 0;
+  MPI_Request *reqs;
+  int peer;
+  int slot = 0;
+
+  /* Everyone but the root just waits for its slice. */
+  if(me != root) {
+    smpi_mpi_recv(recvbuf, recvcount, recvtype, root, tag, comm,
+                  MPI_STATUS_IGNORE);
+    return;
+  }
+
+  // FIXME: check for errors
+  smpi_datatype_extent(sendtype, &lower, &extent);
+
+  /* Root keeps its own slice, unless the caller asked for in-place. */
+  if(recvbuf != MPI_IN_PLACE) {
+    smpi_datatype_copy(source + displs[root] * extent, sendcounts[root],
+                       sendtype, recvbuf, recvcount, recvtype);
+  }
+
+  /* One send per rank other than the root, in rank order. */
+  reqs = xbt_new(MPI_Request, nreqs);
+  for(peer = 0; peer < nprocs; peer++) {
+    if(peer == root) {
+      continue;
+    }
+    reqs[slot] = smpi_isend_init(source + displs[peer] * extent, sendcounts[peer],
+                                 sendtype, peer, tag, comm);
+    slot++;
+  }
+
+  /* Launch the sends, then block until all are done. */
+  smpi_mpi_startall(nreqs, reqs);
+  smpi_mpi_waitall(nreqs, reqs, MPI_STATUS_IGNORE);
+  xbt_free(reqs);
+}
+
+/**
+ * Linear reduce of `count` elements of `datatype` onto `root`.
+ *
+ * When `sendbuf` is MPI_IN_PLACE, the input is first copied out of `recvbuf`
+ * into a scratch buffer, which is released before returning on every path.
+ *
+ * Non-commutative operators are delegated to
+ * smpi_coll_tuned_reduce_ompi_basic_linear(). Otherwise non-root ranks send
+ * their input to `root`; the root seeds `recvbuf` with its own input,
+ * receives every other rank's input into a private temporary buffer, and
+ * folds each one into `recvbuf` with smpi_op_apply() in the order in which
+ * the receives complete.
+ */
+void smpi_mpi_reduce(void *sendbuf, void *recvbuf, int count,
+                     MPI_Datatype datatype, MPI_Op op, int root,
+                     MPI_Comm comm)
+{
+  int system_tag = COLL_TAG_REDUCE;
+  int rank, size, src, index;
+  MPI_Aint lb = 0, dataext = 0;
+  MPI_Request *requests;
+  void **tmpbufs;
+  char *sendtmpbuf = (char *) sendbuf;
+
+  if(sendbuf == MPI_IN_PLACE) {
+    // MPI_IN_PLACE is a sentinel: take the input from recvbuf instead, via a
+    // scratch copy so that recvbuf can be overwritten with the result.
+    sendtmpbuf = (char *)xbt_malloc(count * smpi_datatype_get_extent(datatype));
+    smpi_datatype_copy(recvbuf, count, datatype, sendtmpbuf, count, datatype);
+  }
+
+  rank = smpi_comm_rank(comm);
+  size = smpi_comm_size(comm);
+
+  if(!smpi_op_is_commute(op)) {
+    // non commutative case, use a working algo from openmpi
+    smpi_coll_tuned_reduce_ompi_basic_linear(sendtmpbuf, recvbuf, count,
+                                             datatype, op, root, comm);
+  } else if(rank != root) {
+    // Send buffer to root
+    smpi_mpi_send(sendtmpbuf, count, datatype, root, system_tag, comm);
+  } else {
+    // FIXME: check for errors
+    smpi_datatype_extent(datatype, &lb, &dataext);
+    // Local copy from root
+    if (sendtmpbuf && recvbuf)
+      smpi_datatype_copy(sendtmpbuf, count, datatype, recvbuf, count, datatype);
+    // Receive buffers from senders
+    //TODO: make a MPI_barrier here ?
+    requests = xbt_new(MPI_Request, size - 1);
+    tmpbufs = xbt_new(void *, size - 1);
+    index = 0;
+    for(src = 0; src < size; src++) {
+      if(src != root) {
+        // FIXME: possibly overkill if we have contiguous/noncontiguous data
+        // mapping...
+        tmpbufs[index] = xbt_malloc(count * dataext);
+        requests[index] =
+            smpi_irecv_init(tmpbufs[index], count, datatype, src,
+                            system_tag, comm);
+        index++;
+      }
+    }
+    // Wait for completion of irecv's, folding each contribution into recvbuf
+    // as soon as it has arrived.
+    smpi_mpi_startall(size - 1, requests);
+    for(src = 0; src < size - 1; src++) {
+      index = smpi_mpi_waitany(size - 1, requests, MPI_STATUS_IGNORE);
+      XBT_DEBUG("finished waiting any request with index %d", index);
+      if(index == MPI_UNDEFINED) {
+        break;
+      }
+      if(op) /* op can be MPI_OP_NULL that does nothing */
+        smpi_op_apply(op, tmpbufs[index], recvbuf, &count, &datatype);
+    }
+    for(index = 0; index < size - 1; index++) {
+      xbt_free(tmpbufs[index]);
+    }
+    xbt_free(tmpbufs);
+    xbt_free(requests);
+  }
+
+  // Release the MPI_IN_PLACE scratch copy on every path (non-commutative,
+  // non-root and root alike).
+  // NOTE(review): assumes both the delegated reduce and smpi_mpi_send are
+  // done with the buffer once they return - confirm.
+  if(sendbuf == MPI_IN_PLACE) {
+    xbt_free(sendtmpbuf);
+  }
+}
+
+/**
+ * Allreduce built from two collectives: reduce everything onto rank 0 into
+ * `recvbuf`, then broadcast `recvbuf` from rank 0 to all ranks of `comm`.
+ */
+void smpi_mpi_allreduce(void *sendbuf, void *recvbuf, int count,
+                        MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+{
+  /* Rank that holds the intermediate result. */
+  const int root = 0;
+
+  smpi_mpi_reduce(sendbuf, recvbuf, count, datatype, op, root, comm);
+  smpi_mpi_bcast(recvbuf, count, datatype, root, comm);
+}
+
+void smpi_mpi_scan(void *sendbuf, void *recvbuf, int count,
+ MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
+{
+ int system_tag = -888;
+ int rank, size, other, index;
+ MPI_Aint lb = 0, dataext = 0;
+ MPI_Request *requests;
+ void **tmpbufs;
+
+ rank = smpi_comm_rank(comm);
+ size = smpi_comm_size(comm);
+
+ // FIXME: check for errors
+ smpi_datatype_extent(datatype, &lb, &dataext);