--- /dev/null
+
+/*
+ * smpi_coll_tuned_allgatherv_ompi_neighborexchange
+ *
+ * Function: allgatherv using N/2 steps (O(N))
+ * Accepts: Same arguments as MPI_Allgatherv
+ * Returns: MPI_SUCCESS or error code
+ *
+ * Description: Neighbor Exchange algorithm for allgather adapted for
+ * allgatherv.
+ * Described by Chen et al. in
+ * "Performance Evaluation of Allgather Algorithms on
+ * Terascale Linux Cluster with Fast Ethernet",
+ * Proceedings of the Eighth International Conference on
+ * High-Performance Computing in Asia-Pacific Region
+ * (HPCASIA'05), 2005
+ *
+ * Rank r exchanges a message with one of its neighbors and
+ * forwards the received data further in the next step.
+ *
+ * No additional memory requirements.
+ *
+ * Limitations: The algorithm works only with an even number of processes.
+ * For an odd number of processes we switch to the ring algorithm.
+ *
+ * Example on 6 nodes:
+ * Initial state
+ * # 0 1 2 3 4 5
+ * [0] [ ] [ ] [ ] [ ] [ ]
+ * [ ] [1] [ ] [ ] [ ] [ ]
+ * [ ] [ ] [2] [ ] [ ] [ ]
+ * [ ] [ ] [ ] [3] [ ] [ ]
+ * [ ] [ ] [ ] [ ] [4] [ ]
+ * [ ] [ ] [ ] [ ] [ ] [5]
+ * Step 0:
+ * # 0 1 2 3 4 5
+ * [0] [0] [ ] [ ] [ ] [ ]
+ * [1] [1] [ ] [ ] [ ] [ ]
+ * [ ] [ ] [2] [2] [ ] [ ]
+ * [ ] [ ] [3] [3] [ ] [ ]
+ * [ ] [ ] [ ] [ ] [4] [4]
+ * [ ] [ ] [ ] [ ] [5] [5]
+ * Step 1:
+ * # 0 1 2 3 4 5
+ * [0] [0] [0] [ ] [ ] [0]
+ * [1] [1] [1] [ ] [ ] [1]
+ * [ ] [2] [2] [2] [2] [ ]
+ * [ ] [3] [3] [3] [3] [ ]
+ * [4] [ ] [ ] [4] [4] [4]
+ * [5] [ ] [ ] [5] [5] [5]
+ * Step 2:
+ * # 0 1 2 3 4 5
+ * [0] [0] [0] [0] [0] [0]
+ * [1] [1] [1] [1] [1] [1]
+ * [2] [2] [2] [2] [2] [2]
+ * [3] [3] [3] [3] [3] [3]
+ * [4] [4] [4] [4] [4] [4]
+ * [5] [5] [5] [5] [5] [5]
+ */
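+
+/*
+ * Example (hypothetical values, standard MPI_Allgatherv semantics): with
+ * size = 4 and rcounts = {1, 2, 3, 4} elements of rdtype, a typical caller
+ * would pass rdispls = {0, 1, 3, 6}, so rbuf must provide room for at
+ * least 10 elements on every rank.
+ */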
+
+#include "colls_private.h"
+#define MCA_COLL_BASE_TAG_ALLGATHERV 444
+
+int
+smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
+ MPI_Datatype sdtype,
+ void* rbuf, int *rcounts, int *rdispls,
+ MPI_Datatype rdtype,
+ MPI_Comm comm)
+{
+ int line = -1;
+ int rank, size;
+ int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
+
+ int i, even_rank;
+ int err = 0;
+ ptrdiff_t slb, rlb, sext, rext;
+ char *tmpsend = NULL, *tmprecv = NULL;
+
+ size = smpi_comm_size(comm);
+ rank = smpi_comm_rank(comm);
+
+ if (size % 2) {
+ XBT_DEBUG(
+ "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
+ size);
+ return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
+ rbuf, rcounts,
+ rdispls, rdtype,
+ comm);
+ }
+
+ XBT_DEBUG(
+ "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);
+
+ err = smpi_datatype_extent (sdtype, &slb, &sext);
+ if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+ err = smpi_datatype_extent (rdtype, &rlb, &rext);
+ if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+
+ /* Initialization step:
+ - if send buffer is not MPI_IN_PLACE, copy send buffer to
+ the appropriate block of receive buffer
+ */
+ tmprecv = (char*) rbuf + rdispls[rank] * rext;
+ if (MPI_IN_PLACE != sbuf) {
+ tmpsend = (char*) sbuf;
+ err = smpi_datatype_copy(tmpsend, scount, sdtype,
+ tmprecv, rcounts[rank], rdtype);
+ if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+ }
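+
+ /* Note: with MPI_IN_PLACE the caller's contribution is assumed to already
+ reside at rbuf + rdispls[rank] * rext, so no copy is performed. */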
+
+ /* Determine neighbors, order in which blocks will arrive, etc. */
+ even_rank = !(rank % 2);
+ if (even_rank) {
+ neighbor[0] = (rank + 1) % size;
+ neighbor[1] = (rank - 1 + size) % size;
+ recv_data_from[0] = rank;
+ recv_data_from[1] = rank;
+ offset_at_step[0] = (+2);
+ offset_at_step[1] = (-2);
+ } else {
+ neighbor[0] = (rank - 1 + size) % size;
+ neighbor[1] = (rank + 1) % size;
+ recv_data_from[0] = neighbor[0];
+ recv_data_from[1] = neighbor[0];
+ offset_at_step[0] = (-2);
+ offset_at_step[1] = (+2);
+ }
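+
+ /* Worked example for size = 6: even rank 2 gets neighbor = {3, 1} and
+ starts receiving from its own block (recv_data_from = {2, 2}), moving
+ +2 / -2 blocks per step; odd rank 3 gets neighbor = {2, 4} and starts
+ receiving from block 2, its even neighbor's block. */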
+
+ /* Communication loop:
+ - The first step is special: exchange a single block with neighbor[0].
+ - In the remaining steps:
+ update recv_data_from according to the offset, and
+ exchange two blocks with the appropriate neighbor;
+ the send location becomes the previous receive location.
+ Note: we need to create indexed datatypes to send and receive these
+ blocks properly.
+ */
+ tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
+ tmpsend = (char*)rbuf + rdispls[rank] * rext;
+ smpi_mpi_sendrecv(tmpsend, rcounts[rank], rdtype,
+ neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
+ tmprecv, rcounts[neighbor[0]], rdtype,
+ neighbor[0], MCA_COLL_BASE_TAG_ALLGATHERV,
+ comm, MPI_STATUS_IGNORE);
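+
+ /* After this first exchange each adjacent pair of ranks (2k, 2k+1) holds
+ both blocks 2k and 2k+1 (Step 0 in the diagram above). */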
+
+ /* Determine initial sending counts and displacements */
+ if (even_rank) {
+ send_data_from = rank;
+ } else {
+ send_data_from = recv_data_from[0];
+ }
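+
+ /* Either way, the pair (send_data_from, send_data_from + 1) is exactly
+ the two blocks this rank holds after step 0: its own block and the one
+ obtained from neighbor[0]. */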
+
+ for (i = 1; i < (size / 2); i++) {
+ MPI_Datatype new_rdtype, new_sdtype;
+ int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
+ const int i_parity = i % 2;
+ recv_data_from[i_parity] =
+ (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
+
+ /* Create new indexed types for sending and receiving.
+ We are sending data from ranks (send_data_from) and (send_data_from+1)
+ We are receiving data from ranks (recv_data_from[i_parity]) and
+ (recv_data_from[i_parity]+1).
+ */
+
+ new_scounts[0] = rcounts[send_data_from];
+ new_scounts[1] = rcounts[(send_data_from + 1)];
+ new_sdispls[0] = rdispls[send_data_from];
+ new_sdispls[1] = rdispls[(send_data_from + 1)];
+ err = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype,
+ &new_sdtype);
+ if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+ smpi_datatype_commit(&new_sdtype);
+
+ new_rcounts[0] = rcounts[recv_data_from[i_parity]];
+ new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
+ new_rdispls[0] = rdispls[recv_data_from[i_parity]];
+ new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
+ err = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype,
+ &new_rdtype);
+ if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
+ smpi_datatype_commit(&new_rdtype);
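+
+ /* The indexed datatypes address the two blocks directly at their final
+ displacements inside rbuf, so the exchange needs no temporary buffer
+ (hence the "no additional memory" property noted above). */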
+
+ tmprecv = (char*)rbuf;
+ tmpsend = (char*)rbuf;
+
+ /* Sendreceive */
+ smpi_mpi_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
+ MCA_COLL_BASE_TAG_ALLGATHERV,
+ tmprecv, 1, new_rdtype, neighbor[i_parity],
+ MCA_COLL_BASE_TAG_ALLGATHERV,
+ comm, MPI_STATUS_IGNORE);
+
+ send_data_from = recv_data_from[i_parity];
+
+ smpi_datatype_free(&new_sdtype);
+ smpi_datatype_free(&new_rdtype);
+ }
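+
+ /* For the 6-rank example above the loop runs for i = 1, 2: at i = 1,
+ rank 2 sends blocks {2,3} to rank 1 and receives {0,1}; at i = 2 it
+ sends {0,1} to rank 3 and receives {4,5}, matching Steps 1 and 2 in
+ the diagram. */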
+
+ return MPI_SUCCESS;
+
+ err_hndl:
+ XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
+ __FILE__, line, err, rank);
+ return err;
+}
- for everything else use ring.
*/
if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
- return smpi_coll_tuned_allgather_intra_recursivedoubling(sbuf, scount, sdtype,
+ return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
- comm, module);
+ comm);
} else if (total_dsize <= 81920) {
- return smpi_coll_tuned_allgather_intra_bruck(sbuf, scount, sdtype,
+ return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
- comm, module);
+ comm);
}
- return smpi_coll_tuned_allgather_intra_ring(sbuf, scount, sdtype,
+ return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
rbuf, rcount, rdtype,
- comm, module);
+ comm);
#endif /* defined(USE_MPICH2_DECISION) */
}
comm);
} else {
-// if (communicator_size % 2) {
+ if (communicator_size % 2) {
return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
comm);
-/* } else {
- return smpi_coll_tuned_allgatherv_intra_neighborexchange(sbuf, scount, sdtype,
+ } else {
+ return smpi_coll_tuned_allgatherv_ompi_neighborexchange(sbuf, scount, sdtype,
rbuf, rcounts, rdispls, rdtype,
- comm, module);
- }*/
+ comm);
+ }
}
}
/*