1 #include "colls_private.h"
6 int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
7 MPI_Datatype stype, void *rbuf,
8 int rcount, MPI_Datatype rtype,
11 int src, dst, comm_size, rank;
12 comm_size = smpi_comm_size(comm);
13 rank = smpi_comm_rank(comm);
14 MPI_Aint rextent, sextent;
15 rextent = smpi_datatype_get_extent(rtype);
16 sextent = smpi_datatype_get_extent(stype);
17 int tag = COLL_TAG_ALLGATHER;
19 int i, send_offset, recv_offset;
20 int intra_rank, inter_rank;
21 intra_rank = rank % NUM_CORE;
22 inter_rank = rank / NUM_CORE;
23 int inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE;
24 int num_core_in_current_smp = NUM_CORE;
26 if(comm_size%NUM_CORE)
27 THROWF(arg_error,0, "allgather SMP NTS algorithm can't be used with non multiple of NUM_CORE=%d number of processes ! ", NUM_CORE);
29 /* for too small number of processes, use default implementation */
30 if (comm_size <= NUM_CORE) {
31 XBT_WARN("MPI_allgather_SMP_NTS use default MPI_allgather.");
32 smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
36 // the last SMP node may have fewer number of running processes than all others
37 if (inter_rank == (inter_comm_size - 1)) {
38 num_core_in_current_smp = comm_size - (inter_rank * NUM_CORE);
40 //copy corresponding message from sbuf to rbuf
41 recv_offset = rank * rextent * rcount;
42 smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
43 ((char *) rbuf + recv_offset), rcount, rtype, rank, tag, comm,
46 //gather to root of each SMP
48 for (i = 1; i < num_core_in_current_smp; i++) {
51 (inter_rank * NUM_CORE) + (intra_rank + i) % (num_core_in_current_smp);
53 (inter_rank * NUM_CORE) + (intra_rank - i +
54 num_core_in_current_smp) %
55 (num_core_in_current_smp);
56 recv_offset = src * rextent * rcount;
58 smpi_mpi_sendrecv(sbuf, scount, stype, dst, tag,
59 ((char *) rbuf + recv_offset), rcount, rtype, src, tag, comm,
64 // INTER-SMP-ALLGATHER
65 // Every root of each SMP node post INTER-Sendrecv, then do INTRA-Bcast for each receiving message
66 // Use logical ring algorithm
69 if (intra_rank == 0) {
70 MPI_Request *rrequest_array = xbt_new(MPI_Request, inter_comm_size - 1);
71 MPI_Request *srequest_array = xbt_new(MPI_Request, inter_comm_size - 1);
73 src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE;
74 dst = ((inter_rank + 1) % inter_comm_size) * NUM_CORE;
76 // post all inter Irecv
77 for (i = 0; i < inter_comm_size - 1; i++) {
79 ((inter_rank - i - 1 +
80 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
81 rrequest_array[i] = smpi_mpi_irecv((char *)rbuf + recv_offset, rcount * NUM_CORE,
82 rtype, src, tag + i, comm);
88 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
89 srequest_array[0] = smpi_mpi_isend((char *)rbuf + send_offset, scount * NUM_CORE,
90 stype, dst, tag, comm);
92 // loop : recv-inter , send-inter, send-intra (linear-bcast)
93 for (i = 0; i < inter_comm_size - 2; i++) {
95 ((inter_rank - i - 1 +
96 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
97 smpi_mpi_wait(&rrequest_array[i], MPI_STATUS_IGNORE);
98 srequest_array[i + 1] = smpi_mpi_isend((char *)rbuf + recv_offset, scount * NUM_CORE,
99 stype, dst, tag + i + 1, comm);
100 if (num_core_in_current_smp > 1) {
101 smpi_mpi_send((char *)rbuf + recv_offset, scount * NUM_CORE,
102 stype, (rank + 1), tag + i + 1, comm);
106 // recv last message and send_intra
108 ((inter_rank - i - 1 +
109 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
110 //recv_offset = ((inter_rank + 1) % inter_comm_size) * NUM_CORE * sextent * scount;
111 //i=inter_comm_size-2;
112 smpi_mpi_wait(&rrequest_array[i], MPI_STATUS_IGNORE);
113 if (num_core_in_current_smp > 1) {
114 smpi_mpi_send((char *)rbuf + recv_offset, scount * NUM_CORE,
115 stype, (rank + 1), tag + i + 1, comm);
118 smpi_mpi_waitall(inter_comm_size - 1, srequest_array, MPI_STATUSES_IGNORE);
119 xbt_free(rrequest_array);
120 xbt_free(srequest_array);
122 // last rank of each SMP
123 else if (intra_rank == (num_core_in_current_smp - 1)) {
124 for (i = 0; i < inter_comm_size - 1; i++) {
126 ((inter_rank - i - 1 +
127 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
128 smpi_mpi_recv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype,
129 rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
132 // intermediate rank of each SMP
134 for (i = 0; i < inter_comm_size - 1; i++) {
136 ((inter_rank - i - 1 +
137 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
138 smpi_mpi_recv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype,
139 rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
140 smpi_mpi_send((char *) rbuf + recv_offset, (scount * NUM_CORE), stype,
141 (rank + 1), tag + i + 1, comm);