1 #include "colls_private.h"
6 int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
7 MPI_Datatype stype, void *rbuf,
8 int rcount, MPI_Datatype rtype,
11 int src, dst, comm_size, rank;
12 comm_size = smpi_comm_size(comm);
13 rank = smpi_comm_rank(comm);
14 MPI_Aint rextent, sextent;
15 rextent = smpi_datatype_get_extent(rtype);
16 sextent = smpi_datatype_get_extent(stype);
17 int tag = COLL_TAG_ALLGATHER;
19 int i, send_offset, recv_offset;
20 int intra_rank, inter_rank;
21 intra_rank = rank % NUM_CORE;
22 inter_rank = rank / NUM_CORE;
23 int inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE;
24 int num_core_in_current_smp = NUM_CORE;
26 /* for too small number of processes, use default implementation */
27 if (comm_size <= NUM_CORE) {
28 XBT_WARN("MPI_allgather_SMP_NTS use default MPI_allgather.");
29 smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
33 // the last SMP node may have fewer number of running processes than all others
34 if (inter_rank == (inter_comm_size - 1)) {
35 num_core_in_current_smp = comm_size - (inter_rank * NUM_CORE);
37 //copy corresponding message from sbuf to rbuf
38 recv_offset = rank * rextent * rcount;
39 smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
40 ((char *) rbuf + recv_offset), rcount, rtype, rank, tag, comm,
43 //gather to root of each SMP
45 for (i = 1; i < num_core_in_current_smp; i++) {
48 (inter_rank * NUM_CORE) + (intra_rank + i) % (num_core_in_current_smp);
50 (inter_rank * NUM_CORE) + (intra_rank - i +
51 num_core_in_current_smp) %
52 (num_core_in_current_smp);
53 recv_offset = src * rextent * rcount;
55 smpi_mpi_sendrecv(sbuf, scount, stype, dst, tag,
56 ((char *) rbuf + recv_offset), rcount, rtype, src, tag, comm,
61 // INTER-SMP-ALLGATHER
62 // Every root of each SMP node post INTER-Sendrecv, then do INTRA-Bcast for each receiving message
63 // Use logical ring algorithm
66 if (intra_rank == 0) {
67 MPI_Request *rrequest_array = xbt_new(MPI_Request, inter_comm_size - 1);
68 MPI_Request *srequest_array = xbt_new(MPI_Request, inter_comm_size - 1);
70 src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE;
71 dst = ((inter_rank + 1) % inter_comm_size) * NUM_CORE;
73 // post all inter Irecv
74 for (i = 0; i < inter_comm_size - 1; i++) {
76 ((inter_rank - i - 1 +
77 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
78 rrequest_array[i] = smpi_mpi_irecv((char *)rbuf + recv_offset, rcount * NUM_CORE,
79 rtype, src, tag + i, comm);
85 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
86 srequest_array[0] = smpi_mpi_isend((char *)rbuf + send_offset, scount * NUM_CORE,
87 stype, dst, tag, comm);
89 // loop : recv-inter , send-inter, send-intra (linear-bcast)
90 for (i = 0; i < inter_comm_size - 2; i++) {
92 ((inter_rank - i - 1 +
93 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
94 smpi_mpi_wait(&rrequest_array[i], MPI_STATUS_IGNORE);
95 srequest_array[i + 1] = smpi_mpi_isend((char *)rbuf + recv_offset, scount * NUM_CORE,
96 stype, dst, tag + i + 1, comm);
97 if (num_core_in_current_smp > 1) {
98 smpi_mpi_send((char *)rbuf + recv_offset, scount * NUM_CORE,
99 stype, (rank + 1), tag + i + 1, comm);
103 // recv last message and send_intra
105 ((inter_rank - i - 1 +
106 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
107 //recv_offset = ((inter_rank + 1) % inter_comm_size) * NUM_CORE * sextent * scount;
108 //i=inter_comm_size-2;
109 smpi_mpi_wait(&rrequest_array[i], MPI_STATUS_IGNORE);
110 if (num_core_in_current_smp > 1) {
111 smpi_mpi_send((char *)rbuf + recv_offset, scount * NUM_CORE,
112 stype, (rank + 1), tag + i + 1, comm);
115 smpi_mpi_waitall(inter_comm_size - 1, srequest_array, MPI_STATUSES_IGNORE);
116 xbt_free(rrequest_array);
117 xbt_free(srequest_array);
119 // last rank of each SMP
120 else if (intra_rank == (num_core_in_current_smp - 1)) {
121 for (i = 0; i < inter_comm_size - 1; i++) {
123 ((inter_rank - i - 1 +
124 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
125 smpi_mpi_recv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype,
126 rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
129 // intermediate rank of each SMP
131 for (i = 0; i < inter_comm_size - 1; i++) {
133 ((inter_rank - i - 1 +
134 inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
135 smpi_mpi_recv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype,
136 rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
137 smpi_mpi_send((char *) rbuf + recv_offset, (scount * NUM_CORE), stype,
138 (rank + 1), tag + i + 1, comm);