1 /* extracted from mpig_myreduce.c with
2 :3,$s/MPL/MPI/g and :%s/\\\\$/ \\/ */
4 /* Copyright: Rolf Rabenseifner, 1997
5 * Computing Center University of Stuttgart
6 * rabenseifner@rus.uni-stuttgart.de
8 * The usage of this software is free,
9 * but this header must not be removed.
12 #include "../colls_private.hpp"
/* Protocol switch: when REDUCE_NEW_ALWAYS is defined, the count limits below
 * are overridden with 1, i.e. the new Rabenseifner protocol is always used. */
#define REDUCE_NEW_ALWAYS 1

/* Round a scratch-buffer byte length up to 128 + a multiple of 256 bytes
 * (= 16 + a multiple of 32 doubles, per the original note below) --
 * presumably padding to separate the three scratch buffers in the cache;
 * see the "To test cache problems" allocation site further down.
 * Fixed: the macro is now a fully parenthesized expression.  The previous
 * definition ended in a stray ';' and left both the argument and the
 * expansion unparenthesized, so it could only appear as the entire
 * right-hand side of a statement and would mis-expand for compound
 * arguments (e.g. SCR_LNG_OPTIM(a+b)). */
# define SCR_LNG_OPTIM(bytelng) (128 + (((bytelng) + 127) / 256) * 256)
/* = 16 + multiple of 32 doubles*/
/* Per-datatype lower limits on the 'count' argument above which the new
 * Rabenseifner protocol is chosen over the vendor/binary-tree one.
 * First index: 0 = reduce, 1 = allreduce (indexed by 'is_all' below).
 * Second index: communicator-size class 'ss' (0: size==2, 1: size==3,
 * 2: size is a power of 2, 3: other sizes).
 * Lsh/Lin/Llg/Lfp/Ldb/Lby cover short/int/long/float/double/byte.
 * Values were tuned on a CRAY T3E -- see the benchmark tables in the long
 * header comment above. */
22 #define REDUCE_LIMITS /* values are lower limits for count arg. */ \
23 /* routine = reduce allreduce */ \
24 /* size = 2, 3,2**n,other 2, 3,2**n,other */ \
25 static int Lsh[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}}; \
26 static int Lin[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}}; \
27 static int Llg[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}}; \
28 static int Lfp[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}}; \
29 static int Ldb[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}}; \
30 static int Lby[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}};
/* When REDUCE_NEW_ALWAYS is set, replace every tuned limit with 1 so the
 * new protocol is selected for any count >= 1.
 * NOTE(review): REDUCE_LIMITS is redefined here; the matching #undef (and
 * the #endif of this #ifdef) are not visible in this extract -- confirm
 * against the full file. */
33 #ifdef REDUCE_NEW_ALWAYS
35 #define REDUCE_LIMITS /* values are lower limits for count arg. */ \
36 /* routine = reduce allreduce */ \
37 /* size = 2, 3,2**n,other 2, 3,2**n,other */ \
38 static int Lsh[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}}; \
39 static int Lin[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}}; \
40 static int Llg[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}}; \
41 static int Lfp[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}}; \
42 static int Ldb[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}}; \
43 static int Lby[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}};
46 /* Fast reduce and allreduce algorithm for longer buffers and predefined
49 This algorithm is explained with the example of 13 nodes.
50 The nodes are numbered 0, 1, 2, ... 12.
51 The sendbuf content is a, b, c, ... m.
52 The buffer array is notated with ABCDEFGH, this means that
53 e.g. 'C' is the third 1/8 of the buffer.
55 The algorithm computes
57 { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] }
59 This is equivalent to the mostly used binary tree algorithm
62 size := number of nodes in the communicator.
63 2**n := the power of 2 that is next smaller or equal to the size.
66 Exa.: size=13 ==> n=3, r=5 (i.e. size == 13 == 2**n+r == 2**3 + 5)
68 The algorithm needs for the execution of one colls::reduce
71 exec_time = n*(L1+L2) + buf_lng * (1-1/2**n) * (T1 + T2 + O/d)
74 exec_time = (n+1)*(L1+L2) + buf_lng * T1
75 + buf_lng*(1+1/2-1/2**n)* (T2 + O/d)
77 with L1 = latency of a message transfer e.g. by send+recv
78 L2 = latency of a message exchange e.g. by sendrecv
79 T1 = additional time to transfer 1 byte (= 1/bandwidth)
80 T2 = additional time to exchange 1 byte in both directions.
81 O = time for one operation
82 d = size of the datatype
84 On a MPP system it is expected that T1==T2.
86 In Comparison with the binary tree algorithm that needs:
89 exec_time_bin_tree = n * L1 + buf_lng * n * (T1 + O/d)
92 exec_time_bin_tree = (n+1)*L1 + buf_lng*(n+1)*(T1 + O/d)
94 the new algorithm is faster if (assuming T1=T2)
97 for r==0: buf_lng > L2 * n / [ (n-2) * T1 + (n-1) * O/d ]
98 for r>0: buf_lng > L2 * (n+1) / [ (n-1.5)*T1 + (n-0.5)*O/d ]
100 for size = 5, 6, and 7:
101 buf_lng > L2 / [0.25 * T1 + 0.58 * O/d ]
104 buf_lng > L2 / [0.25 * T1 + 0.62 * O/d ]
106 buf_lng > L2 / [ 0.5 * O/d ]
108 and for O/d >> T1 we can summarize:
110 The new algorithm is faster for about buf_lng > 2 * L2 * d / O
112 Example L1 = L2 = 50 us,
113 bandwidth=300MB/s, i.e. T1 = T2 = 3.3 ns/byte
114 and 10 MFLOP/s, i.e. O = 100 ns/FLOP
115 for double, i.e. d = 8 byte/FLOP
117 ==> the new algorithm is faster for about buf_lng>8000 bytes
118 i.e count > 1000 doubles !
127 split the buffer into ABCD and EFGH
129 even myrank: send buffer EFGH to myrank+1 (buf_lng/2 exchange)
130 receive buffer ABCD from myrank+1
131 compute op for ABCD (count/2 ops)
132 receive result EFGH (buf_lng/2 transfer)
133 odd myrank: send buffer ABCD to myrank+1
134 receive buffer EFGH from myrank+1
138 Result: node: 0 2 4 6 8 10 11 12
139 value: a+b c+d e+f g+h i+j k l m
143 The following algorithm uses only the nodes with
145 (myrank is even && myrank < 2*r) || (myrank >= 2*r)
149 define NEWRANK(old) := (old < 2*r ? old/2 : old-r)
150 define OLDRANK(new) := (new < r ? new*2 : new+r)
153 old ranks: 0 2 4 6 8 10 11 12
154 new ranks: 0 1 2 3 4 5 6 7
155 value: a+b c+d e+f g+h i+j k l m
159 Split the buffer (ABCDEFGH) in the middle,
160 the lower half (ABCD) is computed on even (new) ranks,
161 the upper half (EFGH) is computed on odd (new) ranks.
163 exchange: ABCD from 1 to 0, from 3 to 2, from 5 to 4 and from 7 to 6
164 EFGH from 0 to 1, from 2 to 3, from 4 to 5 and from 6 to 7
165 (i.e. buf_lng/2 exchange)
166 compute op in each node on its half: (i.e. count/2 ops)
168 Result: node 0: (a+b)+(c+d) for ABCD
169 node 1: (a+b)+(c+d) for EFGH
170 node 2: (e+f)+(g+h) for ABCD
171 node 3: (e+f)+(g+h) for EFGH
172 node 4: (i+j)+ k for ABCD
173 node 5: (i+j)+ k for EFGH
174 node 6: l + m for ABCD
175 node 7: l + m for EFGH
179 Same with double distance and once more the half of the buffer.
180 (i.e. buf_lng/4 exchange)
183 Result: node 0: [(a+b)+(c+d)] + [(e+f)+(g+h)] for AB
184 node 1: [(a+b)+(c+d)] + [(e+f)+(g+h)] for EF
185 node 2: [(a+b)+(c+d)] + [(e+f)+(g+h)] for CD
186 node 3: [(a+b)+(c+d)] + [(e+f)+(g+h)] for GH
187 node 4: [(i+j)+ k ] + [ l + m ] for AB
188 node 5: [(i+j)+ k ] + [ l + m ] for EF
189 node 6: [(i+j)+ k ] + [ l + m ] for CD
190 node 7: [(i+j)+ k ] + [ l + m ] for GH
195 Same with double distance and once more the half of the buffer.
196 (i.e. buf_lng/2**n exchange)
197 (i.e. count/2**n ops)
200 0: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for A
201 1: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for E
202 2: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for C
203 3: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for G
204 4: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for B
205 5: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for F
206 6: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for D
207 7: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for H
210 For colls::allreduce:
215 Exchange on the last distance (2*n) the last result
216 (i.e. buf_lng/2**n exchange)
220 Same with half the distance and double of the result
221 (i.e. buf_lng/2**(n-1) exchange)
226 Same with distance 1 and double of the result (== half of the
227 original buffer) (i.e. buf_lng/2 exchange)
229 Result after 6.1 6.2 6.n
231 on node 0: AB ABCD ABCDEFGH
232 on node 1: EF EFGH ABCDEFGH
233 on node 2: CD ABCD ABCDEFGH
234 on node 3: GH EFGH ABCDEFGH
235 on node 4: AB ABCD ABCDEFGH
236 on node 5: EF EFGH ABCDEFGH
237 on node 6: CD ABCD ABCDEFGH
238 on node 7: GH EFGH ABCDEFGH
243 transfer the result from the even nodes with old rank < 2*r
244 to the odd nodes with old rank < 2*r
245 (i.e. buf_lng transfer)
247 { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] }
257 If root node not in the list of the nodes with newranks
260 send last result from node 0 to the root (buf_lng/2**n transfer)
261 and replace the role of node 0 by the root node.
265 Send on the last distance (2**(n-1)) the last result
266 from node with bit '2**(n-1)' in the 'new rank' unequal to that of
267 root's new rank to the node with same '2**(n-1)' bit.
268 (i.e. buf_lng/2**n transfer)
272 Same with half the distance and double of the result
274 (i.e. buf_lng/2**(n-1) transfer)
279 Same with distance 1 and double of the result (== half of the
280 original buffer) and bit '2**0' (i.e. buf_lng/2 transfer)
282 Example: roots old rank: 10
286 action result action result action result
292 on node 4: recv A => AB recv CD => ABCD send ABCD
293 on node 5: recv E => EF recv GH => EFGH recv ABCD => ABCDEFGH
294 on node 6: recv C => CD send CD
295 on node 7: recv G => GH send GH
297 Benchmark results on CRAY T3E
298 -----------------------------
300 uname -a: (sn6307 hwwt3e 1.6.1.51 unicosmk CRAY T3E)
301 MPI: /opt/ctl/mpt/1.1.0.3
303 Ldb[][] = {{ 896,1728, 576, 736},{ 448,1280, 512, 512}}
304 env: export MPI_BUFFER_MAX=4099
305 compiled with: cc -c -O3 -h restrict=f
307 old = binary tree protocol of the vendor
308 new = the new protocol and its implementation
309 mixed = 'new' is used if count > limit(datatype, communicator size)
313 measurement count prot. unit 2 3 4 6 8 12 16 24
314 --------------------------------------------------------------------
315 latency 1 mixed us 20.7 35.1 35.6 49.1 49.2 61.8 62.4 74.8
316 old us 19.0 32.9 33.8 47.1 47.2 61.7 62.1 73.2
317 --------------------------------------------------------------------
318 bandwidth 128k mixed MB/s 75.4 34.9 49.1 27.3 41.1 24.6 38.0 23.7
319 (=buf_lng/time) old MB/s 28.8 16.2 16.3 11.6 11.6 8.8 8.8 7.2
320 ratio  = mixed/old 2.6 2.1 3.0 2.4 3.5 2.8 4.3 3.3
321 --------------------------------------------------------------------
322 limit doubles 896 1536 576 736 576 736 576 736
323 bandwidth limit mixed MB/s 35.9 20.5 18.6 12.3 13.5 9.2 10.0 8.6
324 old MB/s 35.9 20.3 17.8 13.0 12.5 9.6 9.2 7.8
325 ratio = mixed/old 1.00 1.01 1.04 0.95 1.08 0.96 1.09 1.10
328 measurement count prot. unit 32 48 64 96 128 192 256
329 ---------------------------------------------------------------
330 latency 1 mixed us 77.8 88.5 90.6 102 1)
331 old us 78.6 87.2 90.1 99.7 108 119 120
332 ---------------------------------------------------------------
333 bandwidth 128k mixed MB/s 35.1 23.3 34.1 22.8 34.4 22.4 33.9
334 (=buf_lng/time) old MB/s 6.0 6.0 6.0 5.2 5.2 4.6 4.6
335 ratio  = mixed/old 5.8 3.9 5.7 4.4 6.6 4.8 7.4 5)
336 ---------------------------------------------------------------
337 limit doubles 576 736 576 736 576 736 576 2)
338 bandwidth limit mixed MB/s 9.7 7.5 8.4 6.5 6.9 5.5 5.1 3)
339 old MB/s 7.7 6.4 6.4 5.7 5.5 4.9 4.7 3)
340 ratio = mixed/old 1.26 1.17 1.31 1.14 1.26 1.12 1.08 4)
344 measurement count prot. unit 2 3 4 6 8 12 16 24
345 --------------------------------------------------------------------
346 latency 1 mixed us 28.2 51.0 44.5 74.4 59.9 102 74.2 133
347 old us 26.9 48.3 42.4 69.8 57.6 96.0 75.7 126
348 --------------------------------------------------------------------
349 bandwidth 128k mixed MB/s 74.0 29.4 42.4 23.3 35.0 20.9 32.8 19.7
350 (=buf_lng/time) old MB/s 20.9 14.4 13.2 9.7 9.6 7.3 7.4 5.8
351 ratio  = mixed/old 3.5 2.0 3.2 2.4 3.6 2.9 4.4 3.4
352 --------------------------------------------------------------------
353 limit doubles 448 1280 512 512 512 512 512 512
354 bandwidth limit mixed MB/s 26.4 15.1 16.2 8.2 12.4 7.2 10.8 5.7
355 old MB/s 26.1 14.9 15.0 9.1 10.5 6.7 7.3 5.3
356 ratio = mixed/old 1.01 1.01 1.08 0.90 1.18 1.07 1.48 1.08
359 measurement count prot. unit 32 48 64 96 128 192 256
360 ---------------------------------------------------------------
361 latency 1 mixed us 90.3 162 109 190
362 old us 92.7 152 104 179 122 225 135
363 ---------------------------------------------------------------
364 bandwidth 128k mixed MB/s 31.1 19.7 30.1 19.2 29.8 18.7 29.0
365 (=buf_lng/time) old MB/s 5.9 4.8 5.0 4.1 4.4 3.4 3.8
366 ratio  = mixed/old 5.3 4.1 6.0 4.7 6.8 5.5 7.7
367 ---------------------------------------------------------------
368 limit doubles 512 512 512 512 512 512 512
369 bandwidth limit mixed MB/s 6.6 5.6 5.7 3.5 4.4 3.2 3.8
370 old MB/s 6.3 4.2 5.4 3.6 4.4 3.1
371 ratio = mixed/old 1.05 1.33 1.06 0.97 1.00 1.03
374 1) This line shows that the overhead to decide which protocol
375 should be used can be ignored.
376 2) This line shows the limit for the count argument.
377 If count < limit then the vendor protocol is used,
378 otherwise the new protocol is used (see variable Ldb).
379 3) These lines show the bandwidth (= buffer length / execution time)
381 4) This line shows that the limit is chosen well if the ratio is
382 between 0.95 (losing 5% for buffer length near and >=limit)
383 and 1.10 (not gaining 10% for buffer length near and <limit).
384 5) This line shows that the new protocol is 2..7 times faster
/* MPI_I_Sendrecv: combined send+receive primitive used by all exchange
 * steps of the algorithm.  Three alternative definitions follow; their
 * #if/#elif guards are not visible in this extract:
 *   (1) irecv + blocking send + wait,
 *   (2) isend + blocking recv + wait,
 *   (3) a plain Request::sendrecv.
 * NOTE(review): 'mpi_mpi_isend' in the second variant does not match the
 * Request:: API used everywhere else in this file -- it looks like a
 * leftover of the mechanical renaming described in the file header and
 * should probably read Request::isend; that variant may well be compiled
 * out.  Confirm against the full file. */
392 #define MPI_I_Sendrecv(sb, sc, sd, dest, st, rb, rc, rd, source, rt, comm, stat) \
395 req = Request::irecv(rb, rc, rd, source, rt, comm); \
396 Request::send(sb, sc, sd, dest, st, comm); \
397 Request::wait(&req, stat); \
401 #define MPI_I_Sendrecv(sb, sc, sd, dest, st, rb, rc, rd, source, rt, comm, stat) \
404 req = mpi_mpi_isend(sb, sc, sd, dest, st, comm); \
405 Request::recv(rb, rc, rd, source, rt, comm, stat); \
406 Request::wait(&req, stat); \
409 #define MPI_I_Sendrecv(sb, sc, sd, dest, st, rb, rc, rd, source, rt, comm, stat) \
410 Request::sendrecv(sb, sc, sd, dest, st, rb, rc, rd, source, rt, comm, stat)
/* (fragment) enumerator of the internal MPIM_Datatype enum used by the
 * typed reduction dispatch below; the enclosing enum declaration is not
 * visible in this extract. */
421 MPIM_UNSIGNED_LONG_LONG,
/* Generates 'static void MPI_I_do_op_TYPE(b1, b2, rslt, cnt, op)' for a C
 * integer TYPE: element-wise reduction of b1[0..cnt-1] with b2[0..cnt-1]
 * into rslt[], switching on 'op' (MAX, MIN, SUM, PROD, LAND, LOR, LXOR,
 * BAND, BOR, BXOR per the visible loop bodies).  The switch/case framing
 * lines of the macro are not visible in this extract. */
439 #define MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_TYPE, TYPE) \
440 static void MPI_I_do_op_TYPE(TYPE* b1, TYPE* b2, TYPE* rslt, int cnt, MPIM_Op op) \
445 for (i = 0; i < cnt; i++) \
446 rslt[i] = (b1[i] > b2[i] ? b1[i] : b2[i]); \
449 for (i = 0; i < cnt; i++) \
450 rslt[i] = (b1[i] < b2[i] ? b1[i] : b2[i]); \
453 for (i = 0; i < cnt; i++) \
454 rslt[i] = b1[i] + b2[i]; \
457 for (i = 0; i < cnt; i++) \
458 rslt[i] = b1[i] * b2[i]; \
461 for (i = 0; i < cnt; i++) \
462 rslt[i] = b1[i] && b2[i]; \
465 for (i = 0; i < cnt; i++) \
466 rslt[i] = b1[i] || b2[i]; \
469 for (i = 0; i < cnt; i++) \
470 rslt[i] = b1[i] != b2[i]; \
473 for (i = 0; i < cnt; i++) \
474 rslt[i] = b1[i] & b2[i]; \
477 for (i = 0; i < cnt; i++) \
478 rslt[i] = b1[i] | b2[i]; \
481 for (i = 0; i < cnt; i++) \
482 rslt[i] = b1[i] ^ b2[i]; \
/* Same generator for floating-point TYPEs: only MAX, MIN, SUM and PROD are
 * provided (no logical/bitwise operations on floating point).  The
 * switch/case framing lines of the macro are not visible in this extract. */
489 #define MPI_I_DO_OP_FP(MPI_I_do_op_TYPE, TYPE) \
490 static void MPI_I_do_op_TYPE(TYPE* b1, TYPE* b2, TYPE* rslt, int cnt, MPIM_Op op) \
495 for (i = 0; i < cnt; i++) \
496 rslt[i] = (b1[i] > b2[i] ? b1[i] : b2[i]); \
499 for (i = 0; i < cnt; i++) \
500 rslt[i] = (b1[i] < b2[i] ? b1[i] : b2[i]); \
503 for (i = 0; i < cnt; i++) \
504 rslt[i] = b1[i] + b2[i]; \
507 for (i = 0; i < cnt; i++) \
508 rslt[i] = b1[i] * b2[i]; \
/* Same generator for MPI_BYTE: only the bitwise operations BAND, BOR and
 * BXOR are meaningful on raw bytes.  The switch/case framing lines of the
 * macro are not visible in this extract. */
515 #define MPI_I_DO_OP_BYTE(MPI_I_do_op_TYPE, TYPE) \
516 static void MPI_I_do_op_TYPE(TYPE* b1, TYPE* b2, TYPE* rslt, int cnt, MPIM_Op op) \
521 for (i = 0; i < cnt; i++) \
522 rslt[i] = b1[i] & b2[i]; \
525 for (i = 0; i < cnt; i++) \
526 rslt[i] = b1[i] | b2[i]; \
529 for (i = 0; i < cnt; i++) \
530 rslt[i] = b1[i] ^ b2[i]; \
/* Instantiate one element-wise reduction routine per supported datatype
 * (these expand to the static MPI_I_do_op_* functions dispatched below). */
537 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_short, short)
538 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_int, int)
539 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_long, long)
540 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_ushort, unsigned short)
541 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_uint, unsigned int)
542 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_ulong, unsigned long)
543 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_ulonglong, unsigned long long)
544 MPI_I_DO_OP_FP(MPI_I_do_op_float, float)
545 MPI_I_DO_OP_FP(MPI_I_do_op_double, double)
546 MPI_I_DO_OP_BYTE(MPI_I_do_op_byte, char)
/* Helper used inside MPI_I_do_op's datatype switch: cast the void* buffers
 * (b1, b2, rslt -- in scope at the expansion site) to TYPE* and forward to
 * the typed routine.  Continuation lines of the macro (presumably a break)
 * are not visible in this extract. */
548 #define MPI_I_DO_OP_CALL(MPI_I_do_op_TYPE, TYPE) \
549 MPI_I_do_op_TYPE((TYPE*)b1, (TYPE*)b2, (TYPE*)rslt, cnt, op); \
/* Apply the element-wise reduction 'op' over 'cnt' elements of 'datatype'
 * stored in b1 and b2, writing the result into rslt, by dispatching to the
 * matching typed worker generated above.  Several case labels and the
 * switch framing are missing from this extract. */
552 static void MPI_I_do_op(void* b1, void* b2, void* rslt, int cnt, MPIM_Datatype datatype, MPIM_Op op)
556 MPI_I_DO_OP_CALL(MPI_I_do_op_short, short)
558 MPI_I_DO_OP_CALL(MPI_I_do_op_int, int)
560 MPI_I_DO_OP_CALL(MPI_I_do_op_long, long)
561 case MPIM_UNSIGNED_SHORT:
562 MPI_I_DO_OP_CALL(MPI_I_do_op_ushort, unsigned short)
564 MPI_I_DO_OP_CALL(MPI_I_do_op_uint, unsigned int)
565 case MPIM_UNSIGNED_LONG:
566 MPI_I_DO_OP_CALL(MPI_I_do_op_ulong, unsigned long)
567 case MPIM_UNSIGNED_LONG_LONG:
568 MPI_I_DO_OP_CALL(MPI_I_do_op_ulonglong, unsigned long long)
570 MPI_I_DO_OP_CALL(MPI_I_do_op_float, float)
572 MPI_I_DO_OP_CALL(MPI_I_do_op_double, double)
574 MPI_I_DO_OP_CALL(MPI_I_do_op_byte, char)
/* Core routine shared by reduce__rab and allreduce__rab.
 * Implements Rabenseifner's reduce/allreduce (see the long algorithm
 * description at the top of this file): pair up ranks so a power-of-2
 * group remains, perform recursive vector halving with distance doubling
 * to compute distributed partial results, then either redistribute them
 * allgather-style (is_all) or ship them to 'root' (reduce).
 * Falls back to the generic colls:: implementations when 'count' is below
 * the tuned protocol limits.  Returns MPI_SUCCESS.
 * NOTE(review): many interior lines of this routine are missing from this
 * extract; the comments below describe only the code that is visible. */
581 static int MPI_I_anyReduce(const void* Sendbuf, void* Recvbuf, int count, MPI_Datatype mpi_datatype, MPI_Op mpi_op,
582 int root, MPI_Comm comm, bool is_all)
584 char *scr1buf, *scr2buf, *scr3buf, *xxx, *sendbuf, *recvbuf;
585 int myrank, size, x_base, x_size, computed, idx;
586 int x_start, x_count = 0, r, n, mynewrank, newroot, partner;
587 int start_even[20], start_odd[20], count_even[20], count_odd[20];
/* Map the MPI datatype handle onto the internal MPIM_* enum; anything not
 * listed is rejected with std::invalid_argument below. */
592 MPIM_Datatype datatype = MPIM_INT; MPIM_Op op = MPIM_MAX;
594 if (mpi_datatype==MPI_SHORT ) datatype=MPIM_SHORT;
595 else if(mpi_datatype==MPI_INT ) datatype=MPIM_INT;
596 else if(mpi_datatype==MPI_LONG ) datatype=MPIM_LONG;
597 else if(mpi_datatype==MPI_UNSIGNED_SHORT) datatype=MPIM_UNSIGNED_SHORT;
598 else if(mpi_datatype==MPI_UNSIGNED ) datatype=MPIM_UNSIGNED;
599 else if(mpi_datatype==MPI_UNSIGNED_LONG ) datatype=MPIM_UNSIGNED_LONG;
600 else if(mpi_datatype==MPI_UNSIGNED_LONG_LONG ) datatype=MPIM_UNSIGNED_LONG_LONG;
601 else if(mpi_datatype==MPI_FLOAT ) datatype=MPIM_FLOAT;
602 else if(mpi_datatype==MPI_DOUBLE ) datatype=MPIM_DOUBLE;
603 else if(mpi_datatype==MPI_BYTE ) datatype=MPIM_BYTE;
605 throw std::invalid_argument("reduce rab algorithm can't be used with this datatype!");
/* Map the MPI op handle onto the internal MPIM_* enum. */
607 if (mpi_op==MPI_MAX ) op=MPIM_MAX;
608 else if(mpi_op==MPI_MIN ) op=MPIM_MIN;
609 else if(mpi_op==MPI_SUM ) op=MPIM_SUM;
610 else if(mpi_op==MPI_PROD ) op=MPIM_PROD;
611 else if(mpi_op==MPI_LAND ) op=MPIM_LAND;
612 else if(mpi_op==MPI_BAND ) op=MPIM_BAND;
613 else if(mpi_op==MPI_LOR ) op=MPIM_LOR;
614 else if(mpi_op==MPI_BOR ) op=MPIM_BOR;
615 else if(mpi_op==MPI_LXOR ) op=MPIM_LXOR;
616 else if(mpi_op==MPI_BXOR ) op=MPIM_BXOR;
/* Classify the communicator size into the limit-table column 'ss':
 * 0 for size==2, 1 for size==3, 2 for a power of 2, 3 otherwise. */
619 MPI_Comm_size(comm, &size);
620 if (size > 1) /*otherwise no balancing_protocol*/
623 else if (size==3) ss=1;
624 else { int s = size; while (!(s & 1)) s = s >> 1;
625 if (s==1) /* size == power of 2 */ ss = 2;
626 else /* size != power of 2 */ ss = 3; }
/* Decide whether 'count' reaches the tuned limit for the new protocol,
 * per op family and datatype (L* tables come from REDUCE_LIMITS). */
628 case MPIM_MAX: case MPIM_MIN: case MPIM_SUM: case MPIM_PROD:
629 case MPIM_LAND: case MPIM_LOR: case MPIM_LXOR:
630 case MPIM_BAND: case MPIM_BOR: case MPIM_BXOR:
632 case MPIM_SHORT: case MPIM_UNSIGNED_SHORT:
633 new_prot = count >= Lsh[is_all][ss]; break;
634 case MPIM_INT: case MPIM_UNSIGNED:
635 new_prot = count >= Lin[is_all][ss]; break;
636 case MPIM_LONG: case MPIM_UNSIGNED_LONG: case MPIM_UNSIGNED_LONG_LONG:
637 new_prot = count >= Llg[is_all][ss]; break;
643 case MPIM_MAX: case MPIM_MIN: case MPIM_SUM: case MPIM_PROD:
646 new_prot = count >= Lfp[is_all][ss]; break;
648 new_prot = count >= Ldb[is_all][ss]; break;
654 case MPIM_BAND: case MPIM_BOR: case MPIM_BXOR:
657 new_prot = count >= Lby[is_all][ss]; break;
/* NOTE(review): string literals bound to 'char*' are deprecated in C++
 * (should be 'const char*'); debug-only code. */
663 { char *ss_str[]={"two","three","power of 2","no power of 2"};
664 printf("MPI_(All)Reduce: is_all=%1d, size=%1d=%s, new_prot=%1d\n",
665 is_all, size, ss_str[ss], new_prot); fflush(stdout);
/* ----- new protocol: set up buffers.  The three scratch buffers may be
 * carved out of one padded allocation (SCR_LNG_OPTIM) to separate them in
 * the cache. */
672 sendbuf = (char*) Sendbuf;
673 recvbuf = (char*) Recvbuf;
674 MPI_Comm_rank(comm, &myrank);
675 MPI_Type_extent(mpi_datatype, &typelng);
676 scrlng = typelng * count;
677 #ifdef NO_CACHE_OPTIMIZATION
678 scr1buf = new char[scrlng];
679 scr2buf = new char[scrlng];
680 scr3buf = new char[scrlng];
682 # ifdef SCR_LNG_OPTIM
683 scrlng = SCR_LNG_OPTIM(scrlng);
685 scr2buf = new char[3 * scrlng]; /* To test cache problems. */
686 scr1buf = scr2buf + 1*scrlng; /* scr1buf and scr3buf must not*/
687 scr3buf = scr2buf + 2*scrlng; /* be used for malloc because */
688 /* they are interchanged below.*/
691 if (is_all) root = myrank; /* for correct recvbuf handling */
696 printf("[%2d] step 1 begin\n",myrank); fflush(stdout);
/* Step 1: n := floor(log2(size)), x_size := 2**n, r := size - 2**n
 * (see the definitions in the header comment). */
699 while (2*x_size <= size) { n++; x_size = x_size * 2; }
706 printf("[%2d] step 2 begin n=%d, r=%d\n",myrank,n,r);fflush(stdout);
/* Step 2: ranks 0..2*r-1 work in pairs; each half-buffer is exchanged,
 * both halves reduced, and the pair's full result gathered on the even
 * member. */
710 if ((myrank % 2) == 0 /*even*/)
712 MPI_I_Sendrecv(sendbuf + (count/2)*typelng,
713 count - count/2, mpi_datatype, myrank+1, 1220,
714 scr2buf, count/2,mpi_datatype, myrank+1, 1221,
716 MPI_I_do_op(sendbuf, scr2buf, scr1buf,
717 count/2, datatype, op);
718 Request::recv(scr1buf + (count/2)*typelng, count - count/2,
719 mpi_datatype, myrank+1, 1223, comm, &status);
722 { int i; printf("[%2d] after step 2: val=",
724 for (i=0; i<count; i++)
725 printf(" %5.0lf",((double*)scr1buf)[i] );
726 printf("\n"); fflush(stdout);
732 MPI_I_Sendrecv(sendbuf, count/2,mpi_datatype, myrank-1, 1221,
733 scr2buf + (count/2)*typelng,
734 count - count/2, mpi_datatype, myrank-1, 1220,
736 MPI_I_do_op(scr2buf + (count/2)*typelng,
737 sendbuf + (count/2)*typelng,
738 scr1buf + (count/2)*typelng,
739 count - count/2, datatype, op);
740 Request::send(scr1buf + (count/2)*typelng, count - count/2,
741 mpi_datatype, myrank-1, 1223, comm);
748 printf("[%2d] step 3+4 begin\n",myrank); fflush(stdout);
/* Steps 3+4: compress the surviving ranks (even pair-members plus all
 * ranks >= 2*r) into the contiguous 'new rank' space 0..2**n-1. */
750 if ((myrank >= 2*r) || ((myrank%2 == 0) && (myrank < 2*r)))
751 mynewrank = (myrank < 2*r ? myrank/2 : myrank-r);
755 { /* begin -- only for nodes with new rank */
757 # define OLDRANK(new) ((new) < r ? (new)*2 : (new)+r)
/* Step 5: recursive vector halving with distance doubling over the 2**n
 * group; start/count of the even/odd halves are recorded per level for
 * the redistribution phase (step 6). */
763 for (idx=0, x_base=1; idx<n; idx++, x_base=x_base*2)
765 start_even[idx] = x_start;
766 count_even[idx] = x_count / 2;
767 start_odd [idx] = x_start + count_even[idx];
768 count_odd [idx] = x_count - count_even[idx];
769 if (((mynewrank/x_base) % 2) == 0 /*even*/)
772 printf("[%2d](%2d) step 5.%d begin even c=%1d\n",
773 myrank,mynewrank,idx+1,computed); fflush(stdout);
775 x_start = start_even[idx];
776 x_count = count_even[idx];
777 MPI_I_Sendrecv((computed ? scr1buf : sendbuf)
778 + start_odd[idx]*typelng, count_odd[idx],
779 mpi_datatype, OLDRANK(mynewrank+x_base), 1231,
780 scr2buf + x_start*typelng, x_count,
781 mpi_datatype, OLDRANK(mynewrank+x_base), 1232,
783 MPI_I_do_op((computed?scr1buf:sendbuf) + x_start*typelng,
784 scr2buf + x_start*typelng,
785 ((root==myrank) && (idx==(n-1))
786 ? recvbuf + x_start*typelng
787 : scr3buf + x_start*typelng),
788 x_count, datatype, op);
793 printf("[%2d](%2d) step 5.%d begin odd c=%1d\n",
794 myrank,mynewrank,idx+1,computed); fflush(stdout);
796 x_start = start_odd[idx];
797 x_count = count_odd[idx];
798 MPI_I_Sendrecv((computed ? scr1buf : sendbuf)
799 +start_even[idx]*typelng, count_even[idx],
800 mpi_datatype, OLDRANK(mynewrank-x_base), 1232,
801 scr2buf + x_start*typelng, x_count,
802 mpi_datatype, OLDRANK(mynewrank-x_base), 1231,
804 MPI_I_do_op(scr2buf + x_start*typelng,
805 (computed?scr1buf:sendbuf) + x_start*typelng,
806 ((root==myrank) && (idx==(n-1))
807 ? recvbuf + x_start*typelng
808 : scr3buf + x_start*typelng),
809 x_count, datatype, op);
/* Rotate scratch buffers so the newest partial result is in scr1buf. */
811 xxx = scr3buf; scr3buf = scr1buf; scr1buf = xxx;
814 { int i; printf("[%2d](%2d) after step 5.%d end: start=%2d count=%2d val=",
815 myrank,mynewrank,idx+1,x_start,x_count);
816 for (i=0; i<x_count; i++)
817 printf(" %5.0lf",((double*)((root==myrank)&&(idx==(n-1))
818 ? recvbuf + x_start*typelng
819 : scr1buf + x_start*typelng))[i] );
820 printf("\n"); fflush(stdout);
828 } /* end -- only for nodes with new rank */
/* ----- allreduce: redistribute with halving distance / doubling pieces. */
832 /*...steps 6.1 to 6.n */
835 { /* begin -- only for nodes with new rank */
837 # define OLDRANK(new) ((new) < r ? (new)*2 : (new)+r)
839 for(idx=n-1, x_base=x_size/2; idx>=0; idx--, x_base=x_base/2)
842 printf("[%2d](%2d) step 6.%d begin\n",myrank,mynewrank,n-idx); fflush(stdout);
844 if (((mynewrank/x_base) % 2) == 0 /*even*/)
846 MPI_I_Sendrecv(recvbuf + start_even[idx]*typelng,
848 mpi_datatype, OLDRANK(mynewrank+x_base),1241,
849 recvbuf + start_odd[idx]*typelng,
851 mpi_datatype, OLDRANK(mynewrank+x_base),1242,
854 x_start = start_odd[idx];
855 x_count = count_odd[idx];
860 MPI_I_Sendrecv(recvbuf + start_odd[idx]*typelng,
862 mpi_datatype, OLDRANK(mynewrank-x_base),1242,
863 recvbuf + start_even[idx]*typelng,
865 mpi_datatype, OLDRANK(mynewrank-x_base),1241,
868 x_start = start_even[idx];
869 x_count = count_even[idx];
873 { int i; printf("[%2d](%2d) after step 6.%d end: start=%2d count=%2d val=",
874 myrank,mynewrank,n-idx,x_start,x_count);
875 for (i=0; i<x_count; i++)
876 printf(" %5.0lf",((double*)(root==myrank
877 ? recvbuf + x_start*typelng
878 : scr1buf + x_start*typelng))[i] );
879 printf("\n"); fflush(stdout);
886 } /* end -- only for nodes with new rank */
/* Step 7 (allreduce only): the even pair-members forward the full result
 * to their odd partners (ranks < 2*r), which had no new rank. */
893 printf("[%2d] step 7 begin\n",myrank); fflush(stdout);
895 if (myrank%2 == 0 /*even*/)
896 Request::send(recvbuf, count, mpi_datatype, myrank+1, 1253, comm);
898 Request::recv(recvbuf, count, mpi_datatype, myrank-1, 1253, comm, &status);
902 else /* not is_all, i.e. Reduce */
/* Step 6.0: if root is an odd rank < 2*r it got no new rank; rank 0 ships
 * its final piece to root, which then plays node 0's role. */
907 if ((root < 2*r) && (root%2 == 1))
910 printf("[%2d] step 6.0 begin\n",myrank); fflush(stdout);
912 if (myrank == 0) /* then mynewrank==0, x_start==0
913 x_count == count/x_size */
915 Request::send(scr1buf,x_count,mpi_datatype,root,1241,comm);
/* Root recomputes the per-level even/odd partition (it did not take part
 * in step 5) before receiving node 0's piece. */
924 for (idx=0, x_base=1; idx<n; idx++, x_base=x_base*2)
926 start_even[idx] = x_start;
927 count_even[idx] = x_count / 2;
928 start_odd [idx] = x_start + count_even[idx];
929 count_odd [idx] = x_count - count_even[idx];
930 /* code for always even in each bit of mynewrank: */
931 x_start = start_even[idx];
932 x_count = count_even[idx];
934 Request::recv(recvbuf,x_count,mpi_datatype,0,1241,comm,&status);
/* Translate the (possibly substituted) root into the compressed rank
 * space. */
940 newroot = (root < 2*r ? root/2 : root-r);
943 /*...steps 6.1 to 6.n */
946 { /* begin -- only for nodes with new rank */
948 #define OLDRANK(new) ((new) == newroot ? root : ((new) < r ? (new) * 2 : (new) + r))
950 for(idx=n-1, x_base=x_size/2; idx>=0; idx--, x_base=x_base/2)
953 printf("[%2d](%2d) step 6.%d begin\n",myrank,mynewrank,n-idx); fflush(stdout);
/* Nodes whose current bit differs from root's send their piece towards
 * root's side of the tree; the others receive. */
955 if ((mynewrank & x_base) != (newroot & x_base))
957 if (((mynewrank/x_base) % 2) == 0 /*even*/)
958 { x_start = start_even[idx]; x_count = count_even[idx];
959 partner = mynewrank+x_base; }
961 { x_start = start_odd[idx]; x_count = count_odd[idx];
962 partner = mynewrank-x_base; }
963 Request::send(scr1buf + x_start*typelng, x_count, mpi_datatype,
964 OLDRANK(partner), 1244, comm);
968 if (((mynewrank/x_base) % 2) == 0 /*even*/)
969 { x_start = start_odd[idx]; x_count = count_odd[idx];
970 partner = mynewrank+x_base; }
972 { x_start = start_even[idx]; x_count = count_even[idx];
973 partner = mynewrank-x_base; }
974 Request::recv((myrank==root ? recvbuf : scr1buf)
975 + x_start*typelng, x_count, mpi_datatype,
976 OLDRANK(partner), 1244, comm, &status);
978 { int i; printf("[%2d](%2d) after step 6.%d end: start=%2d count=%2d val=",
979 myrank,mynewrank,n-idx,x_start,x_count);
980 for (i=0; i<x_count; i++)
981 printf(" %5.0lf",((double*)(root==myrank
982 ? recvbuf + x_start*typelng
983 : scr1buf + x_start*typelng))[i] );
984 printf("\n"); fflush(stdout);
992 } /* end -- only for nodes with new rank */
995 # ifdef NO_CACHE_TESTING
1000 delete[] scr2buf; /* scr1buf and scr3buf are part of scr2buf */
1002 return(MPI_SUCCESS);
/* Fallback: count below the tuned limit -- delegate to the generic
 * implementations. */
1006 return (colls::allreduce(Sendbuf, Recvbuf, count, mpi_datatype, mpi_op, comm));
1008 return (colls::reduce(Sendbuf, Recvbuf, count, mpi_datatype, mpi_op, root, comm));
1010 #endif /*REDUCE_LIMITS*/
/* Public entry point: Rabenseifner reduce.  Thin wrapper delegating to
 * MPI_I_anyReduce with is_all = false.  (The tail of the parameter list
 * and the function braces are not visible in this extract.) */
1012 int reduce__rab(const void* Sendbuf, void* Recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root,
1015 return MPI_I_anyReduce(Sendbuf, Recvbuf, count, datatype, op, root, comm, false);
/* Public entry point: Rabenseifner allreduce.  Delegates to MPI_I_anyReduce
 * with is_all = true; root is passed as -1 because MPI_I_anyReduce
 * overwrites it with each rank's own rank when is_all is set.  (The
 * function braces are not visible in this extract.) */
1018 int allreduce__rab(const void* Sendbuf, void* Recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
1020 return MPI_I_anyReduce(Sendbuf, Recvbuf, count, datatype, op, -1, comm, true);