1 /* Copyright (c) 2013-2021. The SimGrid Team. All rights reserved. */
3 /* This program is free software; you can redistribute it and/or modify it
4 * under the terms of the license (GNU LGPL) which comes with this package. */
6 /* extracted from mpig_myreduce.c with
7 :3,$s/MPL/MPI/g and :%s/\\\\$/ \\/ */
9 /* Copyright: Rolf Rabenseifner, 1997
10 * Computing Center University of Stuttgart
11 * rabenseifner@rus.uni-stuttgart.de
13 * The usage of this software is free,
14 * but this header must not be removed.
17 #include "../colls_private.hpp"
/* Tuning knobs for Rabenseifner's reduce/allreduce algorithm.
 * NOTE(review): this chunk is a partial extraction -- the #ifdef/#undef
 * guards that select between the two REDUCE_LIMITS variants below are not
 * fully visible here; comments only describe what is shown. */
21 #define REDUCE_NEW_ALWAYS 1
/* Rounds a scratch-buffer byte length up to 128 + a multiple of 256 bytes,
 * apparently to stagger the three scratch buffers across cache lines.
 * NOTE(review): the trailing ';' makes call sites expand to a double
 * semicolon (harmless at statement level) -- confirm against full file. */
24 # define SCR_LNG_OPTIM(bytelng) 128 + ((bytelng+127)/256) * 256;
25 /* = 16 + multiple of 32 doubles*/
/* Measured lower limits on the `count` argument above which the new
 * protocol is used instead of the vendor/binary-tree one.
 * Indexed [is_all][size class]; one table per datatype family
 * (sh=short, in=int, lg=long, fp=float, db=double, by=byte). */
27 #define REDUCE_LIMITS /* values are lower limits for count arg. */ \
28 /* routine = reduce allreduce */ \
29 /* size = 2, 3,2**n,other 2, 3,2**n,other */ \
30 static int Lsh[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}}; \
31 static int Lin[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}}; \
32 static int Llg[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}}; \
33 static int Lfp[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}}; \
34 static int Ldb[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}}; \
35 static int Lby[2][4] = {{896, 1728, 576, 736}, {448, 1280, 512, 512}};
38 #ifdef REDUCE_NEW_ALWAYS
/* With REDUCE_NEW_ALWAYS, redefine all limits to 1 so the new protocol is
 * always chosen (presumably the intervening missing lines #undef the first
 * definition -- TODO confirm against the full file). */
40 #define REDUCE_LIMITS /* values are lower limits for count arg. */ \
41 /* routine = reduce allreduce */ \
42 /* size = 2, 3,2**n,other 2, 3,2**n,other */ \
43 static int Lsh[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}}; \
44 static int Lin[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}}; \
45 static int Llg[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}}; \
46 static int Lfp[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}}; \
47 static int Ldb[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}}; \
48 static int Lby[2][4] = {{1, 1, 1, 1}, {1, 1, 1, 1}};
51 /* Fast reduce and allreduce algorithm for longer buffers and predefined
54 This algorithm is explained with the example of 13 nodes.
55 The nodes are numbered 0, 1, 2, ... 12.
56 The sendbuf content is a, b, c, ... m.
57 The buffer array is notated with ABCDEFGH, this means that
58 e.g. 'C' is the third 1/8 of the buffer.
60 The algorithm computes
62 { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] }
64 This is equivalent to the mostly used binary tree algorithm
67 size := number of nodes in the communicator.
68 2**n := the power of 2 that is next smaller or equal to the size.
71 Exa.: size=13 ==> n=3, r=5 (i.e. size == 13 == 2**n+r == 2**3 + 5)
73 The algorithm needs for the execution of one colls::reduce
76 exec_time = n*(L1+L2) + buf_lng * (1-1/2**n) * (T1 + T2 + O/d)
79 exec_time = (n+1)*(L1+L2) + buf_lng * T1
80 + buf_lng*(1+1/2-1/2**n)* (T2 + O/d)
82 with L1 = latency of a message transfer e.g. by send+recv
83 L2 = latency of a message exchange e.g. by sendrecv
84 T1 = additional time to transfer 1 byte (= 1/bandwidth)
85 T2 = additional time to exchange 1 byte in both directions.
86 O = time for one operation
87 d = size of the datatype
89 On a MPP system it is expected that T1==T2.
91 In comparison with the binary tree algorithm that needs:
94 exec_time_bin_tree = n * L1 + buf_lng * n * (T1 + O/d)
97 exec_time_bin_tree = (n+1)*L1 + buf_lng*(n+1)*(T1 + O/d)
99 the new algorithm is faster if (assuming T1=T2)
102 for r==0: buf_lng > L2 * n / [ (n-2) * T1 + (n-1) * O/d ]
103 for r>0: buf_lng > L2 * (n+1) / [ (n-1.5)*T1 + (n-0.5)*O/d ]
105 for size = 5, 6, and 7:
106 buf_lng > L2 / [0.25 * T1 + 0.58 * O/d ]
109 buf_lng > L2 / [0.25 * T1 + 0.62 * O/d ]
111 buf_lng > L2 / [ 0.5 * O/d ]
113 and for O/d >> T1 we can summarize:
115 The new algorithm is faster for about buf_lng > 2 * L2 * d / O
117 Example L1 = L2 = 50 us,
118 bandwidth=300MB/s, i.e. T1 = T2 = 3.3 ns/byte
119 and 10 MFLOP/s, i.e. O = 100 ns/FLOP
120 for double, i.e. d = 8 byte/FLOP
122 ==> the new algorithm is faster for about buf_lng>8000 bytes
123 i.e count > 1000 doubles !
132 split the buffer into ABCD and EFGH
134 even myrank: send buffer EFGH to myrank+1 (buf_lng/2 exchange)
135 receive buffer ABCD from myrank+1
136 compute op for ABCD (count/2 ops)
137 receive result EFGH (buf_lng/2 transfer)
138 odd myrank: send buffer ABCD to myrank+1
139 receive buffer EFGH from myrank+1
143 Result: node: 0 2 4 6 8 10 11 12
144 value: a+b c+d e+f g+h i+j k l m
148 The following algorithm uses only the nodes with
150 (myrank is even && myrank < 2*r) || (myrank >= 2*r)
154 define NEWRANK(old) := (old < 2*r ? old/2 : old-r)
155 define OLDRANK(new) := (new < r ? new*2 : new+r)
158 old ranks: 0 2 4 6 8 10 11 12
159 new ranks: 0 1 2 3 4 5 6 7
160 value: a+b c+d e+f g+h i+j k l m
164 Split the buffer (ABCDEFGH) in the middle,
165 the lower half (ABCD) is computed on even (new) ranks,
166 the upper half (EFGH) is computed on odd (new) ranks.
168 exchange: ABCD from 1 to 0, from 3 to 2, from 5 to 4 and from 7 to 6
169 EFGH from 0 to 1, from 2 to 3, from 4 to 5 and from 6 to 7
170 (i.e. buf_lng/2 exchange)
171 compute op in each node on its half: (i.e. count/2 ops)
173 Result: node 0: (a+b)+(c+d) for ABCD
174 node 1: (a+b)+(c+d) for EFGH
175 node 2: (e+f)+(g+h) for ABCD
176 node 3: (e+f)+(g+h) for EFGH
177 node 4: (i+j)+ k for ABCD
178 node 5: (i+j)+ k for EFGH
179 node 6: l + m for ABCD
180 node 7: l + m for EFGH
184 Same with double distance and once more the half of the buffer.
185 (i.e. buf_lng/4 exchange)
188 Result: node 0: [(a+b)+(c+d)] + [(e+f)+(g+h)] for AB
189 node 1: [(a+b)+(c+d)] + [(e+f)+(g+h)] for EF
190 node 2: [(a+b)+(c+d)] + [(e+f)+(g+h)] for CD
191 node 3: [(a+b)+(c+d)] + [(e+f)+(g+h)] for GH
192 node 4: [(i+j)+ k ] + [ l + m ] for AB
193 node 5: [(i+j)+ k ] + [ l + m ] for EF
194 node 6: [(i+j)+ k ] + [ l + m ] for CD
195 node 7: [(i+j)+ k ] + [ l + m ] for GH
200 Same with double distance and once more the half of the buffer.
201 (i.e. buf_lng/2**n exchange)
202 (i.e. count/2**n ops)
205 0: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for A
206 1: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for E
207 2: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for C
208 3: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for G
209 4: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for B
210 5: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for F
211 6: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for D
212 7: { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] } for H
215 For colls::allreduce:
220 Exchange on the last distance (2*n) the last result
221 (i.e. buf_lng/2**n exchange)
225 Same with half the distance and double of the result
226 (i.e. buf_lng/2**(n-1) exchange)
231 Same with distance 1 and double of the result (== half of the
232 original buffer) (i.e. buf_lng/2 exchange)
234 Result after 6.1 6.2 6.n
236 on node 0: AB ABCD ABCDEFGH
237 on node 1: EF EFGH ABCDEFGH
238 on node 2: CD ABCD ABCDEFGH
239 on node 3: GH EFGH ABCDEFGH
240 on node 4: AB ABCD ABCDEFGH
241 on node 5: EF EFGH ABCDEFGH
242 on node 6: CD ABCD ABCDEFGH
243 on node 7: GH EFGH ABCDEFGH
248 transfer the result from the even nodes with old rank < 2*r
249 to the odd nodes with old rank < 2*r
250 (i.e. buf_lng transfer)
252 { [(a+b)+(c+d)] + [(e+f)+(g+h)] } + { [(i+j)+k] + [l+m] }
262 If root node not in the list of the nodes with newranks
265 send last result from node 0 to the root (buf_lng/2**n transfer)
266 and replace the role of node 0 by the root node.
270 Send on the last distance (2**(n-1)) the last result
271 from node with bit '2**(n-1)' in the 'new rank' unequal to that of
272 root's new rank to the node with same '2**(n-1)' bit.
273 (i.e. buf_lng/2**n transfer)
277 Same with half the distance and double of the result
279 (i.e. buf_lng/2**(n-1) transfer)
284 Same with distance 1 and double of the result (== half of the
285 original buffer) and bit '2**0' (i.e. buf_lng/2 transfer)
287 Example: roots old rank: 10
291 action result action result action result
297 on node 4: recv A => AB recv CD => ABCD send ABCD
298 on node 5: recv E => EF recv GH => EFGH recv ABCD => ABCDEFGH
299 on node 6: recv C => CD send CD
300 on node 7: recv G => GH send GH
302 Benchmark results on CRAY T3E
303 -----------------------------
305 uname -a: (sn6307 hwwt3e 1.6.1.51 unicosmk CRAY T3E)
306 MPI: /opt/ctl/mpt/1.1.0.3
308 Ldb[][] = {{ 896,1728, 576, 736},{ 448,1280, 512, 512}}
309 env: export MPI_BUFFER_MAX=4099
310 compiled with: cc -c -O3 -h restrict=f
312 old = binary tree protocol of the vendor
313 new = the new protocol and its implementation
314 mixed = 'new' is used if count > limit(datatype, communicator size)
318 measurement count prot. unit 2 3 4 6 8 12 16 24
319 --------------------------------------------------------------------
320 latency 1 mixed us 20.7 35.1 35.6 49.1 49.2 61.8 62.4 74.8
321 old us 19.0 32.9 33.8 47.1 47.2 61.7 62.1 73.2
322 --------------------------------------------------------------------
323 bandwidth 128k mixed MB/s 75.4 34.9 49.1 27.3 41.1 24.6 38.0 23.7
324 (=buf_lng/time) old MB/s 28.8 16.2 16.3 11.6 11.6 8.8 8.8 7.2
325 ratio = mixed/old 2.6 2.1 3.0 2.4 3.5 2.8 4.3 3.3
326 --------------------------------------------------------------------
327 limit doubles 896 1536 576 736 576 736 576 736
328 bandwidth limit mixed MB/s 35.9 20.5 18.6 12.3 13.5 9.2 10.0 8.6
329 old MB/s 35.9 20.3 17.8 13.0 12.5 9.6 9.2 7.8
330 ratio = mixed/old 1.00 1.01 1.04 0.95 1.08 0.96 1.09 1.10
333 measurement count prot. unit 32 48 64 96 128 192 256
334 ---------------------------------------------------------------
335 latency 1 mixed us 77.8 88.5 90.6 102 1)
336 old us 78.6 87.2 90.1 99.7 108 119 120
337 ---------------------------------------------------------------
338 bandwidth 128k mixed MB/s 35.1 23.3 34.1 22.8 34.4 22.4 33.9
339 (=buf_lng/time) old MB/s 6.0 6.0 6.0 5.2 5.2 4.6 4.6
340 ratio = mixed/old 5.8 3.9 5.7 4.4 6.6 4.8 7.4 5)
341 ---------------------------------------------------------------
342 limit doubles 576 736 576 736 576 736 576 2)
343 bandwidth limit mixed MB/s 9.7 7.5 8.4 6.5 6.9 5.5 5.1 3)
344 old MB/s 7.7 6.4 6.4 5.7 5.5 4.9 4.7 3)
345 ratio = mixed/old 1.26 1.17 1.31 1.14 1.26 1.12 1.08 4)
349 measurement count prot. unit 2 3 4 6 8 12 16 24
350 --------------------------------------------------------------------
351 latency 1 mixed us 28.2 51.0 44.5 74.4 59.9 102 74.2 133
352 old us 26.9 48.3 42.4 69.8 57.6 96.0 75.7 126
353 --------------------------------------------------------------------
354 bandwidth 128k mixed MB/s 74.0 29.4 42.4 23.3 35.0 20.9 32.8 19.7
355 (=buf_lng/time) old MB/s 20.9 14.4 13.2 9.7 9.6 7.3 7.4 5.8
356 ratio = mixed/old 3.5 2.0 3.2 2.4 3.6 2.9 4.4 3.4
357 --------------------------------------------------------------------
358 limit doubles 448 1280 512 512 512 512 512 512
359 bandwidth limit mixed MB/s 26.4 15.1 16.2 8.2 12.4 7.2 10.8 5.7
360 old MB/s 26.1 14.9 15.0 9.1 10.5 6.7 7.3 5.3
361 ratio = mixed/old 1.01 1.01 1.08 0.90 1.18 1.07 1.48 1.08
364 measurement count prot. unit 32 48 64 96 128 192 256
365 ---------------------------------------------------------------
366 latency 1 mixed us 90.3 162 109 190
367 old us 92.7 152 104 179 122 225 135
368 ---------------------------------------------------------------
369 bandwidth 128k mixed MB/s 31.1 19.7 30.1 19.2 29.8 18.7 29.0
370 (=buf_lng/time) old MB/s 5.9 4.8 5.0 4.1 4.4 3.4 3.8
371 ratio = mixed/old 5.3 4.1 6.0 4.7 6.8 5.5 7.7
372 ---------------------------------------------------------------
373 limit doubles 512 512 512 512 512 512 512
374 bandwidth limit mixed MB/s 6.6 5.6 5.7 3.5 4.4 3.2 3.8
375 old MB/s 6.3 4.2 5.4 3.6 4.4 3.1
376 ratio = mixed/old 1.05 1.33 1.06 0.97 1.00 1.03
379 1) This line shows that the overhead to decide which protocol
380 should be used can be ignored.
381 2) This line shows the limit for the count argument.
382 If count < limit then the vendor protocol is used,
383 otherwise the new protocol is used (see variable Ldb).
384 3) These lines show the bandwidth (= buffer length / execution time)
386 4) This line shows that the limit is chosen well if the ratio is
387 between 0.95 (losing 5% for buffer length near and >=limit)
388 and 1.10 (not gaining 10% for buffer length near and <limit).
389 5) This line shows that the new protocol is 2..7 times faster
/* Three alternative implementations of a combined send+receive step; the
 * preprocessor conditionals selecting among them are not visible in this
 * extraction (TODO confirm against full file):
 *   1) non-blocking irecv, blocking send, then wait on the recv;
 *   2) non-blocking isend, blocking recv, then wait on the send
 *      (NOTE(review): `mpi_mpi_isend` looks like a stale identifier --
 *      the other variants use the Request:: API; verify it still exists);
 *   3) a single Request::sendrecv call. */
397 #define MPI_I_Sendrecv(sb, sc, sd, dest, st, rb, rc, rd, source, rt, comm, stat) \
400 req = Request::irecv(rb, rc, rd, source, rt, comm); \
401 Request::send(sb, sc, sd, dest, st, comm); \
402 Request::wait(&req, stat); \
406 #define MPI_I_Sendrecv(sb, sc, sd, dest, st, rb, rc, rd, source, rt, comm, stat) \
409 req = mpi_mpi_isend(sb, sc, sd, dest, st, comm); \
410 Request::recv(rb, rc, rd, source, rt, comm, stat); \
411 Request::wait(&req, stat); \
414 #define MPI_I_Sendrecv(sb, sc, sd, dest, st, rb, rc, rd, source, rt, comm, stat) \
415 Request::sendrecv(sb, sc, sd, dest, st, rb, rc, rd, source, rt, comm, stat)
/* Fragment of the internal MPIM_Datatype enum; the surrounding enumerators
 * are outside this extraction. */
426 MPIM_UNSIGNED_LONG_LONG,
/* Generates the element-wise reduction kernel for a C integer type:
 * rslt[i] = b1[i] <op> b2[i] for i in [0, cnt).  Integer types support the
 * full set: MAX, MIN, SUM, PROD, logical AND/OR/XOR and bitwise AND/OR/XOR
 * (the switch header and case labels sit on lines missing from this
 * extraction). */
444 #define MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_TYPE, TYPE) \
445 static void MPI_I_do_op_TYPE(TYPE* b1, TYPE* b2, TYPE* rslt, int cnt, MPIM_Op op) \
450 for (i = 0; i < cnt; i++) \
451 rslt[i] = (b1[i] > b2[i] ? b1[i] : b2[i]); \
454 for (i = 0; i < cnt; i++) \
455 rslt[i] = (b1[i] < b2[i] ? b1[i] : b2[i]); \
458 for (i = 0; i < cnt; i++) \
459 rslt[i] = b1[i] + b2[i]; \
462 for (i = 0; i < cnt; i++) \
463 rslt[i] = b1[i] * b2[i]; \
466 for (i = 0; i < cnt; i++) \
467 rslt[i] = b1[i] && b2[i]; \
470 for (i = 0; i < cnt; i++) \
471 rslt[i] = b1[i] || b2[i]; \
474 for (i = 0; i < cnt; i++) \
475 rslt[i] = b1[i] != b2[i]; \
478 for (i = 0; i < cnt; i++) \
479 rslt[i] = b1[i] & b2[i]; \
482 for (i = 0; i < cnt; i++) \
483 rslt[i] = b1[i] | b2[i]; \
486 for (i = 0; i < cnt; i++) \
487 rslt[i] = b1[i] ^ b2[i]; \
/* Generates the element-wise reduction kernel for a floating-point type.
 * Only MAX, MIN, SUM and PROD are instantiated -- logical/bitwise ops do
 * not apply to FP operands. */
494 #define MPI_I_DO_OP_FP(MPI_I_do_op_TYPE, TYPE) \
495 static void MPI_I_do_op_TYPE(TYPE* b1, TYPE* b2, TYPE* rslt, int cnt, MPIM_Op op) \
500 for (i = 0; i < cnt; i++) \
501 rslt[i] = (b1[i] > b2[i] ? b1[i] : b2[i]); \
504 for (i = 0; i < cnt; i++) \
505 rslt[i] = (b1[i] < b2[i] ? b1[i] : b2[i]); \
508 for (i = 0; i < cnt; i++) \
509 rslt[i] = b1[i] + b2[i]; \
512 for (i = 0; i < cnt; i++) \
513 rslt[i] = b1[i] * b2[i]; \
/* Generates the element-wise reduction kernel for MPI_BYTE.
 * Only the bitwise ops (BAND, BOR, BXOR) are meaningful on raw bytes. */
520 #define MPI_I_DO_OP_BYTE(MPI_I_do_op_TYPE, TYPE) \
521 static void MPI_I_do_op_TYPE(TYPE* b1, TYPE* b2, TYPE* rslt, int cnt, MPIM_Op op) \
526 for (i = 0; i < cnt; i++) \
527 rslt[i] = b1[i] & b2[i]; \
530 for (i = 0; i < cnt; i++) \
531 rslt[i] = b1[i] | b2[i]; \
534 for (i = 0; i < cnt; i++) \
535 rslt[i] = b1[i] ^ b2[i]; \
/* Instantiate one typed kernel per supported datatype. */
542 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_short, short)
543 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_int, int)
544 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_long, long)
545 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_ushort, unsigned short)
546 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_uint, unsigned int)
547 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_ulong, unsigned long)
548 MPI_I_DO_OP_C_INTEGER(MPI_I_do_op_ulonglong, unsigned long long)
549 MPI_I_DO_OP_FP(MPI_I_do_op_float, float)
550 MPI_I_DO_OP_FP(MPI_I_do_op_double, double)
551 MPI_I_DO_OP_BYTE(MPI_I_do_op_byte, char)
/* Helper for MPI_I_do_op's dispatch switch: forwards the void* buffers to
 * the typed kernel with the appropriate casts. */
553 #define MPI_I_DO_OP_CALL(MPI_I_do_op_TYPE, TYPE) \
554 MPI_I_do_op_TYPE((TYPE*)b1, (TYPE*)b2, (TYPE*)rslt, cnt, op); \
/* Central dispatcher: applies `op` element-wise to b1 and b2, writing cnt
 * elements into rslt, selecting the typed kernel from `datatype`.
 * NOTE(review): the switch header and several case labels/breaks fall on
 * lines missing from this extraction. */
557 static void MPI_I_do_op(void* b1, void* b2, void* rslt, int cnt, MPIM_Datatype datatype, MPIM_Op op)
561 MPI_I_DO_OP_CALL(MPI_I_do_op_short, short)
563 MPI_I_DO_OP_CALL(MPI_I_do_op_int, int)
565 MPI_I_DO_OP_CALL(MPI_I_do_op_long, long)
566 case MPIM_UNSIGNED_SHORT:
567 MPI_I_DO_OP_CALL(MPI_I_do_op_ushort, unsigned short)
569 MPI_I_DO_OP_CALL(MPI_I_do_op_uint, unsigned int)
570 case MPIM_UNSIGNED_LONG:
571 MPI_I_DO_OP_CALL(MPI_I_do_op_ulong, unsigned long)
572 case MPIM_UNSIGNED_LONG_LONG:
573 MPI_I_DO_OP_CALL(MPI_I_do_op_ulonglong, unsigned long long)
575 MPI_I_DO_OP_CALL(MPI_I_do_op_float, float)
577 MPI_I_DO_OP_CALL(MPI_I_do_op_double, double)
579 MPI_I_DO_OP_CALL(MPI_I_do_op_byte, char)
/* Core of Rabenseifner's reduce/allreduce (see the algorithm description in
 * the large comment above).  Handles both MPI_Reduce (is_all=false, `root`
 * receives the result) and MPI_Allreduce (is_all=true, `root` ignored).
 * Falls back to the generic colls:: implementation when `count` is below
 * the measured limit for the datatype/communicator-size class.
 * NOTE(review): many lines of this function (braces, #ifdef guards, some
 * statements) are missing from this extraction; the step comments below
 * follow the numbered plan in the header comment but should be re-checked
 * against the full file. */
586 static int MPI_I_anyReduce(const void* Sendbuf, void* Recvbuf, int count, MPI_Datatype mpi_datatype, MPI_Op mpi_op,
587 int root, MPI_Comm comm, bool is_all)
589 char *scr1buf, *scr2buf, *scr3buf, *xxx, *sendbuf, *recvbuf;
590 int myrank, size, x_base, x_size, computed, idx;
591 int x_start, x_count = 0, r, n, mynewrank, newroot, partner;
592 int start_even[20], start_odd[20], count_even[20], count_odd[20];
/* Map the MPI datatype handle onto the internal MPIM_Datatype enum; the
 * algorithm only supports the types listed here. */
597 MPIM_Datatype datatype = MPIM_INT; MPIM_Op op = MPIM_MAX;
599 if (mpi_datatype==MPI_SHORT ) datatype=MPIM_SHORT;
600 else if(mpi_datatype==MPI_INT ) datatype=MPIM_INT;
601 else if(mpi_datatype==MPI_LONG ) datatype=MPIM_LONG;
602 else if(mpi_datatype==MPI_UNSIGNED_SHORT) datatype=MPIM_UNSIGNED_SHORT;
603 else if(mpi_datatype==MPI_UNSIGNED ) datatype=MPIM_UNSIGNED;
604 else if(mpi_datatype==MPI_UNSIGNED_LONG ) datatype=MPIM_UNSIGNED_LONG;
605 else if(mpi_datatype==MPI_UNSIGNED_LONG_LONG ) datatype=MPIM_UNSIGNED_LONG_LONG;
606 else if(mpi_datatype==MPI_FLOAT ) datatype=MPIM_FLOAT;
607 else if(mpi_datatype==MPI_DOUBLE ) datatype=MPIM_DOUBLE;
608 else if(mpi_datatype==MPI_BYTE ) datatype=MPIM_BYTE;
610 throw std::invalid_argument("reduce rab algorithm can't be used with this datatype!");
/* Map the MPI operation handle onto the internal MPIM_Op enum. */
612 if (mpi_op==MPI_MAX ) op=MPIM_MAX;
613 else if(mpi_op==MPI_MIN ) op=MPIM_MIN;
614 else if(mpi_op==MPI_SUM ) op=MPIM_SUM;
615 else if(mpi_op==MPI_PROD ) op=MPIM_PROD;
616 else if(mpi_op==MPI_LAND ) op=MPIM_LAND;
617 else if(mpi_op==MPI_BAND ) op=MPIM_BAND;
618 else if(mpi_op==MPI_LOR ) op=MPIM_LOR;
619 else if(mpi_op==MPI_BOR ) op=MPIM_BOR;
620 else if(mpi_op==MPI_LXOR ) op=MPIM_LXOR;
621 else if(mpi_op==MPI_BXOR ) op=MPIM_BXOR;
/* Classify the communicator size (ss = 0:two, 1:three, 2:power of 2,
 * 3:other) and decide via the L*[is_all][ss] limit tables whether the new
 * protocol pays off for this count/datatype/op combination. */
624 MPI_Comm_size(comm, &size);
625 if (size > 1) /*otherwise no balancing_protocol*/
628 else if (size==3) ss=1;
629 else { int s = size; while (!(s & 1)) s = s >> 1;
630 if (s==1) /* size == power of 2 */ ss = 2;
631 else /* size != power of 2 */ ss = 3; }
633 case MPIM_MAX: case MPIM_MIN: case MPIM_SUM: case MPIM_PROD:
634 case MPIM_LAND: case MPIM_LOR: case MPIM_LXOR:
635 case MPIM_BAND: case MPIM_BOR: case MPIM_BXOR:
637 case MPIM_SHORT: case MPIM_UNSIGNED_SHORT:
638 new_prot = count >= Lsh[is_all][ss]; break;
639 case MPIM_INT: case MPIM_UNSIGNED:
640 new_prot = count >= Lin[is_all][ss]; break;
641 case MPIM_LONG: case MPIM_UNSIGNED_LONG: case MPIM_UNSIGNED_LONG_LONG:
642 new_prot = count >= Llg[is_all][ss]; break;
648 case MPIM_MAX: case MPIM_MIN: case MPIM_SUM: case MPIM_PROD:
651 new_prot = count >= Lfp[is_all][ss]; break;
653 new_prot = count >= Ldb[is_all][ss]; break;
659 case MPIM_BAND: case MPIM_BOR: case MPIM_BXOR:
662 new_prot = count >= Lby[is_all][ss]; break;
668 { char *ss_str[]={"two","three","power of 2","no power of 2"};
669 printf("MPI_(All)Reduce: is_all=%1d, size=%1d=%s, new_prot=%1d\n",
670 is_all, size, ss_str[ss], new_prot); fflush(stdout);
/* --- new protocol selected: set up scratch buffers ---------------------- */
677 sendbuf = (char*) Sendbuf;
678 recvbuf = (char*) Recvbuf;
679 MPI_Comm_rank(comm, &myrank);
680 MPI_Type_extent(mpi_datatype, &typelng);
681 scrlng = typelng * count;
682 #ifdef NO_CACHE_OPTIMIZATION
683 scr1buf = new char[scrlng];
684 scr2buf = new char[scrlng];
685 scr3buf = new char[scrlng];
687 # ifdef SCR_LNG_OPTIM
688 scrlng = SCR_LNG_OPTIM(scrlng);
/* One allocation for all three scratch buffers; scr1buf/scr3buf must be
 * derived pointers because they are swapped with each other below. */
690 scr2buf = new char[3 * scrlng]; /* To test cache problems. */
691 scr1buf = scr2buf + 1*scrlng; /* scr1buf and scr3buf must not*/
692 scr3buf = scr2buf + 2*scrlng; /* be used for malloc because */
693 /* they are interchanged below.*/
/* For allreduce every rank keeps the result, so treat each rank as root. */
696 if (is_all) root = myrank; /* for correct recvbuf handling */
/* Step 1: compute n = floor(log2(size)) and x_size = 2**n, r = size-x_size. */
701 printf("[%2d] step 1 begin\n",myrank); fflush(stdout);
704 while (2*x_size <= size) { n++; x_size = x_size * 2; }
/* Step 2: pre-combine the first 2*r ranks pairwise so that 2**n nodes
 * remain (even rank reduces ABCD half, odd rank reduces EFGH half, then the
 * even rank collects both halves into scr1buf). */
711 printf("[%2d] step 2 begin n=%d, r=%d\n",myrank,n,r);fflush(stdout);
715 if ((myrank % 2) == 0 /*even*/)
717 MPI_I_Sendrecv(sendbuf + (count/2)*typelng,
718 count - count/2, mpi_datatype, myrank+1, 1220,
719 scr2buf, count/2,mpi_datatype, myrank+1, 1221,
721 MPI_I_do_op(sendbuf, scr2buf, scr1buf,
722 count/2, datatype, op);
723 Request::recv(scr1buf + (count/2)*typelng, count - count/2,
724 mpi_datatype, myrank+1, 1223, comm, &status);
727 { int i; printf("[%2d] after step 2: val=",
729 for (i=0; i<count; i++)
730 printf(" %5.0lf",((double*)scr1buf)[i] );
731 printf("\n"); fflush(stdout);
737 MPI_I_Sendrecv(sendbuf, count/2,mpi_datatype, myrank-1, 1221,
738 scr2buf + (count/2)*typelng,
739 count - count/2, mpi_datatype, myrank-1, 1220,
741 MPI_I_do_op(scr2buf + (count/2)*typelng,
742 sendbuf + (count/2)*typelng,
743 scr1buf + (count/2)*typelng,
744 count - count/2, datatype, op);
745 Request::send(scr1buf + (count/2)*typelng, count - count/2,
746 mpi_datatype, myrank-1, 1223, comm);
/* Steps 3+4: drop the odd ranks below 2*r and renumber the remaining
 * 2**n nodes (mynewrank). */
753 printf("[%2d] step 3+4 begin\n",myrank); fflush(stdout);
755 if ((myrank >= 2*r) || ((myrank%2 == 0) && (myrank < 2*r)))
756 mynewrank = (myrank < 2*r ? myrank/2 : myrank-r);
760 { /* begin -- only for nodes with new rank */
762 # define OLDRANK(new) ((new) < r ? (new)*2 : (new)+r)
/* Step 5: recursive halving -- at each level exchange the unneeded buffer
 * half with the partner at distance x_base and reduce the kept half;
 * start/count tables are recorded for the redistribution in step 6. */
768 for (idx=0, x_base=1; idx<n; idx++, x_base=x_base*2)
770 start_even[idx] = x_start;
771 count_even[idx] = x_count / 2;
772 start_odd [idx] = x_start + count_even[idx];
773 count_odd [idx] = x_count - count_even[idx];
774 if (((mynewrank/x_base) % 2) == 0 /*even*/)
777 printf("[%2d](%2d) step 5.%d begin even c=%1d\n",
778 myrank,mynewrank,idx+1,computed); fflush(stdout);
780 x_start = start_even[idx];
781 x_count = count_even[idx];
782 MPI_I_Sendrecv((computed ? scr1buf : sendbuf)
783 + start_odd[idx]*typelng, count_odd[idx],
784 mpi_datatype, OLDRANK(mynewrank+x_base), 1231,
785 scr2buf + x_start*typelng, x_count,
786 mpi_datatype, OLDRANK(mynewrank+x_base), 1232,
788 MPI_I_do_op((computed?scr1buf:sendbuf) + x_start*typelng,
789 scr2buf + x_start*typelng,
790 ((root==myrank) && (idx==(n-1))
791 ? recvbuf + x_start*typelng
792 : scr3buf + x_start*typelng),
793 x_count, datatype, op);
798 printf("[%2d](%2d) step 5.%d begin odd c=%1d\n",
799 myrank,mynewrank,idx+1,computed); fflush(stdout);
801 x_start = start_odd[idx];
802 x_count = count_odd[idx];
803 MPI_I_Sendrecv((computed ? scr1buf : sendbuf)
804 +start_even[idx]*typelng, count_even[idx],
805 mpi_datatype, OLDRANK(mynewrank-x_base), 1232,
806 scr2buf + x_start*typelng, x_count,
807 mpi_datatype, OLDRANK(mynewrank-x_base), 1231,
809 MPI_I_do_op(scr2buf + x_start*typelng,
810 (computed?scr1buf:sendbuf) + x_start*typelng,
811 ((root==myrank) && (idx==(n-1))
812 ? recvbuf + x_start*typelng
813 : scr3buf + x_start*typelng),
814 x_count, datatype, op);
/* Swap result and scratch so the freshly reduced data is in scr1buf for
 * the next level. */
816 xxx = scr3buf; scr3buf = scr1buf; scr1buf = xxx;
819 { int i; printf("[%2d](%2d) after step 5.%d end: start=%2d count=%2d val=",
820 myrank,mynewrank,idx+1,x_start,x_count);
821 for (i=0; i<x_count; i++)
822 printf(" %5.0lf",((double*)((root==myrank)&&(idx==(n-1))
823 ? recvbuf + x_start*typelng
824 : scr1buf + x_start*typelng))[i] );
825 printf("\n"); fflush(stdout);
833 } /* end -- only for nodes with new rank */
/* --- allreduce branch: steps 6.1..6.n gather the distributed pieces back
 * into every node's recvbuf by recursive doubling ----------------------- */
837 /*...steps 6.1 to 6.n */
840 { /* begin -- only for nodes with new rank */
842 # define OLDRANK(new) ((new) < r ? (new)*2 : (new)+r)
844 for(idx=n-1, x_base=x_size/2; idx>=0; idx--, x_base=x_base/2)
847 printf("[%2d](%2d) step 6.%d begin\n",myrank,mynewrank,n-idx); fflush(stdout);
849 if (((mynewrank/x_base) % 2) == 0 /*even*/)
851 MPI_I_Sendrecv(recvbuf + start_even[idx]*typelng,
853 mpi_datatype, OLDRANK(mynewrank+x_base),1241,
854 recvbuf + start_odd[idx]*typelng,
856 mpi_datatype, OLDRANK(mynewrank+x_base),1242,
859 x_start = start_odd[idx];
860 x_count = count_odd[idx];
865 MPI_I_Sendrecv(recvbuf + start_odd[idx]*typelng,
867 mpi_datatype, OLDRANK(mynewrank-x_base),1242,
868 recvbuf + start_even[idx]*typelng,
870 mpi_datatype, OLDRANK(mynewrank-x_base),1241,
873 x_start = start_even[idx];
874 x_count = count_even[idx];
878 { int i; printf("[%2d](%2d) after step 6.%d end: start=%2d count=%2d val=",
879 myrank,mynewrank,n-idx,x_start,x_count);
880 for (i=0; i<x_count; i++)
881 printf(" %5.0lf",((double*)(root==myrank
882 ? recvbuf + x_start*typelng
883 : scr1buf + x_start*typelng))[i] );
884 printf("\n"); fflush(stdout);
891 } /* end -- only for nodes with new rank */
/* Step 7 (allreduce): even ranks below 2*r forward the full result to the
 * odd partner dropped in step 3. */
898 printf("[%2d] step 7 begin\n",myrank); fflush(stdout);
900 if (myrank%2 == 0 /*even*/)
901 Request::send(recvbuf, count, mpi_datatype, myrank+1, 1253, comm);
903 Request::recv(recvbuf, count, mpi_datatype, myrank-1, 1253, comm, &status);
/* --- reduce branch: route the distributed pieces toward `root` --------- */
907 else /* not is_all, i.e. Reduce */
/* Step 6.0: if the root is an odd dropped rank, node 0 ships its piece to
 * the root, which then takes over node 0's role in the gather. */
912 if ((root < 2*r) && (root%2 == 1))
915 printf("[%2d] step 6.0 begin\n",myrank); fflush(stdout);
917 if (myrank == 0) /* then mynewrank==0, x_start==0
918 x_count == count/x_size */
920 Request::send(scr1buf,x_count,mpi_datatype,root,1241,comm);
/* Root replays the step-5 start/count bookkeeping (always-even path) to
 * know which piece it is about to receive from node 0. */
929 for (idx=0, x_base=1; idx<n; idx++, x_base=x_base*2)
931 start_even[idx] = x_start;
932 count_even[idx] = x_count / 2;
933 start_odd [idx] = x_start + count_even[idx];
934 count_odd [idx] = x_count - count_even[idx];
935 /* code for always even in each bit of mynewrank: */
936 x_start = start_even[idx];
937 x_count = count_even[idx];
939 Request::recv(recvbuf,x_count,mpi_datatype,0,1241,comm,&status);
945 newroot = (root < 2*r ? root/2 : root-r);
/* Steps 6.1..6.n: send each piece one hop toward the root along the
 * hypercube dimension where the sender's new-rank bit differs from the
 * root's. */
948 /*...steps 6.1 to 6.n */
951 { /* begin -- only for nodes with new rank */
953 #define OLDRANK(new) ((new) == newroot ? root : ((new) < r ? (new) * 2 : (new) + r))
955 for(idx=n-1, x_base=x_size/2; idx>=0; idx--, x_base=x_base/2)
958 printf("[%2d](%2d) step 6.%d begin\n",myrank,mynewrank,n-idx); fflush(stdout);
960 if ((mynewrank & x_base) != (newroot & x_base))
962 if (((mynewrank/x_base) % 2) == 0 /*even*/)
963 { x_start = start_even[idx]; x_count = count_even[idx];
964 partner = mynewrank+x_base; }
966 { x_start = start_odd[idx]; x_count = count_odd[idx];
967 partner = mynewrank-x_base; }
968 Request::send(scr1buf + x_start*typelng, x_count, mpi_datatype,
969 OLDRANK(partner), 1244, comm);
973 if (((mynewrank/x_base) % 2) == 0 /*even*/)
974 { x_start = start_odd[idx]; x_count = count_odd[idx];
975 partner = mynewrank+x_base; }
977 { x_start = start_even[idx]; x_count = count_even[idx];
978 partner = mynewrank-x_base; }
979 Request::recv((myrank==root ? recvbuf : scr1buf)
980 + x_start*typelng, x_count, mpi_datatype,
981 OLDRANK(partner), 1244, comm, &status);
983 { int i; printf("[%2d](%2d) after step 6.%d end: start=%2d count=%2d val=",
984 myrank,mynewrank,n-idx,x_start,x_count);
985 for (i=0; i<x_count; i++)
986 printf(" %5.0lf",((double*)(root==myrank
987 ? recvbuf + x_start*typelng
988 : scr1buf + x_start*typelng))[i] );
989 printf("\n"); fflush(stdout);
997 } /* end -- only for nodes with new rank */
/* Release scratch storage (single allocation owns all three buffers). */
1000 # ifdef NO_CACHE_TESTING
1005 delete[] scr2buf; /* scr1buf and scr3buf are part of scr2buf */
1007 return(MPI_SUCCESS);
/* Fallback when count is below the limit: delegate to the generic
 * implementations. */
1011 return (colls::allreduce(Sendbuf, Recvbuf, count, mpi_datatype, mpi_op, comm));
1013 return (colls::reduce(Sendbuf, Recvbuf, count, mpi_datatype, mpi_op, root, comm));
1015 #endif /*REDUCE_LIMITS*/
/* Public entry point for MPI_Reduce using Rabenseifner's algorithm:
 * thin wrapper that forwards to MPI_I_anyReduce with is_all=false. */
1017 int reduce__rab(const void* Sendbuf, void* Recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root,
1020 return MPI_I_anyReduce(Sendbuf, Recvbuf, count, datatype, op, root, comm, false);
/* Public entry point for MPI_Allreduce using Rabenseifner's algorithm:
 * forwards to MPI_I_anyReduce with is_all=true (root is unused, hence -1). */
1023 int allreduce__rab(const void* Sendbuf, void* Recvbuf, int count, MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
1025 return MPI_I_anyReduce(Sendbuf, Recvbuf, count, datatype, op, -1, comm, true);