1 c PING_PONG two-node message exchanges benchmark program
3 C Contributed by Richard Frost <frost@SDSC.EDU>, caused problems
4 C on t3d with -device=t3d -arch=cray_t3d -no_short_longs -nodevdebug
7 C This is a very time-consuming program on a workstation cluster.
8 C For this reason, I've modified it to do fewer tests (1/10 as many)
11 c This is a simple benchmark designed to measure the latency and bandwidth
12 c of a message-passing MIMD computer. It is currently set up to run with
15 c Compile (MPI mpich version 1.0.11 or later) with
16 c % mpif77 -o pong pong.f
18 c (mpif77 is a script that hides details about libraries from the user)
23 c Make sure that ~mpi/bin is in your path.
25 c Note that the MPI-specific calls are:
39 c Some care needs to be taken in using the
40 c appropriate timing routine. Check the value of MPI_Wtick() to see if
41 c the clock resolution is reasonable for your tests.
43 c The benchmark measures
44 c the time to send a message of length N bytes from node 0 to node 1 and
45 c receive an acknowledging copy of that message which node 1 sends back to
46 c node 0. Note that node 1 waits for the completion of its receive before
47 c sending the message back to node 0. Note also that the program is not
48 c necessarily optimal any given system, but is intended
49 c to provide a reasonably transparent baseline measurement.
51 c For message lengths len (= num of doubles * sizedouble),
52 c a total of msgspersample ping-pong message exchanges are made,
53 c and half of the average round-trip time (i.e. the one-way message
54 c time) is then fit by a linear function y(N) = a + b*N via a least squares
55 c linear regression. The coefficient a is then interpreted as the latency
56 c (time to send a 0-length message) and b as the inverse bandwidth (i.e. 1/b =
57 c bandwidth in bytes/sec)
59 c The entire procedure is repeated twice, with the bandwidth, latency, and
60 c measured and fitted values of the message times reported for each instance.
62 c The underlying message passing performance characteristics of a
63 c particular system may not necessarily be accurately modeled by the simple
64 c linear function assumed here. This may be reflected in a dependency of
65 c the observed latency and bandwidth on the range of message sizes used.
68 c R. Leary, San Diego Supercomputer Center
69 c leary@sdsc.edu 9/20/94
71 c Modified for MPI 10/27/95
75 c =========================== program header ===========================
82 c sizedouble = size in bytes of double precision element
84 parameter(sizedouble=8)
86 c Note: set these parameters to one of 3 cases:
87 c 1. size (each sample) < packetization length of architecture
88 c 2. size (each sample) > packetization length of architecture
89 c 3. size (1st sample) < packetization length of architecture
90 c & size (all others) > packetization length of architecture
92 c Some known packetization lengths:
94 c Cray T3D ~1500 bytes
95 c TCP/IP networks 256-1024 bytes
97 c samples = the number of data points collected
100 c initsamplesize = # of elements transmitted in 1st sample
101 integer initsamplesize
102 parameter(initsamplesize=125)
103 c samplesizeinc = sample size increase per iteration (linear rate)
104 integer samplesizeinc
105 parameter(samplesizeinc=125)
106 c parameter(samplesizeinc=1)
107 c msgspersample = the number of messages
108 integer msgspersample
109 c parameter(msgspersample=1000)
110 parameter(msgspersample=100)
112 c The buffer array contains the message , while x(i) is the message size
113 c and y(i) the corresponding measured one-way average time.
114 c Note that buffer is a double precision array
116 c ibufcount = total number of elements in buffer
118 parameter(ibufcount=(initsamplesize+((samples-1)*samplesizeinc)))
120 double precision buffer(ibufcount)
121 double precision x(samples), y(samples)
122 double precision t1, t2
123 double precision a, b, bandw
124 double precision sumx, sumy, sumxx, sumxy
125 double precision det, fit
127 integer stat(MPI_STATUS_SIZE)
128 integer ierr, ierr1, ierr2
129 integer nodenum, numprocs
131 integer i, iter, sample
135 c =========================== begin ===========================
138 call MPI_INIT( ierr )
139 call MPI_COMM_RANK( MPI_COMM_WORLD, nodenum, ierr )
140 call MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
142 if (numprocs .ne. 2) then
143 write (6,*) 'This program is only valid for 2 processors'
144 write (6,*) 'numprocs = ', numprocs
148 c Put something into array
150 buffer(i) = dfloat(i)
153 if (nodenum .eq. 0) then
154 write (6,*) ' MPI pong test'
155 write (6,*) ' samples = ', samples
156 write (6,*) ' initsamplesize = ', initsamplesize
157 write (6,*) ' samplesizeinc = ', samplesizeinc
158 write (6,*) ' msgspersample = ', msgspersample
159 write (6,*) ' ibufcount = ', ibufcount
160 write (6,98) MPI_Wtick()
163 98 format (' clock resolution = ',e10.5)
165 call MPI_BARRIER(MPI_COMM_WORLD, ierr)
168 c =========================== main loop ===========================
171 c Start main loop - iterate twice to generate two complete sets of timings
173 do 40 sample = 1,samples
174 num = initsamplesize + ((sample-1)*samplesizeinc)
177 write (6,99) nodenum, iter, sample, num
178 call MPI_BARRIER(MPI_COMM_WORLD, ierr)
179 99 format ( 1x, 'PE = ', i1, ', iter = ',i1,
180 + ', sample = ', i3, ', num = ', i5 )
182 c Find initial elapsed time in seconds
184 if(nodenum.eq.0) then
185 c Send message from node 0 to 1 and receive message from 1
188 do 20 i = 1,msgspersample
189 call MPI_SEND(buffer, num, MPI_DOUBLE_PRECISION,
190 + idest, 0, MPI_COMM_WORLD, ierr1)
191 call MPI_RECV(buffer, num, MPI_DOUBLE_PRECISION,
192 + MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD,
197 c Send message from node 1 to 0 and receive message from 0
200 do 21 i = 1,msgspersample
201 call MPI_RECV(buffer, num, MPI_DOUBLE_PRECISION,
202 + MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD,
204 call MPI_SEND(buffer, num, MPI_DOUBLE_PRECISION,
205 + idest, 0, MPI_COMM_WORLD, ierr1)
210 c independent variable is message length:
211 x(sample) = dfloat(num * sizedouble)
213 c dependent variable is average one-way transit time:
214 y(sample) = ((t2 - t1) * 0.5) /
215 + dfloat(msgspersample)
219 c now do linear least squares fit to data
222 if (nodenum .eq. 0) then
230 sumxy = sumxy + ( x(i) * y(i) )
231 sumxx = sumxx + ( x(i) * x(i) )
234 det = (dfloat(samples) * sumxx) - (sumx * sumx)
235 a = (1.d6 * ((sumxx * sumy) - (sumx * sumxy))) / det
236 b = (1.d6 * ((dfloat(samples) * sumxy) - (sumx * sumy))) / det
239 write(6,*) ' iter = ', iter
241 write(6,*) ' least squares fit: time = a + b * (msg length)'
247 write(6,*) ' message observed fitted'
248 write(6,*) ' length(bytes) time(usec) time(usec)'
253 write(6,100) x(i),y(i),fit
260 c =========================== end loop ===========================
263 100 format(3x,f8.0,5x,f12.2,5x,f12.2)
264 200 format(5x,'a = latency = ',f8.2,' microseconds')
265 300 format(5x,'b = inverse bandwidth = ' , f8.5,' secs/Mbyte')
266 400 format(5x,'1/b = bandwidth = ',f8.2,' Mbytes/sec')
269 c =========================== end program ===========================
272 call MPI_FINALIZE(ierr)