teshsuite/smpi/mpich-test/pt2pt/pingpong.f

   1 c PING_PONG two-node message exchanges benchmark program
   2 C
   3 C Contributed by Richard Frost <frost@SDSC.EDU>, caused problems
   4 C on t3d with -device=t3d -arch=cray_t3d -no_short_longs -nodevdebug
   5 C
   6 C
   7 C This is a very time-consuming program on a workstation cluster.
   8 C For this reason, I've modified it to do fewer tests (1/10 as many)
   9 C
  10 c
  11 c This is a simple benchmark designed to measure the latency and bandwidth
  12 c of a message-passing MIMD computer.  It is currently set up to run with
  13 c MPI.
  14 c
  15 c Compile (MPI mpich version 1.0.11 or later) with
  16 c     % mpif77 -o pong pong.f
  17 c
  18 c (mpif77 is a script that hides details about libraries from the user)
  19 c
  20 c Execute as
  21 c     % mpirun -np 2 pong
  22 c
  23 c Make sure that ~mpi/bin is in your path.
  24 c
  25 c Note that the MPI-specific calls are:
  26 c
  27 c      MPI_INIT
  28 c      MPI_COMM_RANK
  29 c      MPI_COMM_SIZE
  30 c
  31 c      MPI_Wtime
  32 c      MPI_Wtick
  33 c
  34 c      MPI_SEND
  35 c      MPI_RECV
  36 c
  37 c      MPI_FINALIZE
  38 c
  39 c Some care needs to be taken in using the
  40 c appropriate timing routine.  Check the value of MPI_Wtick() to see if
  41 c the clock resolution is reasonable for your tests.
  42 c
  43 c The benchmark measures
  44 c the time to send a message of length N bytes from node 0 to node 1 and
  45 c receive an acknowledging copy of that message which node 1 sends back to
  46 c node 0.  Note that node 1 waits for the completion of its receive before
  47 c sending the message back to node 0. Note also that the program is not
  48 c necessarily optimal any given system, but is intended
  49 c to provide a reasonably transparent baseline measurement.
  50 c
  51 c For message lengths len (= num of doubles * sizedouble),
  52 c a total of msgspersample ping-pong message exchanges are made,
  53 c and half of the average round-trip time (i.e. the one-way message
  54 c time) is then fit by a linear function y(N) = a + b*N via a least squares
  55 c linear regression.  The coefficient a is then interpreted as the latency
  56 c (time to send a 0-length message) and b as the inverse bandwidth (i.e. 1/b =
  57 c bandwidth in bytes/sec)
  58 c
  59 c The entire procedure is repeated twice, with the bandwidth, latency, and
  60 c measured and fitted values of the message times reported for each instance.
  61 c
  62 c The underlying message passing performance characteristics of a
  63 c particular system may not necessarily be accurately modeled by the simple
  64 c linear function assumed here.  This may be reflected in a dependency of
  65 c the observed latency and bandwidth on the range of message sizes used.
  66 c
  67 c Original author:
  68 c R. Leary, San Diego Supercomputer Center
  69 c leary@sdsc.edu        9/20/94
  70 c
  71 c Modified for MPI     10/27/95
  72 c frost@sdsc.edu
  73
  74 c
  75 c =========================== program header ===========================
  76 c
  77
  78       program pong
  79       implicit none
  80       include 'mpif.h'
  81
  82 c sizedouble = size in bytes of double precision element
  83       integer sizedouble
  84       parameter(sizedouble=8)
  85
  86 c Note: set these parameters to one of 3 cases:
  87 c  1. size (each sample) < packetization length of architecture
  88 c  2. size (each sample) > packetization length of architecture
  89 c  3. size (1st sample) < packetization length of architecture
  90 c   & size (all others) > packetization length of architecture
  91 c
  92 c  Some known packetization lengths:
  93 c    Paragon            ~1500    bytes
  94 c    Cray T3D           ~1500    bytes
  95 c    TCP/IP networks    256-1024 bytes
  96 c
  97 c samples = the number of data points collected
  98       integer samples
  99       parameter(samples=40)
 100 c initsamplesize = # of elements transmitted in 1st sample
 101       integer initsamplesize
 102       parameter(initsamplesize=125)
 103 c samplesizeinc = sample size increase per iteration (linear rate)
 104       integer samplesizeinc
 105       parameter(samplesizeinc=125)
 106 c     parameter(samplesizeinc=1)
 107 c msgspersample = the number of messages
 108       integer msgspersample
 109 c      parameter(msgspersample=1000)
 110        parameter(msgspersample=100)
 111
 112 c The buffer array contains the message , while x(i) is the message size
 113 c and y(i) the corresponding measured one-way average time.
 114 c Note that buffer is a double precision array
 115 c
 116 c ibufcount = total number of elements in buffer
 117       integer ibufcount
 118       parameter(ibufcount=(initsamplesize+((samples-1)*samplesizeinc)))
 119 c
 120       double precision buffer(ibufcount)
 121       double precision x(samples), y(samples)
 122       double precision t1, t2
 123       double precision a, b, bandw
 124       double precision sumx, sumy, sumxx, sumxy
 125       double precision det, fit
 126
 127       integer stat(MPI_STATUS_SIZE)
 128       integer ierr, ierr1, ierr2
 129       integer nodenum, numprocs
 130       integer idest
 131       integer i, iter, sample
 132       integer num
 133
 134 c
 135 c =========================== begin ===========================
 136 c
 137
 138       call MPI_INIT( ierr )
 139       call MPI_COMM_RANK( MPI_COMM_WORLD, nodenum, ierr )
 140       call MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
 141
 142       if (numprocs .ne. 2) then
 143          write (6,*) 'This program is only valid for 2 processors'
 144          write (6,*) 'numprocs = ', numprocs
 145          stop
 146       endif
 147
 148 c Put something into array
 149       do 2 i=1,ibufcount
 150         buffer(i) = dfloat(i)
 151     2 continue
 152
 153       if (nodenum .eq. 0) then
 154          write (6,*) ' MPI pong test'
 155          write (6,*) ' samples = ', samples
 156          write (6,*) ' initsamplesize = ', initsamplesize
 157          write (6,*) ' samplesizeinc = ', samplesizeinc
 158          write (6,*) ' msgspersample = ', msgspersample
 159          write (6,*) ' ibufcount = ', ibufcount
 160          write (6,98) MPI_Wtick()
 161          write (6,*)
 162       endif
 163    98 format (' clock resolution = ',e10.5)
 164
 165       call MPI_BARRIER(MPI_COMM_WORLD, ierr)
 166
 167 c
 168 c =========================== main loop ===========================
 169 c
 170
 171 c Start main loop - iterate twice to generate two complete sets of timings
 172       do 60 iter = 1,2
 173       do 40 sample = 1,samples
 174       num = initsamplesize + ((sample-1)*samplesizeinc)
 175
 176 c debug
 177       write (6,99) nodenum, iter, sample, num
 178       call MPI_BARRIER(MPI_COMM_WORLD, ierr)
 179    99 format ( 1x, 'PE = ', i1, ', iter = ',i1,
 180      +             ', sample = ', i3, ', num = ', i5 )
 181
 182 c Find initial elapsed time in seconds
 183
 184       if(nodenum.eq.0) then
 185 c Send message from node 0 to 1 and receive message from 1
 186         idest = 1
 187         t1 = MPI_Wtime()
 188         do 20 i = 1,msgspersample
 189            call MPI_SEND(buffer, num, MPI_DOUBLE_PRECISION,
 190      +              idest, 0, MPI_COMM_WORLD, ierr1)
 191            call MPI_RECV(buffer, num, MPI_DOUBLE_PRECISION,
 192      +              MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD,
 193      +              stat, ierr2)
 194    20   continue
 195         t2 = MPI_Wtime()
 196       else
 197 c Send message from node 1 to 0 and receive message from 0
 198         idest = 0
 199         t1 = MPI_Wtime()
 200         do 21 i = 1,msgspersample
 201            call MPI_RECV(buffer, num, MPI_DOUBLE_PRECISION,
 202      +              MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD,
 203      +              stat, ierr2)
 204            call MPI_SEND(buffer, num, MPI_DOUBLE_PRECISION,
 205      +              idest, 0, MPI_COMM_WORLD, ierr1)
 206    21   continue
 207         t2 = MPI_Wtime()
 208       endif
 209
 210 c independent variable is message length:
 211       x(sample) = dfloat(num * sizedouble)
 212
 213 c dependent variable is average one-way transit time:
 214       y(sample) = ((t2 - t1) * 0.5) /
 215      +            dfloat(msgspersample)
 216
 217    40 continue
 218
 219 c now do linear least squares fit to data
 220 c time = a + b*x
 221
 222       if (nodenum .eq. 0) then
 223       sumy = 0.d0
 224       sumx = 0.d0
 225       sumxy = 0.d0
 226       sumxx = 0. d0
 227       do 45 i=1,samples
 228          sumx = sumx + x(i)
 229          sumy = sumy + y(i)
 230          sumxy = sumxy + ( x(i) * y(i) )
 231          sumxx = sumxx + ( x(i) * x(i) )
 232    45 continue
 233
 234       det = (dfloat(samples) * sumxx) - (sumx * sumx)
 235       a = (1.d6 * ((sumxx * sumy) - (sumx * sumxy))) / det
 236       b = (1.d6 * ((dfloat(samples) * sumxy) - (sumx * sumy))) / det
 237
 238       write(6,*)
 239       write(6,*) ' iter = ', iter
 240       write(6,*)
 241       write(6,*) ' least squares fit:  time = a + b * (msg length)'
 242       write(6,200) a
 243       write(6,300) b
 244       bandw = 1./b
 245       write(6,400) bandw
 246       write(6,*)
 247       write(6,*) '    message         observed          fitted'
 248       write(6,*) ' length(bytes)     time(usec)       time(usec)'
 249       write(6,*)
 250       do 50 i=1,samples
 251          fit = a + b*x(i)
 252          y(i) = y(i)*1.d6
 253          write(6,100) x(i),y(i),fit
 254    50 continue
 255       endif
 256
 257    60 continue
 258
 259 c
 260 c =========================== end loop ===========================
 261 c
 262
 263   100 format(3x,f8.0,5x,f12.2,5x,f12.2)
 264   200 format(5x,'a = latency = ',f8.2,' microseconds')
 265   300 format(5x,'b = inverse bandwidth = ' , f8.5,' secs/Mbyte')
 266   400 format(5x,'1/b = bandwidth = ',f8.2,' Mbytes/sec')
 267
 268 c
 269 c =========================== end program ===========================
 270 c
 271
 272       call MPI_FINALIZE(ierr)
 273
 274       end