c PING_PONG two-node message exchange benchmark program
C
C     Contributed by Richard Frost; caused problems
C     on t3d with -device=t3d -arch=cray_t3d -no_short_longs -nodevdebug
C
C
C     This is a very time-consuming program on a workstation cluster.
C     For this reason, I've modified it to do fewer tests (1/10 as many)
C
c
c This is a simple benchmark designed to measure the latency and
c bandwidth of a message-passing MIMD computer.  It is currently set up
c to run with MPI.
c
c Compile (MPI mpich version 1.0.11 or later) with
c     % mpif77 -o pong pong.f
c
c (mpif77 is a script that hides details about libraries from the user)
c
c Execute as
c     % mpirun -np 2 pong
c
c Make sure that ~mpi/bin is in your path.
c
c Note that the MPI-specific calls are:
c
c     MPI_INIT
c     MPI_COMM_RANK
c     MPI_COMM_SIZE
c
c     MPI_Wtime
c     MPI_Wtick
c
c     MPI_SEND
c     MPI_RECV
c
c     MPI_FINALIZE
c
c Some care needs to be taken in using the appropriate timing routine.
c Check the value of MPI_Wtick() to see if the clock resolution is
c reasonable for your tests.
c
c The benchmark measures the time to send a message of length N bytes
c from node 0 to node 1 and receive an acknowledging copy of that
c message, which node 1 sends back to node 0.  Note that node 1 waits
c for the completion of its receive before sending the message back to
c node 0.  Note also that the program is not necessarily optimal on
c any given system, but is intended to provide a reasonably
c transparent baseline measurement.
c
c For each message length len (= num of doubles * sizedouble), a total
c of msgspersample ping-pong message exchanges are made, and half of
c the average round-trip time (i.e. the one-way message time) is then
c fit by a linear function y(N) = a + b*N via a least squares linear
c regression.  The coefficient a is then interpreted as the latency
c (time to send a 0-length message) and b as the inverse bandwidth
c (i.e. 1/b = bandwidth in bytes/sec).
c
c The entire procedure is repeated twice, with the bandwidth, latency,
c and measured and fitted values of the message times reported for
c each instance.
c
c The underlying message-passing performance characteristics of a
c particular system may not necessarily be accurately modeled by the
c simple linear function assumed here.  This may be reflected in a
c dependency of the observed latency and bandwidth on the range of
c message sizes used.
c
c Original author:
c     R. Leary, San Diego Supercomputer Center
c     leary@sdsc.edu    9/20/94
c
c Modified for MPI 10/27/95
c     frost@sdsc.edu
c
c =========================== program header ===========================
c
      program pong

      implicit none

      include 'mpif.h'

c sizedouble = size in bytes of a double precision element
      integer sizedouble
      parameter(sizedouble=8)

c Note: set these parameters to one of 3 cases:
c   1. size (each sample)   < packetization length of architecture
c   2. size (each sample)   > packetization length of architecture
c   3. size (1st sample)    < packetization length of architecture
c      & size (all others)  > packetization length of architecture
c
c Some known packetization lengths:
c     Paragon          ~1500 bytes
c     Cray T3D         ~1500 bytes
c     TCP/IP networks   256-1024 bytes
c
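c For example, the default parameters below (initsamplesize=125,
c samplesizeinc=125, sizedouble=8) realize case 3 on a machine with a
c ~1500 byte packet size: the first sample is 125*8 = 1000 bytes and
c every later sample is at least 250*8 = 2000 bytes.  As an untested
c illustration only (not part of the original settings), case 1 could
c be forced on such a machine with something like
c       parameter(initsamplesize=25)
c       parameter(samplesizeinc=4)
c which keeps even the largest of the 40 samples at 181*8 = 1448 bytes.
c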
c samples = the number of data points collected
      integer samples
      parameter(samples=40)

c initsamplesize = # of elements transmitted in 1st sample
      integer initsamplesize
      parameter(initsamplesize=125)

c samplesizeinc = sample size increase per iteration (linear rate)
      integer samplesizeinc
      parameter(samplesizeinc=125)
c     parameter(samplesizeinc=1)

c msgspersample = the number of ping-pong exchanges per sample
      integer msgspersample
c     parameter(msgspersample=1000)
      parameter(msgspersample=100)

c The buffer array contains the message, while x(i) is the message
c size and y(i) the corresponding measured one-way average time.
c Note that buffer is a double precision array.
c
c ibufcount = total number of elements in buffer
      integer ibufcount
      parameter(ibufcount=(initsamplesize+((samples-1)*samplesizeinc)))
c
      double precision buffer(ibufcount)
      double precision x(samples), y(samples)
      double precision t1, t2
      double precision a, b, bandw
      double precision sumx, sumy, sumxx, sumxy
      double precision det, fit

      integer stat(MPI_STATUS_SIZE)
      integer ierr, ierr1, ierr2
      integer nodenum, numprocs
      integer idest
      integer i, iter, sample
      integer num
c
c =========================== begin ===========================
c
      call MPI_INIT( ierr )
      call MPI_COMM_RANK( MPI_COMM_WORLD, nodenum, ierr )
      call MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )

      if (numprocs .ne. 2) then
         write (6,*) 'This program is only valid for 2 processors'
         write (6,*) 'numprocs = ', numprocs
         stop
      endif

c Put something into the array
      do 2 i=1,ibufcount
         buffer(i) = dfloat(i)
    2 continue

      if (nodenum .eq. 0) then
         write (6,*) ' MPI pong test'
         write (6,*) ' samples = ', samples
         write (6,*) ' initsamplesize = ', initsamplesize
         write (6,*) ' samplesizeinc = ', samplesizeinc
         write (6,*) ' msgspersample = ', msgspersample
         write (6,*) ' ibufcount = ', ibufcount
         write (6,98) MPI_Wtick()
         write (6,*)
      endif
   98 format (' clock resolution = ',e12.5)

      call MPI_BARRIER(MPI_COMM_WORLD, ierr)
c
c =========================== main loop ===========================
c
c Start main loop - iterate twice to generate two complete sets of
c timings

      do 60 iter = 1,2

        do 40 sample = 1,samples

          num = initsamplesize + ((sample-1)*samplesizeinc)

c debug
          write (6,99) nodenum, iter, sample, num

          call MPI_BARRIER(MPI_COMM_WORLD, ierr)

   99     format ( 1x, 'PE = ', i1, ', iter = ',i1,
     +             ', sample = ', i3, ', num = ', i5 )

c Find initial elapsed time in seconds

          if (nodenum .eq. 0) then

c Send message from node 0 to 1 and receive message from 1

            idest = 1
            t1 = MPI_Wtime()
            do 20 i = 1,msgspersample
              call MPI_SEND(buffer, num, MPI_DOUBLE_PRECISION,
     +             idest, 0, MPI_COMM_WORLD, ierr1)
              call MPI_RECV(buffer, num, MPI_DOUBLE_PRECISION,
     +             MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD,
     +             stat, ierr2)
   20       continue
            t2 = MPI_Wtime()

          else

c Send message from node 1 to 0 and receive message from 0

            idest = 0
            t1 = MPI_Wtime()
            do 21 i = 1,msgspersample
              call MPI_RECV(buffer, num, MPI_DOUBLE_PRECISION,
     +             MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD,
     +             stat, ierr2)
              call MPI_SEND(buffer, num, MPI_DOUBLE_PRECISION,
     +             idest, 0, MPI_COMM_WORLD, ierr1)
   21       continue
            t2 = MPI_Wtime()

          endif

c independent variable is message length:
          x(sample) = dfloat(num * sizedouble)

c dependent variable is average one-way transit time:
          y(sample) = ((t2 - t1) * 0.5) /
     +                dfloat(msgspersample)

   40   continue

c now do linear least squares fit to the data:  time = a + b*x
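c
c For reference, the statements below are the usual closed-form
c normal-equation solution for a least-squares line through the
c n = samples points (x(i), y(i)):
c
c     det = n*sum(x*x) - sum(x)*sum(x)
c     a   = ( sum(x*x)*sum(y) - sum(x)*sum(x*y) ) / det
c     b   = ( n*sum(x*y)      - sum(x)*sum(y)   ) / det
c
c Since x is in bytes and y in seconds, the factor of 1.d6 in the code
c converts a to microseconds and b to secs/Mbyte, so 1/b is the
c bandwidth in Mbytes/sec.
c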
        if (nodenum .eq. 0) then

          sumy  = 0.d0
          sumx  = 0.d0
          sumxy = 0.d0
          sumxx = 0.d0

          do 45 i=1,samples
            sumx  = sumx  + x(i)
            sumy  = sumy  + y(i)
            sumxy = sumxy + ( x(i) * y(i) )
            sumxx = sumxx + ( x(i) * x(i) )
   45     continue

          det = (dfloat(samples) * sumxx) - (sumx * sumx)
          a = (1.d6 * ((sumxx * sumy) - (sumx * sumxy))) / det
          b = (1.d6 * ((dfloat(samples) * sumxy) - (sumx * sumy))) / det

          write(6,*)
          write(6,*) ' iter = ', iter
          write(6,*)
          write(6,*) ' least squares fit: time = a + b * (msg length)'
          write(6,200) a
          write(6,300) b
          bandw = 1./b
          write(6,400) bandw
          write(6,*)
          write(6,*) '   message       observed        fitted'
          write(6,*) ' length(bytes)  time(usec)     time(usec)'
          write(6,*)

          do 50 i=1,samples
            fit = a + b*x(i)
            y(i) = y(i)*1.d6
            write(6,100) x(i),y(i),fit
   50     continue

        endif

   60 continue
c
c =========================== end loop ===========================
c
  100 format(3x,f8.0,5x,f12.2,5x,f12.2)
  200 format(5x,'a = latency = ',f8.2,' microseconds')
  300 format(5x,'b = inverse bandwidth = ',f8.5,' secs/Mbyte')
  400 format(5x,'1/b = bandwidth = ',f8.2,' Mbytes/sec')
c
c =========================== end program ===========================
c
      call MPI_FINALIZE(ierr)

      end