1 !-------------------------------------------------------------------------!
3 ! N A S P A R A L L E L B E N C H M A R K S 3.3 !
7 !-------------------------------------------------------------------------!
9 ! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
10 ! It is described in NAS Technical Reports 95-020 and 02-007 !
12 ! Permission to use, copy, distribute and modify this software !
13 ! for any purpose with or without fee is hereby granted. We !
14 ! request, however, that all derived work reference the NAS !
15 ! Parallel Benchmarks 3.3. This software is provided "as is" !
16 ! without express or implied warranty. !
18 ! Information on NPB 3.3, including the technical report, the !
19 ! original specifications, source code, results and information !
20 ! on how to submit new results, is available at: !
22 ! http://www.nas.nasa.gov/Software/NPB/ !
24 ! Send comments or suggestions to npb@nas.nasa.gov !
26 ! NAS Parallel Benchmarks Group !
27 ! NASA Ames Research Center !
29 ! Moffett Field, CA 94035-1000 !
31 ! E-mail: npb@nas.nasa.gov !
32 ! Fax: (650) 604-3957 !
34 !-------------------------------------------------------------------------!
37 c---------------------------------------------------------------------
39 c Authors: P. O. Frederickson
42 c R. F. Van der Wijngaart
43 c---------------------------------------------------------------------
45 c---------------------------------------------------------------------
47 c---------------------------------------------------------------------
49 c This is the MPI version of the APP Benchmark 1,
50 c the "embarrassingly parallel" benchmark.
53 c M is the Log_2 of the number of complex pairs of uniform (0, 1) random
54 c numbers. MK is the Log_2 of the size of each batch of uniform random
55 c numbers. MK can be set for convenience on a given system, since it does
56 c not affect the results.
63 double precision Mops, epsilon, a, s, t1, t2, t3, t4, x, x1,
64 > x2, q, sx, sy, tm, an, tt, gc, dum(3),
66 double precision sx_verify_value, sy_verify_value, sx_err, sy_err
67 integer mk, mm, nn, nk, nq, np, ierr, node, no_nodes,
68 > i, ik, kk, l, k, nit, ierrcode, no_large_nodes,
70 logical verified, timers_enabled
71 parameter (timers_enabled = .false.)
72 external randlc, timer_read
73 double precision randlc, qq
76 parameter (mk = 16, mm = m - mk, nn = 2 ** mm,
77 > nk = 2 ** mk, nq = 10, epsilon=1.d-8,
78 > a = 1220703125.d0, s = 271828183.d0)
80 common/storage/ x(2*nk), q(0:nq-1), qq(10000)
81 data dum /1.d0, 1.d0, 1.d0/
84 call mpi_comm_rank(MPI_COMM_WORLD,node,ierr)
85 call mpi_comm_size(MPI_COMM_WORLD,no_nodes,ierr)
89 if (.not. convertdouble) then
90 dp_type = MPI_DOUBLE_PRECISION
95 if (node.eq.root) then
97 c Because the size of the problem is too large to store in a 32-bit
98 c integer for some classes, we put it into a string (for printing).
99 c Have to strip off the decimal point put in there by the floating
100 c point print statement (internal file)
103 write(size, '(f15.0)' ) 2.d0**(m+1)
105 if (size(j:j) .eq. '.') j = j - 1
106 write (*,1001) size(1:j)
107 write(*, 1003) no_nodes
109 1000 format(/,' NAS Parallel Benchmarks 3.3 -- EP Benchmark',/)
110 1001 format(' Number of random numbers generated: ', a15)
111 1003 format(' Number of active processes: ', 2x, i13, /)
117 c Compute the number of "batches" of random number pairs generated
118 c per processor. Adjust if the number of processors does not evenly
119 c divide the total number
122 no_large_nodes = mod(nn, no_nodes)
123 if (node .lt. no_large_nodes) then
131 write (6, 1) no_nodes, nn
132 1 format ('Too many nodes:',2i6)
133 call mpi_abort(MPI_COMM_WORLD,ierrcode,ierr)
137 c Call the random number generator functions and initialize
138 c the x-array to reduce the effects of paging on the timings.
139 c Also, call all mathematical functions that are used. Make
140 c sure these initializations cannot be eliminated as dead code.
142 call vranlc(0, dum(1), dum(2), dum(3))
143 dum(1) = randlc(dum(2), dum(3))
147 Mops = log(sqrt(abs(max(1.d0,1.d0))))
149 c---------------------------------------------------------------------
150 c Synchronize before placing time stamp
151 c---------------------------------------------------------------------
152 call mpi_barrier(MPI_COMM_WORLD, ierr)
160 call vranlc(0, t1, a, x)
162 c Compute AN = A ^ (2 * NK) (mod 2^46).
180 c Each instance of this loop may be performed independently. We compute
181 c the k offsets separately to take into account the fact that some nodes
182 c have more numbers to generate than others
184 if (np_add .eq. 1) then
185 k_offset = node * np -1
187 k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1
195 c Find starting seed t1 for this kk.
199 if (2 * ik .ne. kk) t3 = randlc(t1, t2)
200 if (ik .eq. 0) goto 130
205 c Compute uniform pseudorandom numbers.
208 if (timers_enabled) call timer_start(3)
209 call vranlc(2 * nk, t1, a, x)
210 if (timers_enabled) call timer_stop(3)
212 c Compute Gaussian deviates by acceptance-rejection method and
213 c tally counts in concentric square annuli. This loop is not
216 if (timers_enabled) call timer_start(2)
219 x1 = 2.d0 * x(2*i-1) - 1.d0
220 x2 = 2.d0 * x(2*i) - 1.d0
221 t1 = x1 ** 2 + x2 ** 2
222 if (t1 .le. 1.d0) then
223 t2 = sqrt(-2.d0 * log(t1) / t1)
226 l = max(abs(t3), abs(t4))
233 if (timers_enabled) call timer_stop(2)
237 call mpi_allreduce(sx, x, 1, dp_type,
238 > MPI_SUM, MPI_COMM_WORLD, ierr)
240 call mpi_allreduce(sy, x, 1, dp_type,
241 > MPI_SUM, MPI_COMM_WORLD, ierr)
243 call mpi_allreduce(q, x, nq, dp_type,
244 > MPI_SUM, MPI_COMM_WORLD, ierr)
257 call mpi_allreduce(tm, x, 1, dp_type,
258 > MPI_MAX, MPI_COMM_WORLD, ierr)
261 if (node.eq.root) then
265 sx_verify_value = -3.247834652034740D+3
266 sy_verify_value = -6.958407078382297D+3
267 elseif (m.eq.25) then
268 sx_verify_value = -2.863319731645753D+3
269 sy_verify_value = -6.320053679109499D+3
270 elseif (m.eq.28) then
271 sx_verify_value = -4.295875165629892D+3
272 sy_verify_value = -1.580732573678431D+4
273 elseif (m.eq.30) then
274 sx_verify_value = 4.033815542441498D+4
275 sy_verify_value = -2.660669192809235D+4
276 elseif (m.eq.32) then
277 sx_verify_value = 4.764367927995374D+4
278 sy_verify_value = -8.084072988043731D+4
279 elseif (m.eq.36) then
280 sx_verify_value = 1.982481200946593D+5
281 sy_verify_value = -1.020596636361769D+5
282 elseif (m.eq.40) then
283 sx_verify_value = -5.319717441530D+05
284 sy_verify_value = -3.688834557731D+05
289 sx_err = abs((sx - sx_verify_value)/sx_verify_value)
290 sy_err = abs((sy - sy_verify_value)/sy_verify_value)
291 verified = ((sx_err.le.epsilon) .and. (sy_err.le.epsilon))
293 Mops = 2.d0**(m+1)/tm/1000000.d0
295 write (6,11) tm, m, gc, sx, sy, (i, q(i), i = 0, nq - 1)
296 11 format ('EP Benchmark Results:'//'CPU Time =',f10.4/'N = 2^',
297 > i5/'No. Gaussian Pairs =',f15.0/'Sums = ',1p,2d25.15/
298 > 'Counts:'/(i3,0p,f15.0))
300 call print_results('EP', class, m+1, 0, 0, nit, npm,
301 > no_nodes, tm, Mops,
302 > 'Random numbers generated',
303 > verified, npbversion, compiletime, cs1,
304 > cs2, cs3, cs4, cs5, cs6, cs7)
308 if (timers_enabled .and. (node .eq. root)) then
309 print *, 'Total time: ', timer_read(1)
310 print *, 'Gaussian pairs: ', timer_read(2)
311 print *, 'Random numbers: ', timer_read(3)
314 call mpi_finalize(ierr)