1 !-------------------------------------------------------------------------!
3 ! N A S P A R A L L E L B E N C H M A R K S 3.3 !
7 !-------------------------------------------------------------------------!
9 ! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
10 ! It is described in NAS Technical Reports 95-020 and 02-007 !
12 ! Permission to use, copy, distribute and modify this software !
13 ! for any purpose with or without fee is hereby granted. We !
14 ! request, however, that all derived work reference the NAS !
15 ! Parallel Benchmarks 3.3. This software is provided "as is" !
16 ! without express or implied warranty. !
18 ! Information on NPB 3.3, including the technical report, the !
19 ! original specifications, source code, results and information !
20 ! on how to submit new results, is available at: !
22 ! http://www.nas.nasa.gov/Software/NPB/ !
24 ! Send comments or suggestions to npb@nas.nasa.gov !
26 ! NAS Parallel Benchmarks Group !
27 ! NASA Ames Research Center !
29 ! Moffett Field, CA 94035-1000 !
31 ! E-mail: npb@nas.nasa.gov !
32 ! Fax: (650) 604-3957 !
34 !-------------------------------------------------------------------------!
37 c---------------------------------------------------------------------
39 c Authors: P. O. Frederickson
42 c R. F. Van der Wijngaart
43 c---------------------------------------------------------------------
45 c---------------------------------------------------------------------
47 c---------------------------------------------------------------------
49 c This is the MPI version of the APP Benchmark 1,
50 c the "embarrassingly parallel" benchmark.
53 c M is the Log_2 of the number of complex pairs of uniform (0, 1) random
54 c numbers. MK is the Log_2 of the size of each batch of uniform random
55 c numbers. MK can be set for convenience on a given system, since it does
56 c not affect the results.
63 double precision Mops, epsilon, a, s, t1, t2, t3, t4, x, x1,
64 > x2, q, sx, sy, tm, an, tt, gc, dum(3),
66 double precision sx_verify_value, sy_verify_value, sx_err, sy_err
67 integer mk, mm, nn, nk, nq, np, ierr, node, no_nodes,
68 > i, ik, kk, l, k, nit, ierrcode, no_large_nodes,
70 logical verified, timers_enabled
71 parameter (timers_enabled = .false.)
72 external randlc, timer_read
73 double precision randlc, qq
76 parameter (mk = 16, mm = m - mk, nn = 2 ** mm,
77 > nk = 2 ** mk, nq = 10, epsilon=1.d-8,
78 > a = 1220703125.d0, s = 271828183.d0)
80 common/storage/ x(2*nk), q(0:nq-1), qq(10000)
81 data dum /1.d0, 1.d0, 1.d0/
84 call mpi_comm_rank(MPI_COMM_WORLD,node,ierr)
85 call mpi_comm_size(MPI_COMM_WORLD,no_nodes,ierr)
89 if (.not. convertdouble) then
90 dp_type = MPI_DOUBLE_PRECISION
95 if (node.eq.root) then
97 c Because the size of the problem is too large to store in a 32-bit
98 c integer for some classes, we put it into a string (for printing).
99 c Have to strip off the decimal point put in there by the floating
100 c point print statement (internal file)
103 write(size, '(f15.0)' ) 2.d0**(m+1)
105 if (size(j:j) .eq. '.') j = j - 1
106 write (*,1001) size(1:j)
107 write(*, 1003) no_nodes
109 1000 format(/,' NAS Parallel Benchmarks 3.3 -- EP Benchmark',/)
110 1001 format(' Number of random numbers generated: ', a15)
111 1003 format(' Number of active processes: ', 2x, i13, /)
117 c Compute the number of "batches" of random number pairs generated
118 c per processor. Adjust if the number of processors does not evenly
119 c divide the total number
122 no_large_nodes = mod(nn, no_nodes)
123 if (node .lt. no_large_nodes) then
131 write (6, 1) no_nodes, nn
132 1 format ('Too many nodes:',2i6)
133 call mpi_abort(MPI_COMM_WORLD,ierrcode,ierr)
137 c Call the random number generator functions and initialize
138 c the x-array to reduce the effects of paging on the timings.
139 c Also, call all mathematical functions that are used. Make
140 c sure these initializations cannot be eliminated as dead code.
142 call vranlc(0, dum(1), dum(2), dum(3))
143 dum(1) = randlc(dum(2), dum(3))
147 Mops = log(sqrt(abs(max(1.d0,1.d0))))
149 c---------------------------------------------------------------------
150 c Synchronize before placing time stamp
151 c---------------------------------------------------------------------
152 call mpi_barrier(MPI_COMM_WORLD, ierr)
160 call vranlc(0, t1, a, x)
162 c Compute AN = A ^ (2 * NK) (mod 2^46).
180 c Each instance of this loop may be performed independently. We compute
181 c the k offsets separately to take into account the fact that some nodes
182 c have more numbers to generate than others
184 if (np_add .eq. 1) then
185 k_offset = node * np -1
187 k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1
195 c Find starting seed t1 for this kk.
199 if (2 * ik .ne. kk) t3 = randlc(t1, t2)
200 if (ik .eq. 0) goto 130
205 c Compute uniform pseudorandom numbers.
208 if (timers_enabled) call timer_start(3)
209 call vranlc(2 * nk, t1, a, x)
210 if (timers_enabled) call timer_stop(3)
212 c Compute Gaussian deviates by acceptance-rejection method and
213 c tally counts in concentric square annuli. This loop is not
216 if (timers_enabled) call timer_start(2)
219 x1 = 2.d0 * x(2*i-1) - 1.d0
220 x2 = 2.d0 * x(2*i) - 1.d0
221 t1 = x1 ** 2 + x2 ** 2
222 if (t1 .le. 1.d0) then
223 t2 = sqrt(-2.d0 * log(t1) / t1)
226 l = max(abs(t3), abs(t4))
233 if (timers_enabled) call timer_stop(2)
237 call mpi_allreduce(sx, x, 1, dp_type,
238 > MPI_SUM, MPI_COMM_WORLD, ierr)
240 call mpi_allreduce(sy, x, 1, dp_type,
241 > MPI_SUM, MPI_COMM_WORLD, ierr)
243 call mpi_allreduce(q, x, nq, dp_type,
244 > MPI_SUM, MPI_COMM_WORLD, ierr)
257 call mpi_allreduce(tm, x, 1, dp_type,
258 > MPI_MAX, MPI_COMM_WORLD, ierr)
261 if (node.eq.root) then
265 sx_verify_value = -3.247834652034740D+3
266 sy_verify_value = -6.958407078382297D+3
267 elseif (m.eq.25) then
268 sx_verify_value = -2.863319731645753D+3
269 sy_verify_value = -6.320053679109499D+3
270 elseif (m.eq.28) then
271 sx_verify_value = -4.295875165629892D+3
272 sy_verify_value = -1.580732573678431D+4
273 elseif (m.eq.30) then
274 sx_verify_value = 4.033815542441498D+4
275 sy_verify_value = -2.660669192809235D+4
276 elseif (m.eq.32) then
277 sx_verify_value = 4.764367927995374D+4
278 sy_verify_value = -8.084072988043731D+4
279 elseif (m.eq.36) then
280 sx_verify_value = 1.982481200946593D+5
281 sy_verify_value = -1.020596636361769D+5
282 elseif (m.eq.40) then
283 sx_verify_value = -5.319717441530D+05
284 sy_verify_value = -3.688834557731D+05
289 sx_err = abs((sx - sx_verify_value)/sx_verify_value)
290 sy_err = abs((sy - sy_verify_value)/sy_verify_value)
291 verified = ((sx_err.le.epsilon) .and. (sy_err.le.epsilon))
293 Mops = 2.d0**(m+1)/tm/1000000.d0
295 write (6,11) tm, m, gc, sx, sy, (i, q(i), i = 0, nq - 1)
296 11 format ('EP Benchmark Results:'//'CPU Time =',f10.4/'N = 2^',
297 > i5/'No. Gaussian Pairs =',f15.0/'Sums = ',1p,2d25.15/
298 > 'Counts:'/(i3,0p,f15.0))
300 call print_results('EP', class, m+1, 0, 0, nit, npm,
301 > no_nodes, tm, Mops,
302 > 'Random numbers generated',
303 > verified, npbversion, compiletime, cs1,
304 > cs2, cs3, cs4, cs5, cs6, cs7)
308 if (timers_enabled .and. (node .eq. root)) then
309 print *, 'Total time: ', timer_read(1)
310 print *, 'Gaussian pairs: ', timer_read(2)
311 print *, 'Random numbers: ', timer_read(3)
314 call mpi_finalize(ierr)