1 !-------------------------------------------------------------------------!
3 ! N A S P A R A L L E L B E N C H M A R K S 3.3 !
7 !-------------------------------------------------------------------------!
9 ! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
10 ! It is described in NAS Technical Reports 95-020 and 02-007 !
12 ! Permission to use, copy, distribute and modify this software !
13 ! for any purpose with or without fee is hereby granted. We !
14 ! request, however, that all derived work reference the NAS !
15 ! Parallel Benchmarks 3.3. This software is provided "as is" !
16 ! without express or implied warranty. !
18 ! Information on NPB 3.3, including the technical report, the !
19 ! original specifications, source code, results and information !
20 ! on how to submit new results, is available at: !
22 ! http://www.nas.nasa.gov/Software/NPB/ !
24 ! Send comments or suggestions to npb@nas.nasa.gov !
26 ! NAS Parallel Benchmarks Group !
27 ! NASA Ames Research Center !
29 ! Moffett Field, CA 94035-1000 !
31 ! E-mail: npb@nas.nasa.gov !
32 ! Fax: (650) 604-3957 !
34 !-------------------------------------------------------------------------!
36 !TO REDUCE THE AMOUNT OF MEMORY REQUIRED BY THE BENCHMARK WE NO LONGER
37 !STORE THE ENTIRE TIME EVOLUTION ARRAY "EX" FOR ALL TIME STEPS, BUT
38 !JUST FOR THE FIRST. ALSO, IT IS STORED ONLY FOR THE PART OF THE GRID
39 !FOR WHICH THE CALLING PROCESSOR IS RESPONSIBLE, SO THAT THE MEMORY
40 !USAGE BECOMES SCALABLE. THIS NEW ARRAY IS CALLED "TWIDDLE" (SEE
43 !TO AVOID PROBLEMS WITH VERY LARGE ARRAY SIZES THAT ARE COMPUTED BY
44 !MULTIPLYING GRID DIMENSIONS (CAUSING INTEGER OVERFLOW IN THE VARIABLE
45 !NTOTAL) AND SUBSEQUENTLY DIVIDING BY THE NUMBER OF PROCESSORS, WE
46 !COMPUTE THE SIZE OF ARRAY PARTITIONS MORE CONSERVATIVELY AS
47 !((NX*NY)/NP)*NZ, WHERE NX, NY, AND NZ ARE GRID DIMENSIONS AND NP IS
48 !THE NUMBER OF PROCESSORS, THE RESULT IS STORED IN "NTDIVNP". FOR THE
49 !PERFORMANCE CALCULATION WE STORE THE TOTAL NUMBER OF GRID POINTS IN A
50 !FLOATING POINT NUMBER "NTOTAL_F" INSTEAD OF AN INTEGER.
51 !THIS FIX WILL FAIL IF THE NUMBER OF PROCESSORS IS SMALL.
53 !UGLY HACK OF SUBROUTINE IPOW46: FOR VERY LARGE GRIDS THE SINGLE EXPONENT
54 !FROM NPB2.3 MAY NOT FIT IN A 32-BIT INTEGER. HOWEVER, WE KNOW THAT THE
55 !"EXPONENT" ARGUMENT OF THIS ROUTINE CAN ALWAYS BE FACTORED INTO A TERM
56 !DIVISIBLE BY NX (EXP_1) AND ANOTHER TERM (EXP_2). NX IS USUALLY A POWER
57 !OF TWO, SO WE CAN KEEP HALVING IT UNTIL THE PRODUCT OF EXP_1
58 !AND EXP_2 IS SMALL ENOUGH (NAMELY EXP_2 ITSELF). THIS UPDATED VERSION
59 !OF IPWO46, WHICH NOW TAKES THE TWO FACTORS OF "EXPONENT" AS SEPARATE
60 !ARGUMENTS, MAY BREAK DOWN IF EXP_1 DOES NOT CONTAIN A LARGE POWER OF TWO.
62 c---------------------------------------------------------------------
66 c R. F. Van der Wijngaart
68 c---------------------------------------------------------------------
70 c---------------------------------------------------------------------
72 c---------------------------------------------------------------------
74 c---------------------------------------------------------------------
76 c---------------------------------------------------------------------
77 c---------------------------------------------------------------------
81 c---------------------------------------------------------------------
82 c---------------------------------------------------------------------
90 c---------------------------------------------------------------------
91 c u0, u1, u2 are the main arrays in the problem.
92 c Depending on the decomposition, these arrays will have different
93 c dimensions. To accomodate all possibilities, we allocate them as
94 c one-dimensional arrays and pass them to subroutines for different
96 c - u0 contains the initial (transformed) initial condition
97 c - u1 and u2 are working arrays
98 c---------------------------------------------------------------------
100 double complex u0(ntdivnp),
103 double precision twiddle(ntdivnp)
104 c---------------------------------------------------------------------
105 c Large arrays are in common so that they are allocated on the
106 c heap rather than the stack. This common block is not
107 c referenced directly anywhere else. Padding is to avoid accidental
108 c cache problems, since all array sizes are powers of two.
109 c---------------------------------------------------------------------
111 double complex pad1(3), pad2(3), pad3(3)
112 common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
115 double precision total_time, mflops
121 c---------------------------------------------------------------------
122 c Run the entire problem once to make sure all data is touched.
123 c This reduces variable startup costs, which is important for such a
124 c short benchmark. The other NPB 2 implementations are similar.
125 c---------------------------------------------------------------------
131 call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
132 call compute_initial_conditions(u1, dims(1,1), dims(2,1),
134 call fft_init (dims(1,1))
137 c---------------------------------------------------------------------
138 c Start over from the beginning. Note that all operations must
139 c be timed, in contrast to other benchmarks.
140 c---------------------------------------------------------------------
144 call MPI_Barrier(MPI_COMM_WORLD, ierr)
146 call timer_start(T_total)
147 if (timers_enabled) call timer_start(T_setup)
149 call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
150 call compute_initial_conditions(u1, dims(1,1), dims(2,1),
152 call fft_init (dims(1,1))
154 if (timers_enabled) call synchup()
155 if (timers_enabled) call timer_stop(T_setup)
157 if (timers_enabled) call timer_start(T_fft)
159 if (timers_enabled) call timer_stop(T_fft)
162 if (timers_enabled) call timer_start(T_evolve)
163 call evolve(u0, u1, twiddle, dims(1,1), dims(2,1), dims(3,1))
164 if (timers_enabled) call timer_stop(T_evolve)
165 if (timers_enabled) call timer_start(T_fft)
167 if (timers_enabled) call timer_stop(T_fft)
168 if (timers_enabled) call synchup()
169 if (timers_enabled) call timer_start(T_checksum)
170 call checksum(iter, u2, dims(1,1), dims(2,1), dims(3,1))
171 if (timers_enabled) call timer_stop(T_checksum)
174 call verify(nx, ny, nz, niter, verified, class)
175 call timer_stop(t_total)
176 if (np .ne. np_min) verified = .false.
177 total_time = timer_read(t_total)
179 if( total_time .ne. 0. ) then
180 mflops = 1.0d-6*ntotal_f *
181 > (14.8157+7.19641*log(ntotal_f)
182 > + (5.23518+7.21113*log(ntotal_f))*niter)
188 call print_results('FT', class, nx, ny, nz, niter, np_min, np,
189 > total_time, mflops, ' floating point', verified,
190 > npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
192 if (timers_enabled) call print_timers()
193 call MPI_Finalize(ierr)
196 c---------------------------------------------------------------------
197 c---------------------------------------------------------------------
199 subroutine evolve(u0, u1, twiddle, d1, d2, d3)
201 c---------------------------------------------------------------------
202 c---------------------------------------------------------------------
204 c---------------------------------------------------------------------
205 c evolve u0 -> u1 (t time steps) in fourier space
206 c---------------------------------------------------------------------
212 double complex u0(d1,d2,d3)
213 double complex u1(d1,d2,d3)
214 double precision twiddle(d1,d2,d3)
220 u0(i,j,k) = u0(i,j,k)*(twiddle(i,j,k))
221 u1(i,j,k) = u0(i,j,k)
230 c---------------------------------------------------------------------
231 c---------------------------------------------------------------------
233 subroutine compute_initial_conditions(u0, d1, d2, d3)
235 c---------------------------------------------------------------------
236 c---------------------------------------------------------------------
238 c---------------------------------------------------------------------
239 c Fill in array u0 with initial conditions from
240 c random number generator
241 c---------------------------------------------------------------------
245 double complex u0(d1, d2, d3)
247 double precision x0, start, an, dummy
249 c---------------------------------------------------------------------
250 c 0-D and 1-D layouts are easy because each processor gets a contiguous
251 c chunk of the array, in the Fortran ordering sense.
252 c For a 2-D layout, it's a bit more complicated. We always
253 c have entire x-lines (contiguous) in processor.
254 c We can do ny/np1 of them at a time since we have
255 c ny/np1 contiguous in y-direction. But then we jump
256 c by z-planes (nz/np2 of them, total).
257 c For the 0-D and 1-D layouts we could do larger chunks, but
258 c this turns out to have no measurable impact on performance.
259 c---------------------------------------------------------------------
263 c---------------------------------------------------------------------
264 c Jump to the starting element for our first plane.
265 c---------------------------------------------------------------------
266 call ipow46(a, 2*nx, (zstart(1)-1)*ny + (ystart(1)-1), an)
267 dummy = randlc(start, an)
268 call ipow46(a, 2*nx, ny, an)
270 c---------------------------------------------------------------------
271 c Go through by z planes filling in one square at a time.
272 c---------------------------------------------------------------------
273 do k = 1, dims(3, 1) ! nz/np2
275 call vranlc(2*nx*dims(2, 1), x0, a, u0(1, 1, k))
276 if (k .ne. dims(3, 1)) dummy = randlc(start, an)
283 c---------------------------------------------------------------------
284 c---------------------------------------------------------------------
286 subroutine ipow46(a, exp_1, exp_2, result)
288 c---------------------------------------------------------------------
289 c---------------------------------------------------------------------
291 c---------------------------------------------------------------------
292 c compute a^exponent mod 2^46
293 c---------------------------------------------------------------------
296 double precision a, result, dummy, q, r
297 integer exp_1, exp_2, n, n2, ierr
299 double precision randlc
301 c---------------------------------------------------------------------
303 c a^n = a^(n/2)*a^(n/2) if n even else
304 c a^n = a*a^(n-1) if n odd
305 c---------------------------------------------------------------------
307 if (exp_2 .eq. 0 .or. exp_1 .eq. 0) return
315 if (n2 * 2 .eq. n) then
326 if (n2 * 2 .eq. n) then
340 c---------------------------------------------------------------------
341 c---------------------------------------------------------------------
345 c---------------------------------------------------------------------
346 c---------------------------------------------------------------------
352 integer ierr, i, j, fstatus
355 call MPI_Comm_size(MPI_COMM_WORLD, np, ierr)
356 call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
358 if (.not. convertdouble) then
359 dc_type = MPI_DOUBLE_COMPLEX
361 dc_type = MPI_COMPLEX
367 open (unit=2,file='inputft.data',status='old', iostat=fstatus)
369 if (fstatus .eq. 0) then
371 233 format(' Reading from input file inputft.data')
373 read (2,*) layout_type
377 c---------------------------------------------------------------------
378 c check to make sure input data is consistent
379 c---------------------------------------------------------------------
382 c---------------------------------------------------------------------
383 c 1. product of processor grid dims must equal number of processors
384 c---------------------------------------------------------------------
386 if (np1 * np2 .ne. np) then
388 238 format(' np1 and np2 given in input file are not valid.')
389 write(*, 239) np1*np2, np
390 239 format(' Product is ', i5, ' and should be ', i5)
391 call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
394 c---------------------------------------------------------------------
395 c 2. layout type must be valid
396 c---------------------------------------------------------------------
398 if (layout_type .ne. layout_0D .and.
399 > layout_type .ne. layout_1D .and.
400 > layout_type .ne. layout_2D) then
402 240 format(' Layout type specified in inputft.data is
404 call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
407 c---------------------------------------------------------------------
408 c 3. 0D layout must be 1x1 grid
409 c---------------------------------------------------------------------
411 if (layout_type .eq. layout_0D .and.
412 > (np1 .ne.1 .or. np2 .ne. 1)) then
414 241 format(' For 0D layout, both np1 and np2 must be 1 ')
415 call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
417 c---------------------------------------------------------------------
418 c 4. 1D layout must be 1xN grid
419 c---------------------------------------------------------------------
421 if (layout_type .eq. layout_1D .and. np1 .ne. 1) then
423 242 format(' For 1D layout, np1 must be 1 ')
424 call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
429 niter = niter_default
433 layout_type = layout_0D
434 else if (np .le. nz) then
437 layout_type = layout_1D
441 layout_type = layout_2D
445 if (np .lt. np_min) then
447 10 format(' Error: Compiled for ', I5, ' processors. ')
449 11 format(' Only ', i5, ' processors found ')
450 call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
453 234 format(' No input file inputft.data. Using compiled defaults')
454 write(*, 1001) nx, ny, nz
457 write(*, 1005) np1, np2
458 if (np .ne. np_min) write(*, 1006) np_min
460 if (layout_type .eq. layout_0D) then
462 else if (layout_type .eq. layout_1D) then
468 1000 format(//,' NAS Parallel Benchmarks 3.3 -- FT Benchmark',/)
469 1001 format(' Size : ', i4, 'x', i4, 'x', i4)
470 1002 format(' Iterations : ', 7x, i7)
471 1004 format(' Number of processes : ', 7x, i7)
472 1005 format(' Processor array : ', 5x, i4, 'x', i4)
473 1006 format(' WARNING: compiled for ', i5, ' processes. ',
474 > ' Will not verify. ')
475 1010 format(' Layout type : ', 9x, A5)
479 c---------------------------------------------------------------------
480 c Since np1, np2 and layout_type are in a common block,
481 c this sends all three.
482 c---------------------------------------------------------------------
483 call MPI_BCAST(np1, 3, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
484 call MPI_BCAST(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
486 if (np1 .eq. 1 .and. np2 .eq. 1) then
487 layout_type = layout_0D
488 else if (np1 .eq. 1) then
489 layout_type = layout_1D
491 layout_type = layout_2D
494 if (layout_type .eq. layout_0D) then
500 else if (layout_type .eq. layout_1D) then
512 else if (layout_type .eq. layout_2D) then
527 dims(2, i) = dims(2, i) / np1
528 dims(3, i) = dims(3, i) / np2
532 c---------------------------------------------------------------------
533 c Determine processor coordinates of this processor
534 c Processor grid is np1xnp2.
535 c Arrays are always (n1, n2/np1, n3/np2)
536 c Processor coords are zero-based.
537 c---------------------------------------------------------------------
538 me2 = mod(me, np2) ! goes from 0...np2-1
539 me1 = me/np2 ! goes from 0...np1-1
540 c---------------------------------------------------------------------
541 c Communicators for rows/columns of processor grid.
542 c commslice1 is communicator of all procs with same me1, ranked as me2
543 c commslice2 is communicator of all procs with same me2, ranked as me1
544 c mpi_comm_split(comm, color, key, ...)
545 c---------------------------------------------------------------------
546 call MPI_Comm_split(MPI_COMM_WORLD, me1, me2, commslice1, ierr)
547 call MPI_Comm_split(MPI_COMM_WORLD, me2, me1, commslice2, ierr)
548 if (timers_enabled) call synchup()
550 if (debug) print *, 'proc coords: ', me, me1, me2
552 c---------------------------------------------------------------------
553 c Determine which section of the grid is owned by this
555 c---------------------------------------------------------------------
556 if (layout_type .eq. layout_0d) then
567 else if (layout_type .eq. layout_1d) then
573 zstart(1) = 1 + me2 * nz/np2
574 zend(1) = (me2+1) * nz/np2
580 zstart(2) = 1 + me2 * nz/np2
581 zend(2) = (me2+1) * nz/np2
585 ystart(3) = 1 + me2 * ny/np2
586 yend(3) = (me2+1) * ny/np2
590 else if (layout_type .eq. layout_2d) then
594 ystart(1) = 1 + me1 * ny/np1
595 yend(1) = (me1+1) * ny/np1
596 zstart(1) = 1 + me2 * nz/np2
597 zend(1) = (me2+1) * nz/np2
599 xstart(2) = 1 + me1 * nx/np1
600 xend(2) = (me1+1)*nx/np1
603 zstart(2) = zstart(1)
606 xstart(3) = xstart(2)
608 ystart(3) = 1 + me2 *ny/np2
609 yend(3) = (me2+1)*ny/np2
614 c---------------------------------------------------------------------
615 c Set up info for blocking of ffts and transposes. This improves
616 c performance on cache-based systems. Blocking involves
617 c working on a chunk of the problem at a time, taking chunks
618 c along the first, second, or third dimension.
620 c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
621 c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
623 c Since 1st dim is always in processor, we'll assume it's long enough
624 c (default blocking factor is 16 so min size for 1st dim is 16)
625 c The only case we have to worry about is cffts1 in a 2d decomposition.
626 c so the blocking factor should not be larger than the 2nd dimension.
627 c---------------------------------------------------------------------
629 fftblock = fftblock_default
630 fftblockpad = fftblockpad_default
632 if (layout_type .eq. layout_2d) then
633 if (dims(2, 1) .lt. fftblock) fftblock = dims(2, 1)
634 if (dims(2, 2) .lt. fftblock) fftblock = dims(2, 2)
635 if (dims(2, 3) .lt. fftblock) fftblock = dims(2, 3)
638 if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3
644 c---------------------------------------------------------------------
645 c---------------------------------------------------------------------
647 subroutine compute_indexmap(twiddle, d1, d2, d3)
649 c---------------------------------------------------------------------
650 c---------------------------------------------------------------------
652 c---------------------------------------------------------------------
653 c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
654 c for time evolution exponent.
655 c---------------------------------------------------------------------
661 integer i, j, k, ii, ii2, jj, ij2, kk
662 double precision ap, twiddle(d1, d2, d3)
664 c---------------------------------------------------------------------
665 c this function is very different depending on whether
666 c we are in the 0d, 1d or 2d layout. Compute separately.
667 c basically we want to convert the fortran indices
670 c 0 1 2 3 -4 -3 -2 -1
671 c The following magic formula does the trick:
672 c mod(i-1+n/2, n) - n/2
673 c---------------------------------------------------------------------
675 ap = - 4.d0 * alpha * pi *pi
677 if (layout_type .eq. layout_0d) then ! xyz layout
679 ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
682 jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
685 kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
686 twiddle(i,j,k) = dexp(ap*dfloat(kk*kk+ij2))
690 else if (layout_type .eq. layout_1d) then ! zxy layout
692 ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
695 jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
698 kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
699 twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
703 else if (layout_type .eq. layout_2d) then ! zxy layout
705 ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
708 jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
711 kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
712 twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
717 print *, ' Unknown layout type ', layout_type
726 c---------------------------------------------------------------------
727 c---------------------------------------------------------------------
729 subroutine print_timers()
731 c---------------------------------------------------------------------
732 c---------------------------------------------------------------------
737 character*25 tstrings(T_max)
738 data tstrings / ' total ',
746 > ' transpose1_loc ',
747 > ' transpose1_glo ',
748 > ' transpose1_fin ',
749 > ' transpose2_loc ',
750 > ' transpose2_glo ',
751 > ' transpose2_fin ',
754 if (me .ne. 0) return
756 if (timer_read(i) .ne. 0.0d0) then
757 write(*, 100) i, tstrings(i), timer_read(i)
760 100 format(' timer ', i2, '(', A16, ') :', F10.6)
766 c---------------------------------------------------------------------
767 c---------------------------------------------------------------------
769 subroutine fft(dir, x1, x2)
771 c---------------------------------------------------------------------
772 c---------------------------------------------------------------------
777 double complex x1(ntdivnp), x2(ntdivnp)
779 double complex scratch(fftblockpad_default*maxdim*2)
781 c---------------------------------------------------------------------
782 c note: args x1, x2 must be different arrays
783 c note: args for cfftsx are (direction, layout, xin, xout, scratch)
784 c xin/xout may be the same and it can be somewhat faster
786 c note: args for transpose are (layout1, layout2, xin, xout)
787 c xin/xout must be different
788 c---------------------------------------------------------------------
791 if (layout_type .eq. layout_0d) then
792 call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
794 call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
796 call cffts3(1, dims(1,3), dims(2,3), dims(3,3),
798 else if (layout_type .eq. layout_1d) then
799 call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
801 call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
803 if (timers_enabled) call timer_start(T_transpose)
804 call transpose_xy_z(2, 3, x1, x2)
805 if (timers_enabled) call timer_stop(T_transpose)
806 call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
808 else if (layout_type .eq. layout_2d) then
809 call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
811 if (timers_enabled) call timer_start(T_transpose)
812 call transpose_x_y(1, 2, x1, x2)
813 if (timers_enabled) call timer_stop(T_transpose)
814 call cffts1(1, dims(1,2), dims(2,2), dims(3,2),
816 if (timers_enabled) call timer_start(T_transpose)
817 call transpose_x_z(2, 3, x2, x1)
818 if (timers_enabled) call timer_stop(T_transpose)
819 call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
823 if (layout_type .eq. layout_0d) then
824 call cffts3(-1, dims(1,3), dims(2,3), dims(3,3),
826 call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
828 call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
830 else if (layout_type .eq. layout_1d) then
831 call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
833 if (timers_enabled) call timer_start(T_transpose)
834 call transpose_x_yz(3, 2, x1, x2)
835 if (timers_enabled) call timer_stop(T_transpose)
836 call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
838 call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
840 else if (layout_type .eq. layout_2d) then
841 call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
843 if (timers_enabled) call timer_start(T_transpose)
844 call transpose_x_z(3, 2, x1, x2)
845 if (timers_enabled) call timer_stop(T_transpose)
846 call cffts1(-1, dims(1,2), dims(2,2), dims(3,2),
848 if (timers_enabled) call timer_start(T_transpose)
849 call transpose_x_y(2, 1, x2, x1)
850 if (timers_enabled) call timer_stop(T_transpose)
851 call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
860 c---------------------------------------------------------------------
861 c---------------------------------------------------------------------
863 subroutine cffts1(is, d1, d2, d3, x, xout, y)
865 c---------------------------------------------------------------------
866 c---------------------------------------------------------------------
871 integer is, d1, d2, d3, logd1
872 double complex x(d1,d2,d3)
873 double complex xout(d1,d2,d3)
874 double complex y(fftblockpad, d1, 2)
880 do jj = 0, d2 - fftblock, fftblock
881 if (timers_enabled) call timer_start(T_fftcopy)
884 y(j,i,1) = x(i,j+jj,k)
887 if (timers_enabled) call timer_stop(T_fftcopy)
889 if (timers_enabled) call timer_start(T_fftlow)
890 call cfftz (is, logd1, d1, y, y(1,1,2))
891 if (timers_enabled) call timer_stop(T_fftlow)
893 if (timers_enabled) call timer_start(T_fftcopy)
896 xout(i,j+jj,k) = y(j,i,1)
899 if (timers_enabled) call timer_stop(T_fftcopy)
907 c---------------------------------------------------------------------
908 c---------------------------------------------------------------------
910 subroutine cffts2(is, d1, d2, d3, x, xout, y)
912 c---------------------------------------------------------------------
913 c---------------------------------------------------------------------
918 integer is, d1, d2, d3, logd2
919 double complex x(d1,d2,d3)
920 double complex xout(d1,d2,d3)
921 double complex y(fftblockpad, d2, 2)
927 do ii = 0, d1 - fftblock, fftblock
928 if (timers_enabled) call timer_start(T_fftcopy)
931 y(i,j,1) = x(i+ii,j,k)
934 if (timers_enabled) call timer_stop(T_fftcopy)
936 if (timers_enabled) call timer_start(T_fftlow)
937 call cfftz (is, logd2, d2, y, y(1, 1, 2))
938 if (timers_enabled) call timer_stop(T_fftlow)
940 if (timers_enabled) call timer_start(T_fftcopy)
943 xout(i+ii,j,k) = y(i,j,1)
946 if (timers_enabled) call timer_stop(T_fftcopy)
954 c---------------------------------------------------------------------
955 c---------------------------------------------------------------------
957 subroutine cffts3(is, d1, d2, d3, x, xout, y)
959 c---------------------------------------------------------------------
960 c---------------------------------------------------------------------
965 integer is, d1, d2, d3, logd3
966 double complex x(d1,d2,d3)
967 double complex xout(d1,d2,d3)
968 double complex y(fftblockpad, d3, 2)
974 do ii = 0, d1 - fftblock, fftblock
975 if (timers_enabled) call timer_start(T_fftcopy)
978 y(i,k,1) = x(i+ii,j,k)
981 if (timers_enabled) call timer_stop(T_fftcopy)
983 if (timers_enabled) call timer_start(T_fftlow)
984 call cfftz (is, logd3, d3, y, y(1, 1, 2))
985 if (timers_enabled) call timer_stop(T_fftlow)
987 if (timers_enabled) call timer_start(T_fftcopy)
990 xout(i+ii,j,k) = y(i,k,1)
993 if (timers_enabled) call timer_stop(T_fftcopy)
1001 c---------------------------------------------------------------------
1002 c---------------------------------------------------------------------
1004 subroutine fft_init (n)
1006 c---------------------------------------------------------------------
1007 c---------------------------------------------------------------------
1009 c---------------------------------------------------------------------
1010 c compute the roots-of-unity array that will be used for subsequent FFTs.
1011 c---------------------------------------------------------------------
1016 integer m,n,nu,ku,i,j,ln
1017 double precision t, ti
1020 c---------------------------------------------------------------------
1021 c Initialize the U array with sines and cosines in a manner that permits
1022 c stride one access at each FFT iteration.
1023 c---------------------------------------------------------------------
1035 u(i+ku) = dcmplx (cos (ti), sin(ti))
1045 c---------------------------------------------------------------------
1046 c---------------------------------------------------------------------
1048 subroutine cfftz (is, m, n, x, y)
1050 c---------------------------------------------------------------------
1051 c---------------------------------------------------------------------
1053 c---------------------------------------------------------------------
1054 c Computes NY N-point complex-to-complex FFTs of X using an algorithm due
1055 c to Swarztrauber. X is both the input and the output array, while Y is a
1056 c scratch array. It is assumed that N = 2^M. Before calling CFFTZ to
1057 c perform FFTs, the array U must be initialized by calling CFFTZ with IS
1058 c set to 0 and M set to MX, where MX is the maximum value of M for any
1060 c---------------------------------------------------------------------
1065 integer is,m,n,i,j,l,mx
1068 dimension x(fftblockpad,n), y(fftblockpad,n)
1070 c---------------------------------------------------------------------
1071 c Check if input parameters are invalid.
1072 c---------------------------------------------------------------------
1074 if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx)
1076 write (*, 1) is, m, mx
1077 1 format ('CFFTZ: Either U has not been initialized, or else'/
1078 > 'one of the input parameters is invalid', 3I5)
1082 c---------------------------------------------------------------------
1083 c Perform one variant of the Stockham FFT.
1084 c---------------------------------------------------------------------
1086 call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
1087 if (l .eq. m) goto 160
1088 call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
1093 c---------------------------------------------------------------------
1095 c---------------------------------------------------------------------
1107 c---------------------------------------------------------------------
1108 c---------------------------------------------------------------------
1110 subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
1112 c---------------------------------------------------------------------
1113 c---------------------------------------------------------------------
1115 c---------------------------------------------------------------------
1116 c Performs the L-th iteration of the second variant of the Stockham FFT.
1117 c---------------------------------------------------------------------
1121 integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
1122 double complex u,x,y,u1,x11,x21
1123 dimension u(n), x(ny1,n), y(ny1,n)
1126 c---------------------------------------------------------------------
1127 c Set initial parameters.
1128 c---------------------------------------------------------------------
1144 u1 = dconjg (u(ku+i))
1147 c---------------------------------------------------------------------
1148 c This loop is vectorizable.
1149 c---------------------------------------------------------------------
1154 y(j,i21+k) = x11 + x21
1155 y(j,i22+k) = u1 * (x11 - x21)
1163 c---------------------------------------------------------------------
1166 c---------------------------------------------------------------------
1167 c---------------------------------------------------------------------
1169 integer function ilog2(n)
1171 c---------------------------------------------------------------------
1172 c---------------------------------------------------------------------
1182 do while (nn .lt. n)
1191 c---------------------------------------------------------------------
1192 c---------------------------------------------------------------------
1194 subroutine transpose_x_yz(l1, l2, xin, xout)
1196 c---------------------------------------------------------------------
1197 c---------------------------------------------------------------------
1202 double complex xin(ntdivnp), xout(ntdivnp)
1204 call transpose2_local(dims(1,l1),dims(2, l1)*dims(3, l1),
1207 call transpose2_global(xout, xin)
1209 call transpose2_finish(dims(1,l1),dims(2, l1)*dims(3, l1),
1216 c---------------------------------------------------------------------
1217 c---------------------------------------------------------------------
1219 subroutine transpose_xy_z(l1, l2, xin, xout)
1221 c---------------------------------------------------------------------
1222 c---------------------------------------------------------------------
1227 double complex xin(ntdivnp), xout(ntdivnp)
1229 call transpose2_local(dims(1,l1)*dims(2, l1),dims(3, l1),
1231 call transpose2_global(xout, xin)
1232 call transpose2_finish(dims(1,l1)*dims(2, l1),dims(3, l1),
1240 c---------------------------------------------------------------------
1241 c---------------------------------------------------------------------
1243 subroutine transpose2_local(n1, n2, xin, xout)
1245 c---------------------------------------------------------------------
1246 c---------------------------------------------------------------------
1252 double complex xin(n1, n2), xout(n2, n1)
1254 double complex z(transblockpad, transblock)
1256 integer i, j, ii, jj
1258 if (timers_enabled) call timer_start(T_transxzloc)
1260 c---------------------------------------------------------------------
1261 c If possible, block the transpose for cache memory systems.
1262 c How much does this help? Example: R8000 Power Challenge (90 MHz)
1263 c Blocked version decreases time spend in this routine
1264 c from 14 seconds to 5.2 seconds on 8 nodes class A.
1265 c---------------------------------------------------------------------
1267 if (n1 .lt. transblock .or. n2 .lt. transblock) then
1268 if (n1 .ge. n2) then
1271 xout(j, i) = xin(i, j)
1277 xout(j, i) = xin(i, j)
1282 do j = 0, n2-1, transblock
1283 do i = 0, n1-1, transblock
1285 c---------------------------------------------------------------------
1286 c Note: compiler should be able to take j+jj out of inner loop
1287 c---------------------------------------------------------------------
1288 do jj = 1, transblock
1289 do ii = 1, transblock
1290 z(jj,ii) = xin(i+ii, j+jj)
1294 do ii = 1, transblock
1295 do jj = 1, transblock
1296 xout(j+jj, i+ii) = z(jj,ii)
1303 if (timers_enabled) call timer_stop(T_transxzloc)
1309 c---------------------------------------------------------------------
1310 c---------------------------------------------------------------------
1312 subroutine transpose2_global(xin, xout)
1314 c---------------------------------------------------------------------
1315 c---------------------------------------------------------------------
1320 double complex xin(ntdivnp)
1321 double complex xout(ntdivnp)
1324 if (timers_enabled) call synchup()
1326 if (timers_enabled) call timer_start(T_transxzglo)
1327 call mpi_alltoall(xin, ntdivnp/np, dc_type,
1328 > xout, ntdivnp/np, dc_type,
1330 if (timers_enabled) call timer_stop(T_transxzglo)
1337 c---------------------------------------------------------------------
1338 c---------------------------------------------------------------------
1340 subroutine transpose2_finish(n1, n2, xin, xout)
1342 c---------------------------------------------------------------------
1343 c---------------------------------------------------------------------
1347 integer n1, n2, ioff
1348 double complex xin(n2, n1/np2, 0:np2-1), xout(n2*np2, n1/np2)
1352 if (timers_enabled) call timer_start(T_transxzfin)
1357 xout(i+ioff, j) = xin(i, j, p)
1361 if (timers_enabled) call timer_stop(T_transxzfin)
1367 c---------------------------------------------------------------------
1368 c---------------------------------------------------------------------
1370 subroutine transpose_x_z(l1, l2, xin, xout)
1372 c---------------------------------------------------------------------
1373 c---------------------------------------------------------------------
1378 double complex xin(ntdivnp), xout(ntdivnp)
1380 call transpose_x_z_local(dims(1,l1),dims(2,l1),dims(3,l1),
1382 call transpose_x_z_global(dims(1,l1),dims(2,l1),dims(3,l1),
1384 call transpose_x_z_finish(dims(1,l2),dims(2,l2),dims(3,l2),
1390 c---------------------------------------------------------------------
1391 c---------------------------------------------------------------------
1393 subroutine transpose_x_z_local(d1, d2, d3, xin, xout)
1395 c---------------------------------------------------------------------
1396 c---------------------------------------------------------------------
1401 double complex xin(d1,d2,d3)
1402 double complex xout(d3,d2,d1)
1403 integer block1, block3
1404 integer i, j, k, kk, ii, i1, k1
1406 double complex buf(transblockpad, maxdim)
1407 if (timers_enabled) call timer_start(T_transxzloc)
1408 if (d1 .lt. 32) goto 100
1410 if (block3 .eq. 1) goto 100
1411 if (block3 .gt. transblock) block3 = transblock
1413 if (block1*block3 .gt. transblock*transblock)
1414 > block1 = transblock*transblock/block3
1415 c---------------------------------------------------------------------
1417 c---------------------------------------------------------------------
1419 do kk = 0, d3-block3, block3
1420 do ii = 0, d1-block1, block1
1425 buf(k, i) = xin(i+ii, j, k1)
1432 xout(k+kk, j, i1) = buf(k, i)
1442 c---------------------------------------------------------------------
1444 c---------------------------------------------------------------------
1450 xout(k, j, i) = xin(i, j, k)
1455 c---------------------------------------------------------------------
1457 c---------------------------------------------------------------------
1460 if (timers_enabled) call timer_stop(T_transxzloc)
1465 c---------------------------------------------------------------------
1466 c---------------------------------------------------------------------
1468 subroutine transpose_x_z_global(d1, d2, d3, xin, xout)
1470 c---------------------------------------------------------------------
1471 c---------------------------------------------------------------------
1477 double complex xin(d3,d2,d1)
1478 double complex xout(d3,d2,d1) ! not real layout, but right size
1481 if (timers_enabled) call synchup()
1483 c---------------------------------------------------------------------
1484 c do transpose among all processes with same 1-coord (me1)
1485 c---------------------------------------------------------------------
1486 if (timers_enabled)call timer_start(T_transxzglo)
1487 call mpi_alltoall(xin, d1*d2*d3/np2, dc_type,
1488 > xout, d1*d2*d3/np2, dc_type,
1490 if (timers_enabled) call timer_stop(T_transxzglo)
1495 c---------------------------------------------------------------------
1496 c---------------------------------------------------------------------
1498 subroutine transpose_x_z_finish(d1, d2, d3, xin, xout)
1500 c---------------------------------------------------------------------
1501 c---------------------------------------------------------------------
1506 double complex xin(d1/np2, d2, d3, 0:np2-1)
1507 double complex xout(d1,d2,d3)
1508 integer i, j, k, p, ioff
1509 if (timers_enabled) call timer_start(T_transxzfin)
1510 c---------------------------------------------------------------------
1511 c this is the most straightforward way of doing it. the
1512 c calculation in the inner loop doesn't help.
1518 c xout(ii, j, k) = xin(i, j, k, p)
1523 c---------------------------------------------------------------------
1530 xout(i+ioff, j, k) = xin(i, j, k, p)
1535 if (timers_enabled) call timer_stop(T_transxzfin)
1540 c---------------------------------------------------------------------
1541 c---------------------------------------------------------------------
1543 subroutine transpose_x_y(l1, l2, xin, xout)
1545 c---------------------------------------------------------------------
1546 c---------------------------------------------------------------------
1551 double complex xin(ntdivnp), xout(ntdivnp)
1553 c---------------------------------------------------------------------
1554 c xy transpose is a little tricky, since we don't want
1555 c to touch 3rd axis. But alltoall must involve 3rd axis (most
1556 c slowly varying) to be efficient. So we do
1557 c (nx, ny/np1, nz/np2) -> (ny/np1, nz/np2, nx) (local)
1558 c (ny/np1, nz/np2, nx) -> ((ny/np1*nz/np2)*np1, nx/np1) (global)
1559 c then local finish.
1560 c---------------------------------------------------------------------
1563 call transpose_x_y_local(dims(1,l1),dims(2,l1),dims(3,l1),
1565 call transpose_x_y_global(dims(1,l1),dims(2,l1),dims(3,l1),
1567 call transpose_x_y_finish(dims(1,l2),dims(2,l2),dims(3,l2),
1574 c---------------------------------------------------------------------
1575 c---------------------------------------------------------------------
1577 subroutine transpose_x_y_local(d1, d2, d3, xin, xout)
1579 c---------------------------------------------------------------------
1580 c---------------------------------------------------------------------
1585 double complex xin(d1, d2, d3)
1586 double complex xout(d2, d3, d1)
1588 if (timers_enabled) call timer_start(T_transxyloc)
1593 xout(j,k,i)=xin(i,j,k)
1597 if (timers_enabled) call timer_stop(T_transxyloc)
1602 c---------------------------------------------------------------------
1603 c---------------------------------------------------------------------
1605 subroutine transpose_x_y_global(d1, d2, d3, xin, xout)
1607 c---------------------------------------------------------------------
1608 c---------------------------------------------------------------------
1614 c---------------------------------------------------------------------
1615 c array is in form (ny/np1, nz/np2, nx)
1616 c---------------------------------------------------------------------
1617 double complex xin(d2,d3,d1)
1618 double complex xout(d2,d3,d1) ! not real layout but right size
1621 if (timers_enabled) call synchup()
1623 c---------------------------------------------------------------------
1624 c do transpose among all processes with same 1-coord (me1)
1625 c---------------------------------------------------------------------
1626 if (timers_enabled) call timer_start(T_transxyglo)
1627 call mpi_alltoall(xin, d1*d2*d3/np1, dc_type,
1628 > xout, d1*d2*d3/np1, dc_type,
1630 if (timers_enabled) call timer_stop(T_transxyglo)
1636 c---------------------------------------------------------------------
1637 c---------------------------------------------------------------------
1639 subroutine transpose_x_y_finish(d1, d2, d3, xin, xout)
1641 c---------------------------------------------------------------------
1642 c---------------------------------------------------------------------
1647 double complex xin(d1/np1, d3, d2, 0:np1-1)
1648 double complex xout(d1,d2,d3)
1649 integer i, j, k, p, ioff
1650 if (timers_enabled) call timer_start(T_transxyfin)
1651 c---------------------------------------------------------------------
1652 c this is the most straightforward way of doing it. the
1653 c calculation in the inner loop doesn't help.
1659 c note order is screwy bcz we have (ny/np1, nz/np2, nx) -> (ny, nx/np1, nz/np2)
1660 c xout(ii, j, k) = xin(i, k, j, p)
1665 c---------------------------------------------------------------------
1672 xout(i+ioff, j, k) = xin(i, k, j, p)
1677 if (timers_enabled) call timer_stop(T_transxyfin)
1682 c---------------------------------------------------------------------
1683 c---------------------------------------------------------------------
1685 subroutine checksum(i, u1, d1, d2, d3)
1687 c---------------------------------------------------------------------
1688 c---------------------------------------------------------------------
1693 integer i, d1, d2, d3
1694 double complex u1(d1, d2, d3)
1695 integer j, q,r,s, ierr
1696 double complex chk,allchk
1701 if (q .ge. xstart(1) .and. q .le. xend(1)) then
1703 if (r .ge. ystart(1) .and. r .le. yend(1)) then
1705 if (s .ge. zstart(1) .and. s .le. zend(1)) then
1706 chk=chk+u1(q-xstart(1)+1,r-ystart(1)+1,s-zstart(1)+1)
1713 call MPI_Reduce(chk, allchk, 1, dc_type, MPI_SUM,
1714 > 0, MPI_COMM_WORLD, ierr)
1716 write (*, 30) i, allchk
1717 30 format (' T =',I5,5X,'Checksum =',1P2D22.12)
1721 c If we compute the checksum for diagnostic purposes, we let i be
1722 c negative, so the result will not be stored in an array
1723 if (i .gt. 0) sums(i) = allchk
1728 c---------------------------------------------------------------------
1729 c---------------------------------------------------------------------
1733 c---------------------------------------------------------------------
1734 c---------------------------------------------------------------------
1740 call timer_start(T_synch)
1741 call mpi_barrier(MPI_COMM_WORLD, ierr)
1742 call timer_stop(T_synch)
1747 c---------------------------------------------------------------------
1748 c---------------------------------------------------------------------
1750 subroutine verify (d1, d2, d3, nt, verified, class)
1752 c---------------------------------------------------------------------
1753 c---------------------------------------------------------------------
1758 integer d1, d2, d3, nt
1761 integer ierr, size, i
1762 double precision err, epsilon
1764 c---------------------------------------------------------------------
1765 c Reference checksums
1766 c---------------------------------------------------------------------
1767 double complex csum_ref(25)
1772 if (me .ne. 0) return
1777 if (d1 .eq. 64 .and.
1781 c---------------------------------------------------------------------
1782 c Sample size reference checksums
1783 c---------------------------------------------------------------------
1785 csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
1786 csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
1787 csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
1788 csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
1789 csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
1790 csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
1792 else if (d1 .eq. 128 .and.
1796 c---------------------------------------------------------------------
1797 c Class W size reference checksums
1798 c---------------------------------------------------------------------
1800 csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
1801 csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
1802 csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
1803 csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
1804 csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
1805 csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
1807 else if (d1 .eq. 256 .and.
1811 c---------------------------------------------------------------------
1812 c Class A size reference checksums
1813 c---------------------------------------------------------------------
1815 csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
1816 csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
1817 csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
1818 csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
1819 csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
1820 csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
1822 else if (d1 .eq. 512 .and.
1826 c---------------------------------------------------------------------
1827 c Class B size reference checksums
1828 c---------------------------------------------------------------------
1830 csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02)
1831 csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02)
1832 csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02)
1833 csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02)
1834 csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02)
1835 csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02)
1836 csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02)
1837 csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02)
1838 csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02)
1839 csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
1840 csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
1841 csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
1842 csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
1843 csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
1844 csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
1845 csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
1846 csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
1847 csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
1848 csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
1849 csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
1851 else if (d1 .eq. 512 .and.
1855 c---------------------------------------------------------------------
1856 c Class C size reference checksums
1857 c---------------------------------------------------------------------
1859 csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02)
1860 csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02)
1861 csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02)
1862 csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02)
1863 csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02)
1864 csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02)
1865 csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02)
1866 csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02)
1867 csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02)
1868 csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
1869 csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
1870 csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
1871 csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
1872 csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
1873 csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
1874 csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
1875 csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
1876 csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
1877 csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
1878 csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
1880 else if (d1 .eq. 2048 .and.
1881 > d2 .eq. 1024 .and.
1882 > d3 .eq. 1024 .and.
1884 c---------------------------------------------------------------------
1885 c Class D size reference checksums
1886 c---------------------------------------------------------------------
1888 csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02)
1889 csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02)
1890 csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02)
1891 csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02)
1892 csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02)
1893 csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02)
1894 csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02)
1895 csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02)
1896 csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02)
1897 csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
1898 csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
1899 csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
1900 csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
1901 csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
1902 csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
1903 csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
1904 csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
1905 csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
1906 csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
1907 csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
1908 csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
1909 csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
1910 csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
1911 csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
1912 csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
1914 else if (d1 .eq. 4096 .and.
1915 > d2 .eq. 2048 .and.
1916 > d3 .eq. 2048 .and.
1918 c---------------------------------------------------------------------
1919 c Class E size reference checksums
1920 c---------------------------------------------------------------------
1922 csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02)
1923 csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02)
1924 csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02)
1925 csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02)
1926 csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02)
1927 csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02)
1928 csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02)
1929 csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02)
1930 csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02)
1931 csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
1932 csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
1933 csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
1934 csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
1935 csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
1936 csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
1937 csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
1938 csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
1939 csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
1940 csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
1941 csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
1942 csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
1943 csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
1944 csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
1945 csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
1946 csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
1951 if (class .ne. 'U') then
1954 err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
1955 if (.not.(err .le. epsilon)) goto 100
1962 call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
1963 if (size .ne. np) then
1967 c---------------------------------------------------------------------
1968 c multiple statements because some Fortran compilers have
1969 c problems with long strings.
1970 c---------------------------------------------------------------------
1971 4010 format( ' Warning: benchmark was compiled for ', i5,
1973 4011 format( ' Must be run on this many processors for official',
1975 4012 format( ' so memory access is repeatable')
1979 if (class .ne. 'U') then
1982 2000 format(' Result verification successful')
1985 2001 format(' Result verification failed')
1988 print *, 'class = ', class