examples/smpi/NAS/FT/ft.f

   1 !-------------------------------------------------------------------------!
   2 !                                                                         !
   3 !        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
   4 !                                                                         !
   5 !                                   F T                                   !
   6 !                                                                         !
   7 !-------------------------------------------------------------------------!
   8 !                                                                         !
   9 !    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
  10 !    It is described in NAS Technical Reports 95-020 and 02-007           !
  11 !                                                                         !
  12 !    Permission to use, copy, distribute and modify this software         !
  13 !    for any purpose with or without fee is hereby granted.  We           !
  14 !    request, however, that all derived work reference the NAS            !
  15 !    Parallel Benchmarks 3.3. This software is provided "as is"           !
  16 !    without express or implied warranty.                                 !
  17 !                                                                         !
  18 !    Information on NPB 3.3, including the technical report, the          !
  19 !    original specifications, source code, results and information        !
  20 !    on how to submit new results, is available at:                       !
  21 !                                                                         !
  22 !           http://www.nas.nasa.gov/Software/NPB/                         !
  23 !                                                                         !
  24 !    Send comments or suggestions to  npb@nas.nasa.gov                    !
  25 !                                                                         !
  26 !          NAS Parallel Benchmarks Group                                  !
  27 !          NASA Ames Research Center                                      !
  28 !          Mail Stop: T27A-1                                              !
  29 !          Moffett Field, CA   94035-1000                                 !
  30 !                                                                         !
  31 !          E-mail:  npb@nas.nasa.gov                                      !
  32 !          Fax:     (650) 604-3957                                        !
  33 !                                                                         !
  34 !-------------------------------------------------------------------------!
  35
  36 !TO REDUCE THE AMOUNT OF MEMORY REQUIRED BY THE BENCHMARK WE NO LONGER
  37 !STORE THE ENTIRE TIME EVOLUTION ARRAY "EX" FOR ALL TIME STEPS, BUT
  38 !JUST FOR THE FIRST. ALSO, IT IS STORED ONLY FOR THE PART OF THE GRID
  39 !FOR WHICH THE CALLING PROCESSOR IS RESPONSIBLE, SO THAT THE MEMORY
  40 !USAGE BECOMES SCALABLE. THIS NEW ARRAY IS CALLED "TWIDDLE" (SEE
  41 !NPB3.0-SER)
  42
  43 !TO AVOID PROBLEMS WITH VERY LARGE ARRAY SIZES THAT ARE COMPUTED BY
  44 !MULTIPLYING GRID DIMENSIONS (CAUSING INTEGER OVERFLOW IN THE VARIABLE
  45 !NTOTAL) AND SUBSEQUENTLY DIVIDING BY THE NUMBER OF PROCESSORS, WE
  46 !COMPUTE THE SIZE OF ARRAY PARTITIONS MORE CONSERVATIVELY AS
  47 !((NX*NY)/NP)*NZ, WHERE NX, NY, AND NZ ARE GRID DIMENSIONS AND NP IS
  48 !THE NUMBER OF PROCESSORS, THE RESULT IS STORED IN "NTDIVNP". FOR THE
  49 !PERFORMANCE CALCULATION WE STORE THE TOTAL NUMBER OF GRID POINTS IN A
  50 !FLOATING POINT NUMBER "NTOTAL_F" INSTEAD OF AN INTEGER.
  51 !THIS FIX WILL FAIL IF THE NUMBER OF PROCESSORS IS SMALL.
  52
  53 !UGLY HACK OF SUBROUTINE IPOW46: FOR VERY LARGE GRIDS THE SINGLE EXPONENT
  54 !FROM NPB2.3 MAY NOT FIT IN A 32-BIT INTEGER. HOWEVER, WE KNOW THAT THE
  55 !"EXPONENT" ARGUMENT OF THIS ROUTINE CAN ALWAYS BE FACTORED INTO A TERM
  56 !DIVISIBLE BY NX (EXP_1) AND ANOTHER TERM (EXP_2). NX IS USUALLY A POWER
  57 !OF TWO, SO WE CAN KEEP HALVING IT UNTIL THE PRODUCT OF EXP_1
  58 !AND EXP_2 IS SMALL ENOUGH (NAMELY EXP_2 ITSELF). THIS UPDATED VERSION
  59 !OF IPWO46, WHICH NOW TAKES THE TWO FACTORS OF "EXPONENT" AS SEPARATE
  60 !ARGUMENTS, MAY BREAK DOWN IF EXP_1 DOES NOT CONTAIN A LARGE POWER OF TWO.
  61
  62 c---------------------------------------------------------------------
  63 c
  64 c Authors: D. Bailey
  65 c          W. Saphir
  66 c          R. F. Van der Wijngaart
  67 c
  68 c---------------------------------------------------------------------
  69
  70 c---------------------------------------------------------------------
  71
  72 c---------------------------------------------------------------------
  73 c FT benchmark
  74 c---------------------------------------------------------------------
  75
  76 c---------------------------------------------------------------------
  77 c---------------------------------------------------------------------
  78
  79       program ft
  80
  81 c---------------------------------------------------------------------
  82 c---------------------------------------------------------------------
  83
  84       implicit none
  85
  86       include 'mpif.h'
  87       include 'global.h'
  88       integer i, ierr
  89
  90 c---------------------------------------------------------------------
  91 c u0, u1, u2 are the main arrays in the problem.
  92 c Depending on the decomposition, these arrays will have different
  93 c dimensions. To accomodate all possibilities, we allocate them as
  94 c one-dimensional arrays and pass them to subroutines for different
  95 c views
  96 c  - u0 contains the initial (transformed) initial condition
  97 c  - u1 and u2 are working arrays
  98 c---------------------------------------------------------------------
  99
 100       double complex   u0(ntdivnp),
 101      >                 u1(ntdivnp),
 102      >                 u2(ntdivnp)
 103       double precision twiddle(ntdivnp)
 104 c---------------------------------------------------------------------
 105 c Large arrays are in common so that they are allocated on the
 106 c heap rather than the stack. This common block is not
 107 c referenced directly anywhere else. Padding is to avoid accidental
 108 c cache problems, since all array sizes are powers of two.
 109 c---------------------------------------------------------------------
 110
 111       double complex pad1(3), pad2(3), pad3(3)
 112       common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
 113
 114       integer iter
 115       double precision total_time, mflops
 116       logical verified
 117       character class
 118
 119       call MPI_Init(ierr)
 120
 121 c---------------------------------------------------------------------
 122 c Run the entire problem once to make sure all data is touched.
 123 c This reduces variable startup costs, which is important for such a
 124 c short benchmark. The other NPB 2 implementations are similar.
 125 c---------------------------------------------------------------------
 126       do i = 1, t_max
 127          call timer_clear(i)
 128       end do
 129
 130       call setup()
 131       call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
 132       call compute_initial_conditions(u1, dims(1,1), dims(2,1),
 133      >                                dims(3,1))
 134       call fft_init (dims(1,1))
 135       call fft(1, u1, u0)
 136
 137 c---------------------------------------------------------------------
 138 c Start over from the beginning. Note that all operations must
 139 c be timed, in contrast to other benchmarks.
 140 c---------------------------------------------------------------------
 141       do i = 1, t_max
 142          call timer_clear(i)
 143       end do
 144       call MPI_Barrier(MPI_COMM_WORLD, ierr)
 145
 146       call timer_start(T_total)
 147       if (timers_enabled) call timer_start(T_setup)
 148
 149       call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
 150       call compute_initial_conditions(u1, dims(1,1), dims(2,1),
 151      >                                dims(3,1))
 152       call fft_init (dims(1,1))
 153
 154       if (timers_enabled) call synchup()
 155       if (timers_enabled) call timer_stop(T_setup)
 156
 157       if (timers_enabled) call timer_start(T_fft)
 158       call fft(1, u1, u0)
 159       if (timers_enabled) call timer_stop(T_fft)
 160
 161       do iter = 1, niter
 162          if (timers_enabled) call timer_start(T_evolve)
 163          call evolve(u0, u1, twiddle, dims(1,1), dims(2,1), dims(3,1))
 164          if (timers_enabled) call timer_stop(T_evolve)
 165          if (timers_enabled) call timer_start(T_fft)
 166          call fft(-1, u1, u2)
 167          if (timers_enabled) call timer_stop(T_fft)
 168          if (timers_enabled) call synchup()
 169          if (timers_enabled) call timer_start(T_checksum)
 170          call checksum(iter, u2, dims(1,1), dims(2,1), dims(3,1))
 171          if (timers_enabled) call timer_stop(T_checksum)
 172       end do
 173
 174       call verify(nx, ny, nz, niter, verified, class)
 175       call timer_stop(t_total)
 176       if (np .ne. np_min) verified = .false.
 177       total_time = timer_read(t_total)
 178
 179       if( total_time .ne. 0. ) then
 180          mflops = 1.0d-6*ntotal_f *
 181      >             (14.8157+7.19641*log(ntotal_f)
 182      >          +  (5.23518+7.21113*log(ntotal_f))*niter)
 183      >                 /total_time
 184       else
 185          mflops = 0.0
 186       endif
 187       if (me .eq. 0) then
 188          call print_results('FT', class, nx, ny, nz, niter, np_min, np,
 189      >     total_time, mflops, '          floating point', verified,
 190      >     npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
 191       endif
 192       if (timers_enabled) call print_timers()
 193       call MPI_Finalize(ierr)
 194       end
 195
 196 c---------------------------------------------------------------------
 197 c---------------------------------------------------------------------
 198
 199       subroutine evolve(u0, u1, twiddle, d1, d2, d3)
 200
 201 c---------------------------------------------------------------------
 202 c---------------------------------------------------------------------
 203
 204 c---------------------------------------------------------------------
 205 c evolve u0 -> u1 (t time steps) in fourier space
 206 c---------------------------------------------------------------------
 207
 208       implicit none
 209       include 'global.h'
 210       integer d1, d2, d3
 211       double precision exi
 212       double complex u0(d1,d2,d3)
 213       double complex u1(d1,d2,d3)
 214       double precision twiddle(d1,d2,d3)
 215       integer i, j, k
 216
 217       do k = 1, d3
 218          do j = 1, d2
 219             do i = 1, d1
 220                u0(i,j,k) = u0(i,j,k)*(twiddle(i,j,k))
 221                u1(i,j,k) = u0(i,j,k)
 222             end do
 223          end do
 224       end do
 225
 226       return
 227       end
 228
 229
 230 c---------------------------------------------------------------------
 231 c---------------------------------------------------------------------
 232
 233       subroutine compute_initial_conditions(u0, d1, d2, d3)
 234
 235 c---------------------------------------------------------------------
 236 c---------------------------------------------------------------------
 237
 238 c---------------------------------------------------------------------
 239 c Fill in array u0 with initial conditions from
 240 c random number generator
 241 c---------------------------------------------------------------------
 242       implicit none
 243       include 'global.h'
 244       integer d1, d2, d3
 245       double complex u0(d1, d2, d3)
 246       integer k
 247       double precision x0, start, an, dummy
 248
 249 c---------------------------------------------------------------------
 250 c 0-D and 1-D layouts are easy because each processor gets a contiguous
 251 c chunk of the array, in the Fortran ordering sense.
 252 c For a 2-D layout, it's a bit more complicated. We always
 253 c have entire x-lines (contiguous) in processor.
 254 c We can do ny/np1 of them at a time since we have
 255 c ny/np1 contiguous in y-direction. But then we jump
 256 c by z-planes (nz/np2 of them, total).
 257 c For the 0-D and 1-D layouts we could do larger chunks, but
 258 c this turns out to have no measurable impact on performance.
 259 c---------------------------------------------------------------------
 260
 261
 262       start = seed
 263 c---------------------------------------------------------------------
 264 c Jump to the starting element for our first plane.
 265 c---------------------------------------------------------------------
 266       call ipow46(a, 2*nx, (zstart(1)-1)*ny + (ystart(1)-1), an)
 267       dummy = randlc(start, an)
 268       call ipow46(a, 2*nx, ny, an)
 269
 270 c---------------------------------------------------------------------
 271 c Go through by z planes filling in one square at a time.
 272 c---------------------------------------------------------------------
 273       do k = 1, dims(3, 1) ! nz/np2
 274          x0 = start
 275          call vranlc(2*nx*dims(2, 1), x0, a, u0(1, 1, k))
 276          if (k .ne. dims(3, 1)) dummy = randlc(start, an)
 277       end do
 278
 279       return
 280       end
 281
 282
 283 c---------------------------------------------------------------------
 284 c---------------------------------------------------------------------
 285
 286       subroutine ipow46(a, exp_1, exp_2, result)
 287
 288 c---------------------------------------------------------------------
 289 c---------------------------------------------------------------------
 290
 291 c---------------------------------------------------------------------
 292 c compute a^exponent mod 2^46
 293 c---------------------------------------------------------------------
 294
 295       implicit none
 296       double precision a, result, dummy, q, r
 297       integer exp_1, exp_2, n, n2, ierr
 298       external randlc
 299       double precision randlc
 300       logical  two_pow
 301 c---------------------------------------------------------------------
 302 c Use
 303 c   a^n = a^(n/2)*a^(n/2) if n even else
 304 c   a^n = a*a^(n-1)       if n odd
 305 c---------------------------------------------------------------------
 306       result = 1
 307       if (exp_2 .eq. 0 .or. exp_1 .eq. 0) return
 308       q = a
 309       r = 1
 310       n = exp_1
 311       two_pow = .true.
 312
 313       do while (two_pow)
 314          n2 = n/2
 315          if (n2 * 2 .eq. n) then
 316             dummy = randlc(q, q)
 317             n = n2
 318          else
 319             n = n * exp_2
 320             two_pow = .false.
 321          endif
 322       end do
 323
 324       do while (n .gt. 1)
 325          n2 = n/2
 326          if (n2 * 2 .eq. n) then
 327             dummy = randlc(q, q)
 328             n = n2
 329          else
 330             dummy = randlc(r, q)
 331             n = n-1
 332          endif
 333       end do
 334       dummy = randlc(r, q)
 335       result = r
 336       return
 337       end
 338
 339
 340 c---------------------------------------------------------------------
 341 c---------------------------------------------------------------------
 342
 343       subroutine setup
 344
 345 c---------------------------------------------------------------------
 346 c---------------------------------------------------------------------
 347
 348       implicit none
 349       include 'mpinpb.h'
 350       include 'global.h'
 351
 352       integer ierr, i, j, fstatus
 353       debug = .FALSE.
 354
 355       call MPI_Comm_size(MPI_COMM_WORLD, np, ierr)
 356       call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
 357
 358       if (.not. convertdouble) then
 359          dc_type = MPI_DOUBLE_COMPLEX
 360       else
 361          dc_type = MPI_COMPLEX
 362       endif
 363
 364
 365       if (me .eq. 0) then
 366          write(*, 1000)
 367          open (unit=2,file='inputft.data',status='old', iostat=fstatus)
 368
 369          if (fstatus .eq. 0) then
 370             write(*,233)
 371  233        format(' Reading from input file inputft.data')
 372             read (2,*) niter
 373             read (2,*) layout_type
 374             read (2,*) np1, np2
 375             close(2)
 376
 377 c---------------------------------------------------------------------
 378 c check to make sure input data is consistent
 379 c---------------------------------------------------------------------
 380
 381
 382 c---------------------------------------------------------------------
 383 c 1. product of processor grid dims must equal number of processors
 384 c---------------------------------------------------------------------
 385
 386             if (np1 * np2 .ne. np) then
 387                write(*, 238)
 388  238           format(' np1 and np2 given in input file are not valid.')
 389                write(*, 239) np1*np2, np
 390  239           format(' Product is ', i5, ' and should be ', i5)
 391                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
 392             endif
 393
 394 c---------------------------------------------------------------------
 395 c 2. layout type must be valid
 396 c---------------------------------------------------------------------
 397
 398             if (layout_type .ne. layout_0D .and.
 399      >          layout_type .ne. layout_1D .and.
 400      >          layout_type .ne. layout_2D) then
 401                write(*, 240)
 402  240           format(' Layout type specified in inputft.data is
 403      >                  invalid ')
 404                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
 405             endif
 406
 407 c---------------------------------------------------------------------
 408 c 3. 0D layout must be 1x1 grid
 409 c---------------------------------------------------------------------
 410
 411             if (layout_type .eq. layout_0D .and.
 412      >            (np1 .ne.1 .or. np2 .ne. 1)) then
 413                write(*, 241)
 414  241           format(' For 0D layout, both np1 and np2 must be 1 ')
 415                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
 416             endif
 417 c---------------------------------------------------------------------
 418 c 4. 1D layout must be 1xN grid
 419 c---------------------------------------------------------------------
 420
 421             if (layout_type .eq. layout_1D .and. np1 .ne. 1) then
 422                write(*, 242)
 423  242           format(' For 1D layout, np1 must be 1 ')
 424                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
 425             endif
 426
 427          else
 428             write(*,234)
 429             niter = niter_default
 430             if (np .eq. 1) then
 431                np1 = 1
 432                np2 = 1
 433                layout_type = layout_0D
 434             else if (np .le. nz) then
 435                np1 = 1
 436                np2 = np
 437                layout_type = layout_1D
 438             else
 439                np1 = nz
 440                np2 = np/nz
 441                layout_type = layout_2D
 442             endif
 443          endif
 444
 445          if (np .lt. np_min) then
 446             write(*, 10) np_min
 447  10         format(' Error: Compiled for ', I5, ' processors. ')
 448             write(*, 11) np
 449  11         format(' Only ',  i5, ' processors found ')
 450             call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
 451          endif
 452
 453  234     format(' No input file inputft.data. Using compiled defaults')
 454          write(*, 1001) nx, ny, nz
 455          write(*, 1002) niter
 456          write(*, 1004) np
 457          write(*, 1005) np1, np2
 458          if (np .ne. np_min) write(*, 1006) np_min
 459
 460          if (layout_type .eq. layout_0D) then
 461             write(*, 1010) '0D'
 462          else if (layout_type .eq. layout_1D) then
 463             write(*, 1010) '1D'
 464          else
 465             write(*, 1010) '2D'
 466          endif
 467
 468  1000 format(//,' NAS Parallel Benchmarks 3.3 -- FT Benchmark',/)
 469  1001    format(' Size                : ', i4, 'x', i4, 'x', i4)
 470  1002    format(' Iterations          : ', 7x, i7)
 471  1004    format(' Number of processes : ', 7x, i7)
 472  1005    format(' Processor array     : ', 5x, i4, 'x', i4)
 473  1006    format(' WARNING: compiled for ', i5, ' processes. ',
 474      >          ' Will not verify. ')
 475  1010    format(' Layout type         : ', 9x, A5)
 476       endif
 477
 478
 479 c---------------------------------------------------------------------
 480 c Since np1, np2 and layout_type are in a common block,
 481 c this sends all three.
 482 c---------------------------------------------------------------------
 483       call MPI_BCAST(np1, 3, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
 484       call MPI_BCAST(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
 485
 486       if (np1 .eq. 1 .and. np2 .eq. 1) then
 487         layout_type = layout_0D
 488       else if (np1 .eq. 1) then
 489          layout_type = layout_1D
 490       else
 491          layout_type = layout_2D
 492       endif
 493
 494       if (layout_type .eq. layout_0D) then
 495          do i = 1, 3
 496             dims(1, i) = nx
 497             dims(2, i) = ny
 498             dims(3, i) = nz
 499          end do
 500       else if (layout_type .eq. layout_1D) then
 501          dims(1, 1) = nx
 502          dims(2, 1) = ny
 503          dims(3, 1) = nz
 504
 505          dims(1, 2) = nx
 506          dims(2, 2) = ny
 507          dims(3, 2) = nz
 508
 509          dims(1, 3) = nz
 510          dims(2, 3) = nx
 511          dims(3, 3) = ny
 512       else if (layout_type .eq. layout_2D) then
 513          dims(1, 1) = nx
 514          dims(2, 1) = ny
 515          dims(3, 1) = nz
 516
 517          dims(1, 2) = ny
 518          dims(2, 2) = nx
 519          dims(3, 2) = nz
 520
 521          dims(1, 3) = nz
 522          dims(2, 3) = nx
 523          dims(3, 3) = ny
 524
 525       endif
 526       do i = 1, 3
 527          dims(2, i) = dims(2, i) / np1
 528          dims(3, i) = dims(3, i) / np2
 529       end do
 530
 531
 532 c---------------------------------------------------------------------
 533 c Determine processor coordinates of this processor
 534 c Processor grid is np1xnp2.
 535 c Arrays are always (n1, n2/np1, n3/np2)
 536 c Processor coords are zero-based.
 537 c---------------------------------------------------------------------
 538       me2 = mod(me, np2)  ! goes from 0...np2-1
 539       me1 = me/np2        ! goes from 0...np1-1
 540 c---------------------------------------------------------------------
 541 c Communicators for rows/columns of processor grid.
 542 c commslice1 is communicator of all procs with same me1, ranked as me2
 543 c commslice2 is communicator of all procs with same me2, ranked as me1
 544 c mpi_comm_split(comm, color, key, ...)
 545 c---------------------------------------------------------------------
 546       call MPI_Comm_split(MPI_COMM_WORLD, me1, me2, commslice1, ierr)
 547       call MPI_Comm_split(MPI_COMM_WORLD, me2, me1, commslice2, ierr)
 548       if (timers_enabled) call synchup()
 549
 550       if (debug) print *, 'proc coords: ', me, me1, me2
 551
 552 c---------------------------------------------------------------------
 553 c Determine which section of the grid is owned by this
 554 c processor.
 555 c---------------------------------------------------------------------
 556       if (layout_type .eq. layout_0d) then
 557
 558          do i = 1, 3
 559             xstart(i) = 1
 560             xend(i)   = nx
 561             ystart(i) = 1
 562             yend(i)   = ny
 563             zstart(i) = 1
 564             zend(i)   = nz
 565          end do
 566
 567       else if (layout_type .eq. layout_1d) then
 568
 569          xstart(1) = 1
 570          xend(1)   = nx
 571          ystart(1) = 1
 572          yend(1)   = ny
 573          zstart(1) = 1 + me2 * nz/np2
 574          zend(1)   = (me2+1) * nz/np2
 575
 576          xstart(2) = 1
 577          xend(2)   = nx
 578          ystart(2) = 1
 579          yend(2)   = ny
 580          zstart(2) = 1 + me2 * nz/np2
 581          zend(2)   = (me2+1) * nz/np2
 582
 583          xstart(3) = 1
 584          xend(3)   = nx
 585          ystart(3) = 1 + me2 * ny/np2
 586          yend(3)   = (me2+1) * ny/np2
 587          zstart(3) = 1
 588          zend(3)   = nz
 589
 590       else if (layout_type .eq. layout_2d) then
 591
 592          xstart(1) = 1
 593          xend(1)   = nx
 594          ystart(1) = 1 + me1 * ny/np1
 595          yend(1)   = (me1+1) * ny/np1
 596          zstart(1) = 1 + me2 * nz/np2
 597          zend(1)   = (me2+1) * nz/np2
 598
 599          xstart(2) = 1 + me1 * nx/np1
 600          xend(2)   = (me1+1)*nx/np1
 601          ystart(2) = 1
 602          yend(2)   = ny
 603          zstart(2) = zstart(1)
 604          zend(2)   = zend(1)
 605
 606          xstart(3) = xstart(2)
 607          xend(3)   = xend(2)
 608          ystart(3) = 1 + me2 *ny/np2
 609          yend(3)   = (me2+1)*ny/np2
 610          zstart(3) = 1
 611          zend(3)   = nz
 612       endif
 613
 614 c---------------------------------------------------------------------
 615 c Set up info for blocking of ffts and transposes.  This improves
 616 c performance on cache-based systems. Blocking involves
 617 c working on a chunk of the problem at a time, taking chunks
 618 c along the first, second, or third dimension.
 619 c
 620 c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
 621 c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
 622
 623 c Since 1st dim is always in processor, we'll assume it's long enough
 624 c (default blocking factor is 16 so min size for 1st dim is 16)
 625 c The only case we have to worry about is cffts1 in a 2d decomposition.
 626 c so the blocking factor should not be larger than the 2nd dimension.
 627 c---------------------------------------------------------------------
 628
 629       fftblock = fftblock_default
 630       fftblockpad = fftblockpad_default
 631
 632       if (layout_type .eq. layout_2d) then
 633          if (dims(2, 1) .lt. fftblock) fftblock = dims(2, 1)
 634          if (dims(2, 2) .lt. fftblock) fftblock = dims(2, 2)
 635          if (dims(2, 3) .lt. fftblock) fftblock = dims(2, 3)
 636       endif
 637
 638       if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3
 639
 640       return
 641       end
 642
 643
 644 c---------------------------------------------------------------------
 645 c---------------------------------------------------------------------
 646
 647       subroutine compute_indexmap(twiddle, d1, d2, d3)
 648
 649 c---------------------------------------------------------------------
 650 c---------------------------------------------------------------------
 651
 652 c---------------------------------------------------------------------
 653 c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
 654 c for time evolution exponent.
 655 c---------------------------------------------------------------------
 656
 657       implicit none
 658       include 'mpinpb.h'
 659       include 'global.h'
 660       integer d1, d2, d3
 661       integer i, j, k, ii, ii2, jj, ij2, kk
 662       double precision ap, twiddle(d1, d2, d3)
 663
 664 c---------------------------------------------------------------------
 665 c this function is very different depending on whether
 666 c we are in the 0d, 1d or 2d layout. Compute separately.
 667 c basically we want to convert the fortran indices
 668 c   1 2 3 4 5 6 7 8
 669 c to
 670 c   0 1 2 3 -4 -3 -2 -1
 671 c The following magic formula does the trick:
 672 c mod(i-1+n/2, n) - n/2
 673 c---------------------------------------------------------------------
 674
 675       ap = - 4.d0 * alpha * pi *pi
 676
 677       if (layout_type .eq. layout_0d) then ! xyz layout
 678          do i = 1, dims(1,3)
 679             ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
 680             ii2 = ii*ii
 681             do j = 1, dims(2,3)
 682                jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
 683                ij2 = jj*jj+ii2
 684                do k = 1, dims(3,3)
 685                   kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
 686                   twiddle(i,j,k) = dexp(ap*dfloat(kk*kk+ij2))
 687                end do
 688             end do
 689          end do
 690       else if (layout_type .eq. layout_1d) then ! zxy layout
 691          do i = 1,dims(2,3)
 692             ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
 693             ii2 = ii*ii
 694             do j = 1,dims(3,3)
 695                jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
 696                ij2 = jj*jj+ii2
 697                do k = 1,dims(1,3)
 698                   kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
 699                   twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
 700                end do
 701             end do
 702          end do
 703       else if (layout_type .eq. layout_2d) then ! zxy layout
 704          do i = 1,dims(2,3)
 705             ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
 706             ii2 = ii*ii
 707             do j = 1, dims(3,3)
 708                jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
 709                ij2 = jj*jj+ii2
 710                do k =1,dims(1,3)
 711                   kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
 712                   twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
 713                end do
 714             end do
 715          end do
 716       else
 717          print *, ' Unknown layout type ', layout_type
 718          stop
 719       endif
 720
 721       return
 722       end
 723
 724
 725
 726 c---------------------------------------------------------------------
 727 c---------------------------------------------------------------------
 728
 729       subroutine print_timers()
 730
 731 c---------------------------------------------------------------------
 732 c---------------------------------------------------------------------
 733
 734       implicit none
 735       integer i
 736       include 'global.h'
 737       character*25 tstrings(T_max)
 738       data tstrings / '          total ',
 739      >                '          setup ',
 740      >                '            fft ',
 741      >                '         evolve ',
 742      >                '       checksum ',
 743      >                '         fftlow ',
 744      >                '        fftcopy ',
 745      >                '      transpose ',
 746      >                ' transpose1_loc ',
 747      >                ' transpose1_glo ',
 748      >                ' transpose1_fin ',
 749      >                ' transpose2_loc ',
 750      >                ' transpose2_glo ',
 751      >                ' transpose2_fin ',
 752      >                '           sync ' /
 753
 754       if (me .ne. 0) return
 755       do i = 1, t_max
 756          if (timer_read(i) .ne. 0.0d0) then
 757             write(*, 100) i, tstrings(i), timer_read(i)
 758          endif
 759       end do
 760  100  format(' timer ', i2, '(', A16,  ') :', F10.6)
 761       return
 762       end
 763
 764
 765
 766 c---------------------------------------------------------------------
 767 c---------------------------------------------------------------------
 768
 769       subroutine fft(dir, x1, x2)
 770
 771 c---------------------------------------------------------------------
 772 c---------------------------------------------------------------------
 773
 774       implicit none
 775       include 'global.h'
 776       integer dir
 777       double complex x1(ntdivnp), x2(ntdivnp)
 778
 779       double complex scratch(fftblockpad_default*maxdim*2)
 780
 781 c---------------------------------------------------------------------
 782 c note: args x1, x2 must be different arrays
 783 c note: args for cfftsx are (direction, layout, xin, xout, scratch)
 784 c       xin/xout may be the same and it can be somewhat faster
 785 c       if they are
 786 c note: args for transpose are (layout1, layout2, xin, xout)
 787 c       xin/xout must be different
 788 c---------------------------------------------------------------------
 789
 790       if (dir .eq. 1) then
 791          if (layout_type .eq. layout_0d) then
 792             call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
 793      >                  x1, x1, scratch)
 794             call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
 795      >                  x1, x1, scratch)
 796             call cffts3(1, dims(1,3), dims(2,3), dims(3,3),
 797      >                  x1, x2, scratch)
 798          else if (layout_type .eq. layout_1d) then
 799             call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
 800      >                  x1, x1, scratch)
 801             call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
 802      >                  x1, x1, scratch)
 803             if (timers_enabled) call timer_start(T_transpose)
 804             call transpose_xy_z(2, 3, x1, x2)
 805             if (timers_enabled) call timer_stop(T_transpose)
 806             call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
 807      >                  x2, x2, scratch)
 808          else if (layout_type .eq. layout_2d) then
 809             call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
 810      >                  x1, x1, scratch)
 811             if (timers_enabled) call timer_start(T_transpose)
 812             call transpose_x_y(1, 2, x1, x2)
 813             if (timers_enabled) call timer_stop(T_transpose)
 814             call cffts1(1, dims(1,2), dims(2,2), dims(3,2),
 815      >                  x2, x2, scratch)
 816             if (timers_enabled) call timer_start(T_transpose)
 817             call transpose_x_z(2, 3, x2, x1)
 818             if (timers_enabled) call timer_stop(T_transpose)
 819             call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
 820      >                  x1, x2, scratch)
 821          endif
 822       else
 823          if (layout_type .eq. layout_0d) then
 824             call cffts3(-1, dims(1,3), dims(2,3), dims(3,3),
 825      >                  x1, x1, scratch)
 826             call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
 827      >                  x1, x1, scratch)
 828             call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
 829      >                  x1, x2, scratch)
 830          else if (layout_type .eq. layout_1d) then
 831             call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
 832      >                  x1, x1, scratch)
 833             if (timers_enabled) call timer_start(T_transpose)
 834             call transpose_x_yz(3, 2, x1, x2)
 835             if (timers_enabled) call timer_stop(T_transpose)
 836             call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
 837      >                  x2, x2, scratch)
 838             call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
 839      >                  x2, x2, scratch)
 840          else if (layout_type .eq. layout_2d) then
 841             call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
 842      >                  x1, x1, scratch)
 843             if (timers_enabled) call timer_start(T_transpose)
 844             call transpose_x_z(3, 2, x1, x2)
 845             if (timers_enabled) call timer_stop(T_transpose)
 846             call cffts1(-1, dims(1,2), dims(2,2), dims(3,2),
 847      >                  x2, x2, scratch)
 848             if (timers_enabled) call timer_start(T_transpose)
 849             call transpose_x_y(2, 1, x2, x1)
 850             if (timers_enabled) call timer_stop(T_transpose)
 851             call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
 852      >                  x1, x2, scratch)
 853          endif
 854       endif
 855       return
 856       end
 857
 858
 859
 860 c---------------------------------------------------------------------
 861 c---------------------------------------------------------------------
 862
 863       subroutine cffts1(is, d1, d2, d3, x, xout, y)
 864
 865 c---------------------------------------------------------------------
 866 c---------------------------------------------------------------------
 867
 868       implicit none
 869
 870       include 'global.h'
 871       integer is, d1, d2, d3, logd1
 872       double complex x(d1,d2,d3)
 873       double complex xout(d1,d2,d3)
 874       double complex y(fftblockpad, d1, 2)
 875       integer i, j, k, jj
 876
 877       logd1 = ilog2(d1)
 878
 879       do k = 1, d3
 880          do jj = 0, d2 - fftblock, fftblock
 881             if (timers_enabled) call timer_start(T_fftcopy)
 882             do j = 1, fftblock
 883                do i = 1, d1
 884                   y(j,i,1) = x(i,j+jj,k)
 885                enddo
 886             enddo
 887             if (timers_enabled) call timer_stop(T_fftcopy)
 888
 889             if (timers_enabled) call timer_start(T_fftlow)
 890             call cfftz (is, logd1, d1, y, y(1,1,2))
 891             if (timers_enabled) call timer_stop(T_fftlow)
 892
 893             if (timers_enabled) call timer_start(T_fftcopy)
 894             do j = 1, fftblock
 895                do i = 1, d1
 896                   xout(i,j+jj,k) = y(j,i,1)
 897                enddo
 898             enddo
 899             if (timers_enabled) call timer_stop(T_fftcopy)
 900          enddo
 901       enddo
 902
 903       return
 904       end
 905
 906
 907 c---------------------------------------------------------------------
 908 c---------------------------------------------------------------------
 909
 910       subroutine cffts2(is, d1, d2, d3, x, xout, y)
 911
 912 c---------------------------------------------------------------------
 913 c---------------------------------------------------------------------
 914
 915       implicit none
 916
 917       include 'global.h'
 918       integer is, d1, d2, d3, logd2
 919       double complex x(d1,d2,d3)
 920       double complex xout(d1,d2,d3)
 921       double complex y(fftblockpad, d2, 2)
 922       integer i, j, k, ii
 923
 924       logd2 = ilog2(d2)
 925
 926       do k = 1, d3
 927         do ii = 0, d1 - fftblock, fftblock
 928            if (timers_enabled) call timer_start(T_fftcopy)
 929            do j = 1, d2
 930               do i = 1, fftblock
 931                  y(i,j,1) = x(i+ii,j,k)
 932               enddo
 933            enddo
 934            if (timers_enabled) call timer_stop(T_fftcopy)
 935
 936            if (timers_enabled) call timer_start(T_fftlow)
 937            call cfftz (is, logd2, d2, y, y(1, 1, 2))
 938            if (timers_enabled) call timer_stop(T_fftlow)
 939
 940            if (timers_enabled) call timer_start(T_fftcopy)
 941            do j = 1, d2
 942               do i = 1, fftblock
 943                  xout(i+ii,j,k) = y(i,j,1)
 944               enddo
 945            enddo
 946            if (timers_enabled) call timer_stop(T_fftcopy)
 947         enddo
 948       enddo
 949
 950       return
 951       end
 952
 953
 954 c---------------------------------------------------------------------
 955 c---------------------------------------------------------------------
 956
 957       subroutine cffts3(is, d1, d2, d3, x, xout, y)
 958
 959 c---------------------------------------------------------------------
 960 c---------------------------------------------------------------------
 961
 962       implicit none
 963
 964       include 'global.h'
 965       integer is, d1, d2, d3, logd3
 966       double complex x(d1,d2,d3)
 967       double complex xout(d1,d2,d3)
 968       double complex y(fftblockpad, d3, 2)
 969       integer i, j, k, ii
 970
 971       logd3 = ilog2(d3)
 972
 973       do j = 1, d2
 974         do ii = 0, d1 - fftblock, fftblock
 975            if (timers_enabled) call timer_start(T_fftcopy)
 976            do k = 1, d3
 977               do i = 1, fftblock
 978                  y(i,k,1) = x(i+ii,j,k)
 979               enddo
 980            enddo
 981            if (timers_enabled) call timer_stop(T_fftcopy)
 982
 983            if (timers_enabled) call timer_start(T_fftlow)
 984            call cfftz (is, logd3, d3, y, y(1, 1, 2))
 985            if (timers_enabled) call timer_stop(T_fftlow)
 986
 987            if (timers_enabled) call timer_start(T_fftcopy)
 988            do k = 1, d3
 989               do i = 1, fftblock
 990                  xout(i+ii,j,k) = y(i,k,1)
 991               enddo
 992            enddo
 993            if (timers_enabled) call timer_stop(T_fftcopy)
 994         enddo
 995       enddo
 996
 997       return
 998       end
 999
1000
1001 c---------------------------------------------------------------------
1002 c---------------------------------------------------------------------
1003
1004       subroutine fft_init (n)
1005
1006 c---------------------------------------------------------------------
1007 c---------------------------------------------------------------------
1008
1009 c---------------------------------------------------------------------
1010 c compute the roots-of-unity array that will be used for subsequent FFTs.
1011 c---------------------------------------------------------------------
1012
1013       implicit none
1014       include 'global.h'
1015
1016       integer m,n,nu,ku,i,j,ln
1017       double precision t, ti
1018
1019
1020 c---------------------------------------------------------------------
1021 c   Initialize the U array with sines and cosines in a manner that permits
1022 c   stride one access at each FFT iteration.
1023 c---------------------------------------------------------------------
1024       nu = n
1025       m = ilog2(n)
1026       u(1) = m
1027       ku = 2
1028       ln = 1
1029
1030       do j = 1, m
1031          t = pi / ln
1032
1033          do i = 0, ln - 1
1034             ti = i * t
1035             u(i+ku) = dcmplx (cos (ti), sin(ti))
1036          enddo
1037
1038          ku = ku + ln
1039          ln = 2 * ln
1040       enddo
1041
1042       return
1043       end
1044
1045 c---------------------------------------------------------------------
1046 c---------------------------------------------------------------------
1047
1048       subroutine cfftz (is, m, n, x, y)
1049
1050 c---------------------------------------------------------------------
1051 c---------------------------------------------------------------------
1052
1053 c---------------------------------------------------------------------
1054 c   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
1055 c   to Swarztrauber.  X is both the input and the output array, while Y is a
1056 c   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
1057 c   perform FFTs, the array U must be initialized by calling CFFTZ with IS
1058 c   set to 0 and M set to MX, where MX is the maximum value of M for any
1059 c   subsequent call.
1060 c---------------------------------------------------------------------
1061
1062       implicit none
1063       include 'global.h'
1064
1065       integer is,m,n,i,j,l,mx
1066       double complex x, y
1067
1068       dimension x(fftblockpad,n), y(fftblockpad,n)
1069
1070 c---------------------------------------------------------------------
1071 c   Check if input parameters are invalid.
1072 c---------------------------------------------------------------------
1073       mx = u(1)
1074       if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx)
1075      >  then
1076         write (*, 1)  is, m, mx
1077  1      format ('CFFTZ: Either U has not been initialized, or else'/
1078      >    'one of the input parameters is invalid', 3I5)
1079         stop
1080       endif
1081
1082 c---------------------------------------------------------------------
1083 c   Perform one variant of the Stockham FFT.
1084 c---------------------------------------------------------------------
1085       do l = 1, m, 2
1086         call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
1087         if (l .eq. m) goto 160
1088         call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
1089       enddo
1090
1091       goto 180
1092
1093 c---------------------------------------------------------------------
1094 c   Copy Y to X.
1095 c---------------------------------------------------------------------
1096  160  do j = 1, n
1097         do i = 1, fftblock
1098           x(i,j) = y(i,j)
1099         enddo
1100       enddo
1101
1102  180  continue
1103
1104       return
1105       end
1106
1107 c---------------------------------------------------------------------
1108 c---------------------------------------------------------------------
1109
1110       subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
1111
1112 c---------------------------------------------------------------------
1113 c---------------------------------------------------------------------
1114
1115 c---------------------------------------------------------------------
1116 c   Performs the L-th iteration of the second variant of the Stockham FFT.
1117 c---------------------------------------------------------------------
1118
1119       implicit none
1120
1121       integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
1122       double complex u,x,y,u1,x11,x21
1123       dimension u(n), x(ny1,n), y(ny1,n)
1124
1125
1126 c---------------------------------------------------------------------
1127 c   Set initial parameters.
1128 c---------------------------------------------------------------------
1129
1130       n1 = n / 2
1131       lk = 2 ** (l - 1)
1132       li = 2 ** (m - l)
1133       lj = 2 * lk
1134       ku = li + 1
1135
1136       do i = 0, li - 1
1137         i11 = i * lk + 1
1138         i12 = i11 + n1
1139         i21 = i * lj + 1
1140         i22 = i21 + lk
1141         if (is .ge. 1) then
1142           u1 = u(ku+i)
1143         else
1144           u1 = dconjg (u(ku+i))
1145         endif
1146
1147 c---------------------------------------------------------------------
1148 c   This loop is vectorizable.
1149 c---------------------------------------------------------------------
1150         do k = 0, lk - 1
1151           do j = 1, ny
1152             x11 = x(j,i11+k)
1153             x21 = x(j,i12+k)
1154             y(j,i21+k) = x11 + x21
1155             y(j,i22+k) = u1 * (x11 - x21)
1156           enddo
1157         enddo
1158       enddo
1159
1160       return
1161       end
1162
1163 c---------------------------------------------------------------------
1164
1165
1166 c---------------------------------------------------------------------
1167 c---------------------------------------------------------------------
1168
1169       integer function ilog2(n)
1170
1171 c---------------------------------------------------------------------
1172 c---------------------------------------------------------------------
1173
1174       implicit none
1175       integer n, nn, lg
1176       if (n .eq. 1) then
1177          ilog2=0
1178          return
1179       endif
1180       lg = 1
1181       nn = 2
1182       do while (nn .lt. n)
1183          nn = nn*2
1184          lg = lg+1
1185       end do
1186       ilog2 = lg
1187       return
1188       end
1189
1190
1191 c---------------------------------------------------------------------
1192 c---------------------------------------------------------------------
1193
1194       subroutine transpose_x_yz(l1, l2, xin, xout)
1195
1196 c---------------------------------------------------------------------
1197 c---------------------------------------------------------------------
1198
1199       implicit none
1200       include 'global.h'
1201       integer l1, l2
1202       double complex xin(ntdivnp), xout(ntdivnp)
1203
1204       call transpose2_local(dims(1,l1),dims(2, l1)*dims(3, l1),
1205      >                          xin, xout)
1206
1207       call transpose2_global(xout, xin)
1208
1209       call transpose2_finish(dims(1,l1),dims(2, l1)*dims(3, l1),
1210      >                          xin, xout)
1211
1212       return
1213       end
1214
1215
1216 c---------------------------------------------------------------------
1217 c---------------------------------------------------------------------
1218
1219       subroutine transpose_xy_z(l1, l2, xin, xout)
1220
1221 c---------------------------------------------------------------------
1222 c---------------------------------------------------------------------
1223
1224       implicit none
1225       include 'global.h'
1226       integer l1, l2
1227       double complex xin(ntdivnp), xout(ntdivnp)
1228
1229       call transpose2_local(dims(1,l1)*dims(2, l1),dims(3, l1),
1230      >                          xin, xout)
1231       call transpose2_global(xout, xin)
1232       call transpose2_finish(dims(1,l1)*dims(2, l1),dims(3, l1),
1233      >                          xin, xout)
1234
1235       return
1236       end
1237
1238
1239
1240 c---------------------------------------------------------------------
1241 c---------------------------------------------------------------------
1242
1243       subroutine transpose2_local(n1, n2, xin, xout)
1244
1245 c---------------------------------------------------------------------
1246 c---------------------------------------------------------------------
1247
1248       implicit none
1249       include 'mpinpb.h'
1250       include 'global.h'
1251       integer n1, n2
1252       double complex xin(n1, n2), xout(n2, n1)
1253
1254       double complex z(transblockpad, transblock)
1255
1256       integer i, j, ii, jj
1257
1258       if (timers_enabled) call timer_start(T_transxzloc)
1259
1260 c---------------------------------------------------------------------
1261 c If possible, block the transpose for cache memory systems.
1262 c How much does this help? Example: R8000 Power Challenge (90 MHz)
1263 c Blocked version decreases time spend in this routine
1264 c from 14 seconds to 5.2 seconds on 8 nodes class A.
1265 c---------------------------------------------------------------------
1266
1267       if (n1 .lt. transblock .or. n2 .lt. transblock) then
1268          if (n1 .ge. n2) then
1269             do j = 1, n2
1270                do i = 1, n1
1271                   xout(j, i) = xin(i, j)
1272                end do
1273             end do
1274          else
1275             do i = 1, n1
1276                do j = 1, n2
1277                   xout(j, i) = xin(i, j)
1278                end do
1279             end do
1280          endif
1281       else
1282          do j = 0, n2-1, transblock
1283             do i = 0, n1-1, transblock
1284
1285 c---------------------------------------------------------------------
1286 c Note: compiler should be able to take j+jj out of inner loop
1287 c---------------------------------------------------------------------
1288                do jj = 1, transblock
1289                   do ii = 1, transblock
1290                      z(jj,ii) = xin(i+ii, j+jj)
1291                   end do
1292                end do
1293
1294                do ii = 1, transblock
1295                   do jj = 1, transblock
1296                      xout(j+jj, i+ii) = z(jj,ii)
1297                   end do
1298                end do
1299
1300             end do
1301          end do
1302       endif
1303       if (timers_enabled) call timer_stop(T_transxzloc)
1304
1305       return
1306       end
1307
1308
1309 c---------------------------------------------------------------------
1310 c---------------------------------------------------------------------
1311
1312       subroutine transpose2_global(xin, xout)
1313
1314 c---------------------------------------------------------------------
1315 c---------------------------------------------------------------------
1316
1317       implicit none
1318       include 'global.h'
1319       include 'mpinpb.h'
1320       double complex xin(ntdivnp)
1321       double complex xout(ntdivnp)
1322       integer ierr
1323
1324       if (timers_enabled) call synchup()
1325
1326       if (timers_enabled) call timer_start(T_transxzglo)
1327       call mpi_alltoall(xin, ntdivnp/np, dc_type,
1328      >                  xout, ntdivnp/np, dc_type,
1329      >                  commslice1, ierr)
1330       if (timers_enabled) call timer_stop(T_transxzglo)
1331
1332       return
1333       end
1334
1335
1336
1337 c---------------------------------------------------------------------
1338 c---------------------------------------------------------------------
1339
1340       subroutine transpose2_finish(n1, n2, xin, xout)
1341
1342 c---------------------------------------------------------------------
1343 c---------------------------------------------------------------------
1344
1345       implicit none
1346       include 'global.h'
1347       integer n1, n2, ioff
1348       double complex xin(n2, n1/np2, 0:np2-1), xout(n2*np2, n1/np2)
1349
1350       integer i, j, p
1351
1352       if (timers_enabled) call timer_start(T_transxzfin)
1353       do p = 0, np2-1
1354          ioff = p*n2
1355          do j = 1, n1/np2
1356             do i = 1, n2
1357                xout(i+ioff, j) = xin(i, j, p)
1358             end do
1359          end do
1360       end do
1361       if (timers_enabled) call timer_stop(T_transxzfin)
1362
1363       return
1364       end
1365
1366
1367 c---------------------------------------------------------------------
1368 c---------------------------------------------------------------------
1369
1370       subroutine transpose_x_z(l1, l2, xin, xout)
1371
1372 c---------------------------------------------------------------------
1373 c---------------------------------------------------------------------
1374
1375       implicit none
1376       include 'global.h'
1377       integer l1, l2
1378       double complex xin(ntdivnp), xout(ntdivnp)
1379
1380       call transpose_x_z_local(dims(1,l1),dims(2,l1),dims(3,l1),
1381      >                         xin, xout)
1382       call transpose_x_z_global(dims(1,l1),dims(2,l1),dims(3,l1),
1383      >                          xout, xin)
1384       call transpose_x_z_finish(dims(1,l2),dims(2,l2),dims(3,l2),
1385      >                          xin, xout)
1386       return
1387       end
1388
1389
1390 c---------------------------------------------------------------------
1391 c---------------------------------------------------------------------
1392
1393       subroutine transpose_x_z_local(d1, d2, d3, xin, xout)
1394
1395 c---------------------------------------------------------------------
1396 c---------------------------------------------------------------------
1397
1398       implicit none
1399       include 'global.h'
1400       integer d1, d2, d3
1401       double complex xin(d1,d2,d3)
1402       double complex xout(d3,d2,d1)
1403       integer block1, block3
1404       integer i, j, k, kk, ii, i1, k1
1405
1406       double complex buf(transblockpad, maxdim)
1407       if (timers_enabled) call timer_start(T_transxzloc)
1408       if (d1 .lt. 32) goto 100
1409       block3 = d3
1410       if (block3 .eq. 1)  goto 100
1411       if (block3 .gt. transblock) block3 = transblock
1412       block1 = d1
1413       if (block1*block3 .gt. transblock*transblock)
1414      >          block1 = transblock*transblock/block3
1415 c---------------------------------------------------------------------
1416 c blocked transpose
1417 c---------------------------------------------------------------------
1418       do j = 1, d2
1419          do kk = 0, d3-block3, block3
1420             do ii = 0, d1-block1, block1
1421
1422                do k = 1, block3
1423                   k1 = k + kk
1424                   do i = 1, block1
1425                      buf(k, i) = xin(i+ii, j, k1)
1426                   end do
1427                end do
1428
1429                do i = 1, block1
1430                   i1 = i + ii
1431                   do k = 1, block3
1432                      xout(k+kk, j, i1) = buf(k, i)
1433                   end do
1434                end do
1435
1436             end do
1437          end do
1438       end do
1439       goto 200
1440
1441
1442 c---------------------------------------------------------------------
1443 c basic transpose
1444 c---------------------------------------------------------------------
1445  100  continue
1446
1447       do j = 1, d2
1448          do k = 1, d3
1449             do i = 1, d1
1450                xout(k, j, i) = xin(i, j, k)
1451             end do
1452          end do
1453       end do
1454
1455 c---------------------------------------------------------------------
1456 c all done
1457 c---------------------------------------------------------------------
1458  200  continue
1459
1460       if (timers_enabled) call timer_stop(T_transxzloc)
1461       return
1462       end
1463
1464
1465 c---------------------------------------------------------------------
1466 c---------------------------------------------------------------------
1467
1468       subroutine transpose_x_z_global(d1, d2, d3, xin, xout)
1469
1470 c---------------------------------------------------------------------
1471 c---------------------------------------------------------------------
1472
1473       implicit none
1474       include 'global.h'
1475       include 'mpinpb.h'
1476       integer d1, d2, d3
1477       double complex xin(d3,d2,d1)
1478       double complex xout(d3,d2,d1) ! not real layout, but right size
1479       integer ierr
1480
1481       if (timers_enabled) call synchup()
1482
1483 c---------------------------------------------------------------------
1484 c do transpose among all  processes with same 1-coord (me1)
1485 c---------------------------------------------------------------------
1486       if (timers_enabled)call timer_start(T_transxzglo)
1487       call mpi_alltoall(xin, d1*d2*d3/np2, dc_type,
1488      >                  xout, d1*d2*d3/np2, dc_type,
1489      >                  commslice1, ierr)
1490       if (timers_enabled) call timer_stop(T_transxzglo)
1491       return
1492       end
1493
1494
1495 c---------------------------------------------------------------------
1496 c---------------------------------------------------------------------
1497
1498       subroutine transpose_x_z_finish(d1, d2, d3, xin, xout)
1499
1500 c---------------------------------------------------------------------
1501 c---------------------------------------------------------------------
1502
1503       implicit none
1504       include 'global.h'
1505       integer d1, d2, d3
1506       double complex xin(d1/np2, d2, d3, 0:np2-1)
1507       double complex xout(d1,d2,d3)
1508       integer i, j, k, p, ioff
1509       if (timers_enabled) call timer_start(T_transxzfin)
1510 c---------------------------------------------------------------------
1511 c this is the most straightforward way of doing it. the
1512 c calculation in the inner loop doesn't help.
1513 c      do i = 1, d1/np2
1514 c         do j = 1, d2
1515 c            do k = 1, d3
1516 c               do p = 0, np2-1
1517 c                  ii = i + p*d1/np2
1518 c                  xout(ii, j, k) = xin(i, j, k, p)
1519 c               end do
1520 c            end do
1521 c         end do
1522 c      end do
1523 c---------------------------------------------------------------------
1524
1525       do p = 0, np2-1
1526          ioff = p*d1/np2
1527          do k = 1, d3
1528             do j = 1, d2
1529                do i = 1, d1/np2
1530                   xout(i+ioff, j, k) = xin(i, j, k, p)
1531                end do
1532             end do
1533          end do
1534       end do
1535       if (timers_enabled) call timer_stop(T_transxzfin)
1536       return
1537       end
1538
1539
1540 c---------------------------------------------------------------------
1541 c---------------------------------------------------------------------
1542
1543       subroutine transpose_x_y(l1, l2, xin, xout)
1544
1545 c---------------------------------------------------------------------
1546 c---------------------------------------------------------------------
1547
1548       implicit none
1549       include 'global.h'
1550       integer l1, l2
1551       double complex xin(ntdivnp), xout(ntdivnp)
1552
1553 c---------------------------------------------------------------------
1554 c xy transpose is a little tricky, since we don't want
1555 c to touch 3rd axis. But alltoall must involve 3rd axis (most
1556 c slowly varying) to be efficient. So we do
1557 c (nx, ny/np1, nz/np2) -> (ny/np1, nz/np2, nx) (local)
1558 c (ny/np1, nz/np2, nx) -> ((ny/np1*nz/np2)*np1, nx/np1) (global)
1559 c then local finish.
1560 c---------------------------------------------------------------------
1561
1562
1563       call transpose_x_y_local(dims(1,l1),dims(2,l1),dims(3,l1),
1564      >                         xin, xout)
1565       call transpose_x_y_global(dims(1,l1),dims(2,l1),dims(3,l1),
1566      >                          xout, xin)
1567       call transpose_x_y_finish(dims(1,l2),dims(2,l2),dims(3,l2),
1568      >                          xin, xout)
1569
1570       return
1571       end
1572
1573
1574 c---------------------------------------------------------------------
1575 c---------------------------------------------------------------------
1576
1577       subroutine transpose_x_y_local(d1, d2, d3, xin, xout)
1578
1579 c---------------------------------------------------------------------
1580 c---------------------------------------------------------------------
1581
1582       implicit none
1583       include 'global.h'
1584       integer d1, d2, d3
1585       double complex xin(d1, d2, d3)
1586       double complex xout(d2, d3, d1)
1587       integer i, j, k
1588       if (timers_enabled) call timer_start(T_transxyloc)
1589
1590       do k = 1, d3
1591          do i = 1, d1
1592             do j = 1, d2
1593                xout(j,k,i)=xin(i,j,k)
1594             end do
1595          end do
1596       end do
1597       if (timers_enabled) call timer_stop(T_transxyloc)
1598       return
1599       end
1600
1601
1602 c---------------------------------------------------------------------
1603 c---------------------------------------------------------------------
1604
1605       subroutine transpose_x_y_global(d1, d2, d3, xin, xout)
1606
1607 c---------------------------------------------------------------------
1608 c---------------------------------------------------------------------
1609
1610       implicit none
1611       include 'global.h'
1612       include 'mpinpb.h'
1613       integer d1, d2, d3
1614 c---------------------------------------------------------------------
1615 c array is in form (ny/np1, nz/np2, nx)
1616 c---------------------------------------------------------------------
1617       double complex xin(d2,d3,d1)
1618       double complex xout(d2,d3,d1) ! not real layout but right size
1619       integer ierr
1620
1621       if (timers_enabled) call synchup()
1622
1623 c---------------------------------------------------------------------
1624 c do transpose among all processes with same 1-coord (me1)
1625 c---------------------------------------------------------------------
1626       if (timers_enabled) call timer_start(T_transxyglo)
1627       call mpi_alltoall(xin, d1*d2*d3/np1, dc_type,
1628      >                  xout, d1*d2*d3/np1, dc_type,
1629      >                  commslice2, ierr)
1630       if (timers_enabled) call timer_stop(T_transxyglo)
1631
1632       return
1633       end
1634
1635
1636 c---------------------------------------------------------------------
1637 c---------------------------------------------------------------------
1638
1639       subroutine transpose_x_y_finish(d1, d2, d3, xin, xout)
1640
1641 c---------------------------------------------------------------------
1642 c---------------------------------------------------------------------
1643
1644       implicit none
1645       include 'global.h'
1646       integer d1, d2, d3
1647       double complex xin(d1/np1, d3, d2, 0:np1-1)
1648       double complex xout(d1,d2,d3)
1649       integer i, j, k, p, ioff
1650       if (timers_enabled) call timer_start(T_transxyfin)
1651 c---------------------------------------------------------------------
1652 c this is the most straightforward way of doing it. the
1653 c calculation in the inner loop doesn't help.
1654 c      do i = 1, d1/np1
1655 c         do j = 1, d2
1656 c            do k = 1, d3
1657 c               do p = 0, np1-1
1658 c                  ii = i + p*d1/np1
1659 c note order is screwy bcz we have (ny/np1, nz/np2, nx) -> (ny, nx/np1, nz/np2)
1660 c                  xout(ii, j, k) = xin(i, k, j, p)
1661 c               end do
1662 c            end do
1663 c         end do
1664 c      end do
1665 c---------------------------------------------------------------------
1666
1667       do p = 0, np1-1
1668          ioff = p*d1/np1
1669          do k = 1, d3
1670             do j = 1, d2
1671                do i = 1, d1/np1
1672                   xout(i+ioff, j, k) = xin(i, k, j, p)
1673                end do
1674             end do
1675          end do
1676       end do
1677       if (timers_enabled) call timer_stop(T_transxyfin)
1678       return
1679       end
1680
1681
1682 c---------------------------------------------------------------------
1683 c---------------------------------------------------------------------
1684
1685       subroutine checksum(i, u1, d1, d2, d3)
1686
1687 c---------------------------------------------------------------------
1688 c---------------------------------------------------------------------
1689
1690       implicit none
1691       include 'global.h'
1692       include 'mpinpb.h'
1693       integer i, d1, d2, d3
1694       double complex u1(d1, d2, d3)
1695       integer j, q,r,s, ierr
1696       double complex chk,allchk
1697       chk = (0.0,0.0)
1698
1699       do j=1,1024
1700          q = mod(j, nx)+1
1701          if (q .ge. xstart(1) .and. q .le. xend(1)) then
1702             r = mod(3*j,ny)+1
1703             if (r .ge. ystart(1) .and. r .le. yend(1)) then
1704                s = mod(5*j,nz)+1
1705                if (s .ge. zstart(1) .and. s .le. zend(1)) then
1706                   chk=chk+u1(q-xstart(1)+1,r-ystart(1)+1,s-zstart(1)+1)
1707                end if
1708             end if
1709          end if
1710       end do
1711       chk = chk/ntotal_f
1712
1713       call MPI_Reduce(chk, allchk, 1, dc_type, MPI_SUM,
1714      >                0, MPI_COMM_WORLD, ierr)
1715       if (me .eq. 0) then
1716             write (*, 30) i, allchk
1717  30         format (' T =',I5,5X,'Checksum =',1P2D22.12)
1718       endif
1719
1720 c      sums(i) = allchk
1721 c     If we compute the checksum for diagnostic purposes, we let i be
1722 c     negative, so the result will not be stored in an array
1723       if (i .gt. 0) sums(i) = allchk
1724
1725       return
1726       end
1727
1728 c---------------------------------------------------------------------
1729 c---------------------------------------------------------------------
1730
1731       subroutine synchup
1732
1733 c---------------------------------------------------------------------
1734 c---------------------------------------------------------------------
1735
1736       implicit none
1737       include 'global.h'
1738       include 'mpinpb.h'
1739       integer ierr
1740       call timer_start(T_synch)
1741       call mpi_barrier(MPI_COMM_WORLD, ierr)
1742       call timer_stop(T_synch)
1743       return
1744       end
1745
1746
1747 c---------------------------------------------------------------------
1748 c---------------------------------------------------------------------
1749
1750       subroutine verify (d1, d2, d3, nt, verified, class)
1751
1752 c---------------------------------------------------------------------
1753 c---------------------------------------------------------------------
1754
1755       implicit none
1756       include 'global.h'
1757       include 'mpinpb.h'
1758       integer d1, d2, d3, nt
1759       character class
1760       logical verified
1761       integer ierr, size, i
1762       double precision err, epsilon
1763
1764 c---------------------------------------------------------------------
1765 c   Reference checksums
1766 c---------------------------------------------------------------------
1767       double complex csum_ref(25)
1768
1769
1770       class = 'U'
1771
1772       if (me .ne. 0) return
1773
1774       epsilon = 1.0d-12
1775       verified = .FALSE.
1776
1777       if (d1 .eq. 64 .and.
1778      >    d2 .eq. 64 .and.
1779      >    d3 .eq. 64 .and.
1780      >    nt .eq. 6) then
1781 c---------------------------------------------------------------------
1782 c   Sample size reference checksums
1783 c---------------------------------------------------------------------
1784          class = 'S'
1785          csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
1786          csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
1787          csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
1788          csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
1789          csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
1790          csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
1791
1792       else if (d1 .eq. 128 .and.
1793      >    d2 .eq. 128 .and.
1794      >    d3 .eq. 32 .and.
1795      >    nt .eq. 6) then
1796 c---------------------------------------------------------------------
1797 c   Class W size reference checksums
1798 c---------------------------------------------------------------------
1799          class = 'W'
1800          csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
1801          csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
1802          csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
1803          csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
1804          csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
1805          csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
1806
1807       else if (d1 .eq. 256 .and.
1808      >    d2 .eq. 256 .and.
1809      >    d3 .eq. 128 .and.
1810      >    nt .eq. 6) then
1811 c---------------------------------------------------------------------
1812 c   Class A size reference checksums
1813 c---------------------------------------------------------------------
1814          class = 'A'
1815          csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
1816          csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
1817          csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
1818          csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
1819          csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
1820          csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
1821
1822       else if (d1 .eq. 512 .and.
1823      >    d2 .eq. 256 .and.
1824      >    d3 .eq. 256 .and.
1825      >    nt .eq. 20) then
1826 c---------------------------------------------------------------------
1827 c   Class B size reference checksums
1828 c---------------------------------------------------------------------
1829          class = 'B'
1830          csum_ref(1)  = dcmplx(5.177643571579D+02, 5.077803458597D+02)
1831          csum_ref(2)  = dcmplx(5.154521291263D+02, 5.088249431599D+02)
1832          csum_ref(3)  = dcmplx(5.146409228649D+02, 5.096208912659D+02)
1833          csum_ref(4)  = dcmplx(5.142378756213D+02, 5.101023387619D+02)
1834          csum_ref(5)  = dcmplx(5.139626667737D+02, 5.103976610617D+02)
1835          csum_ref(6)  = dcmplx(5.137423460082D+02, 5.105948019802D+02)
1836          csum_ref(7)  = dcmplx(5.135547056878D+02, 5.107404165783D+02)
1837          csum_ref(8)  = dcmplx(5.133910925466D+02, 5.108576573661D+02)
1838          csum_ref(9)  = dcmplx(5.132470705390D+02, 5.109577278523D+02)
1839          csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
1840          csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
1841          csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
1842          csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
1843          csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
1844          csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
1845          csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
1846          csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
1847          csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
1848          csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
1849          csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
1850
1851       else if (d1 .eq. 512 .and.
1852      >    d2 .eq. 512 .and.
1853      >    d3 .eq. 512 .and.
1854      >    nt .eq. 20) then
1855 c---------------------------------------------------------------------
1856 c   Class C size reference checksums
1857 c---------------------------------------------------------------------
1858          class = 'C'
1859          csum_ref(1)  = dcmplx(5.195078707457D+02, 5.149019699238D+02)
1860          csum_ref(2)  = dcmplx(5.155422171134D+02, 5.127578201997D+02)
1861          csum_ref(3)  = dcmplx(5.144678022222D+02, 5.122251847514D+02)
1862          csum_ref(4)  = dcmplx(5.140150594328D+02, 5.121090289018D+02)
1863          csum_ref(5)  = dcmplx(5.137550426810D+02, 5.121143685824D+02)
1864          csum_ref(6)  = dcmplx(5.135811056728D+02, 5.121496764568D+02)
1865          csum_ref(7)  = dcmplx(5.134569343165D+02, 5.121870921893D+02)
1866          csum_ref(8)  = dcmplx(5.133651975661D+02, 5.122193250322D+02)
1867          csum_ref(9)  = dcmplx(5.132955192805D+02, 5.122454735794D+02)
1868          csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
1869          csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
1870          csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
1871          csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
1872          csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
1873          csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
1874          csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
1875          csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
1876          csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
1877          csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
1878          csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
1879
1880       else if (d1 .eq. 2048 .and.
1881      >    d2 .eq. 1024 .and.
1882      >    d3 .eq. 1024 .and.
1883      >    nt .eq. 25) then
1884 c---------------------------------------------------------------------
1885 c   Class D size reference checksums
1886 c---------------------------------------------------------------------
1887          class = 'D'
1888          csum_ref(1)  = dcmplx(5.122230065252D+02, 5.118534037109D+02)
1889          csum_ref(2)  = dcmplx(5.120463975765D+02, 5.117061181082D+02)
1890          csum_ref(3)  = dcmplx(5.119865766760D+02, 5.117096364601D+02)
1891          csum_ref(4)  = dcmplx(5.119518799488D+02, 5.117373863950D+02)
1892          csum_ref(5)  = dcmplx(5.119269088223D+02, 5.117680347632D+02)
1893          csum_ref(6)  = dcmplx(5.119082416858D+02, 5.117967875532D+02)
1894          csum_ref(7)  = dcmplx(5.118943814638D+02, 5.118225281841D+02)
1895          csum_ref(8)  = dcmplx(5.118842385057D+02, 5.118451629348D+02)
1896          csum_ref(9)  = dcmplx(5.118769435632D+02, 5.118649119387D+02)
1897          csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
1898          csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
1899          csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
1900          csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
1901          csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
1902          csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
1903          csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
1904          csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
1905          csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
1906          csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
1907          csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
1908          csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
1909          csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
1910          csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
1911          csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
1912          csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
1913
1914       else if (d1 .eq. 4096 .and.
1915      >    d2 .eq. 2048 .and.
1916      >    d3 .eq. 2048 .and.
1917      >    nt .eq. 25) then
1918 c---------------------------------------------------------------------
1919 c   Class E size reference checksums
1920 c---------------------------------------------------------------------
1921          class = 'E'
1922          csum_ref(1)  = dcmplx(5.121601045346D+02, 5.117395998266D+02)
1923          csum_ref(2)  = dcmplx(5.120905403678D+02, 5.118614716182D+02)
1924          csum_ref(3)  = dcmplx(5.120623229306D+02, 5.119074203747D+02)
1925          csum_ref(4)  = dcmplx(5.120438418997D+02, 5.119345900733D+02)
1926          csum_ref(5)  = dcmplx(5.120311521872D+02, 5.119551325550D+02)
1927          csum_ref(6)  = dcmplx(5.120226088809D+02, 5.119720179919D+02)
1928          csum_ref(7)  = dcmplx(5.120169296534D+02, 5.119861371665D+02)
1929          csum_ref(8)  = dcmplx(5.120131225172D+02, 5.119979364402D+02)
1930          csum_ref(9)  = dcmplx(5.120104767108D+02, 5.120077674092D+02)
1931          csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
1932          csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
1933          csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
1934          csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
1935          csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
1936          csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
1937          csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
1938          csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
1939          csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
1940          csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
1941          csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
1942          csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
1943          csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
1944          csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
1945          csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
1946          csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
1947
1948       endif
1949
1950
1951       if (class .ne. 'U') then
1952
1953          do i = 1, nt
1954             err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
1955             if (.not.(err .le. epsilon)) goto 100
1956          end do
1957          verified = .TRUE.
1958  100     continue
1959
1960       endif
1961
1962       call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
1963       if (size .ne. np) then
1964          write(*, 4010) np
1965          write(*, 4011)
1966          write(*, 4012)
1967 c---------------------------------------------------------------------
1968 c multiple statements because some Fortran compilers have
1969 c problems with long strings.
1970 c---------------------------------------------------------------------
1971  4010    format( ' Warning: benchmark was compiled for ', i5,
1972      >           'processors')
1973  4011    format( ' Must be run on this many processors for official',
1974      >           ' verification')
1975  4012    format( ' so memory access is repeatable')
1976          verified = .false.
1977       endif
1978
1979       if (class .ne. 'U') then
1980          if (verified) then
1981             write(*,2000)
1982  2000       format(' Result verification successful')
1983          else
1984             write(*,2001)
1985  2001       format(' Result verification failed')
1986          endif
1987       endif
1988       print *, 'class = ', class
1989
1990       return
1991       end
1992
1993