examples/smpi/NAS/FT/ft.f

   1 !-------------------------------------------------------------------------!
   2 !                                                                         !
   3 !        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
   4 !                                                                         !
   5 !                                   F T                                   !
   6 !                                                                         !
   7 !-------------------------------------------------------------------------!
   8 !                                                                         !
   9 !    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
  10 !    It is described in NAS Technical Reports 95-020 and 02-007           !
  11 !                                                                         !
  12 !    Permission to use, copy, distribute and modify this software         !
  13 !    for any purpose with or without fee is hereby granted.  We           !
  14 !    request, however, that all derived work reference the NAS            !
  15 !    Parallel Benchmarks 3.3. This software is provided "as is"           !
  16 !    without express or implied warranty.                                 !
  17 !                                                                         !
  18 !    Information on NPB 3.3, including the technical report, the          !
  19 !    original specifications, source code, results and information        !
  20 !    on how to submit new results, is available at:                       !
  21 !                                                                         !
  22 !           http://www.nas.nasa.gov/Software/NPB/                         !
  23 !                                                                         !
  24 !    Send comments or suggestions to  npb@nas.nasa.gov                    !
  25 !                                                                         !
  26 !          NAS Parallel Benchmarks Group                                  !
  27 !          NASA Ames Research Center                                      !
  28 !          Mail Stop: T27A-1                                              !
  29 !          Moffett Field, CA   94035-1000                                 !
  30 !                                                                         !
  31 !          E-mail:  npb@nas.nasa.gov                                      !
  32 !          Fax:     (650) 604-3957                                        !
  33 !                                                                         !
  34 !-------------------------------------------------------------------------!
  35
  36 !TO REDUCE THE AMOUNT OF MEMORY REQUIRED BY THE BENCHMARK WE NO LONGER
  37 !STORE THE ENTIRE TIME EVOLUTION ARRAY "EX" FOR ALL TIME STEPS, BUT
  38 !JUST FOR THE FIRST. ALSO, IT IS STORED ONLY FOR THE PART OF THE GRID
  39 !FOR WHICH THE CALLING PROCESSOR IS RESPONSIBLE, SO THAT THE MEMORY
  40 !USAGE BECOMES SCALABLE. THIS NEW ARRAY IS CALLED "TWIDDLE" (SEE
  41 !NPB3.0-SER)
  42
  43 !TO AVOID PROBLEMS WITH VERY LARGE ARRAY SIZES THAT ARE COMPUTED BY
  44 !MULTIPLYING GRID DIMENSIONS (CAUSING INTEGER OVERFLOW IN THE VARIABLE
  45 !NTOTAL) AND SUBSEQUENTLY DIVIDING BY THE NUMBER OF PROCESSORS, WE
  46 !COMPUTE THE SIZE OF ARRAY PARTITIONS MORE CONSERVATIVELY AS
  47 !((NX*NY)/NP)*NZ, WHERE NX, NY, AND NZ ARE GRID DIMENSIONS AND NP IS
  48 !THE NUMBER OF PROCESSORS, THE RESULT IS STORED IN "NTDIVNP". FOR THE
  49 !PERFORMANCE CALCULATION WE STORE THE TOTAL NUMBER OF GRID POINTS IN A
  50 !FLOATING POINT NUMBER "NTOTAL_F" INSTEAD OF AN INTEGER.
  51 !THIS FIX WILL FAIL IF THE NUMBER OF PROCESSORS IS SMALL.
  52
  53 !UGLY HACK OF SUBROUTINE IPOW46: FOR VERY LARGE GRIDS THE SINGLE EXPONENT
  54 !FROM NPB2.3 MAY NOT FIT IN A 32-BIT INTEGER. HOWEVER, WE KNOW THAT THE
  55 !"EXPONENT" ARGUMENT OF THIS ROUTINE CAN ALWAYS BE FACTORED INTO A TERM
  56 !DIVISIBLE BY NX (EXP_1) AND ANOTHER TERM (EXP_2). NX IS USUALLY A POWER
  57 !OF TWO, SO WE CAN KEEP HALVING IT UNTIL THE PRODUCT OF EXP_1
  58 !AND EXP_2 IS SMALL ENOUGH (NAMELY EXP_2 ITSELF). THIS UPDATED VERSION
  59 !OF IPWO46, WHICH NOW TAKES THE TWO FACTORS OF "EXPONENT" AS SEPARATE
  60 !ARGUMENTS, MAY BREAK DOWN IF EXP_1 DOES NOT CONTAIN A LARGE POWER OF TWO.
  61
  62 c---------------------------------------------------------------------
  63 c
  64 c Authors: D. Bailey
  65 c          W. Saphir
  66 c          R. F. Van der Wijngaart
  67 c
  68 c---------------------------------------------------------------------
  69
  70 c---------------------------------------------------------------------
  71
  72 c---------------------------------------------------------------------
  73 c FT benchmark
  74 c---------------------------------------------------------------------
  75
  76 c---------------------------------------------------------------------
  77 c---------------------------------------------------------------------
  78
  79       program ft
  80
  81 c---------------------------------------------------------------------
  82 c---------------------------------------------------------------------
  83
  84       implicit none
  85
  86       include 'mpif.h'
  87       include 'global.h'
  88       integer i, ierr
  89
  90 c---------------------------------------------------------------------
  91 c u0, u1, u2 are the main arrays in the problem.
  92 c Depending on the decomposition, these arrays will have different
  93 c dimensions. To accomodate all possibilities, we allocate them as
  94 c one-dimensional arrays and pass them to subroutines for different
  95 c views
  96 c  - u0 contains the initial (transformed) initial condition
  97 c  - u1 and u2 are working arrays
  98 c---------------------------------------------------------------------
  99
 100       double complex   u0(ntdivnp),
 101      >                 u1(ntdivnp),
 102      >                 u2(ntdivnp)
 103       double precision twiddle(ntdivnp)
 104 c---------------------------------------------------------------------
 105 c Large arrays are in common so that they are allocated on the
 106 c heap rather than the stack. This common block is not
 107 c referenced directly anywhere else. Padding is to avoid accidental
 108 c cache problems, since all array sizes are powers of two.
 109 c---------------------------------------------------------------------
 110
 111       double complex pad1(3), pad2(3), pad3(3)
 112       common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
 113
 114       integer iter
 115       double precision total_time, mflops
 116       logical verified
 117       character class
 118
 119       call MPI_Init(ierr)
 120
 121 c---------------------------------------------------------------------
 122 c Run the entire problem once to make sure all data is touched.
 123 c This reduces variable startup costs, which is important for such a
 124 c short benchmark. The other NPB 2 implementations are similar.
 125 c---------------------------------------------------------------------
 126       do i = 1, t_max
 127          call timer_clear(i)
 128       end do
 129
 130       call setup()
 131       call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
 132       call compute_initial_conditions(u1, dims(1,1), dims(2,1),
 133      >                                dims(3,1))
 134       call fft_init (dims(1,1))
 135       call fft(1, u1, u0)
 136
 137 c---------------------------------------------------------------------
 138 c Start over from the beginning. Note that all operations must
 139 c be timed, in contrast to other benchmarks.
 140 c---------------------------------------------------------------------
 141       do i = 1, t_max
 142          call timer_clear(i)
 143       end do
 144       call MPI_Barrier(MPI_COMM_WORLD, ierr)
 145
 146       call timer_start(T_total)
 147       if (timers_enabled) call timer_start(T_setup)
 148
 149       call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
 150       call compute_initial_conditions(u1, dims(1,1), dims(2,1),
 151      >                                dims(3,1))
 152       call fft_init (dims(1,1))
 153
 154       if (timers_enabled) call synchup()
 155       if (timers_enabled) call timer_stop(T_setup)
 156
 157       if (timers_enabled) call timer_start(T_fft)
 158       call fft(1, u1, u0)
 159       if (timers_enabled) call timer_stop(T_fft)
 160
 161       do iter = 1, niter
 162          if (timers_enabled) call timer_start(T_evolve)
 163          call evolve(u0, u1, twiddle, dims(1,1), dims(2,1), dims(3,1))
 164          if (timers_enabled) call timer_stop(T_evolve)
 165          if (timers_enabled) call timer_start(T_fft)
 166          call fft(-1, u1, u2)
 167          if (timers_enabled) call timer_stop(T_fft)
 168          if (timers_enabled) call synchup()
 169          if (timers_enabled) call timer_start(T_checksum)
 170          call checksum(iter, u2, dims(1,1), dims(2,1), dims(3,1))
 171          if (timers_enabled) call timer_stop(T_checksum)
 172       end do
 173
 174       call verify(nx, ny, nz, niter, verified, class)
 175       call timer_stop(t_total)
 176       if (np .ne. np_min) verified = .false.
 177       total_time = timer_read(t_total)
 178
 179       if( total_time .ne. 0. ) then
 180          mflops = 1.0d-6*ntotal_f *
 181      >             (14.8157+7.19641*log(ntotal_f)
 182      >          +  (5.23518+7.21113*log(ntotal_f))*niter)
 183      >                 /total_time
 184       else
 185          mflops = 0.0
 186       endif
 187       if (me .eq. 0) then
 188          call print_results('FT', class, nx, ny, nz, niter, np_min, np,
 189      >     total_time, mflops, '          floating point', verified,
 190      >     npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
 191       endif
 192       if (timers_enabled) call print_timers()
 193       call MPI_Finalize(ierr)
 194       end
 195
 196 c---------------------------------------------------------------------
 197 c---------------------------------------------------------------------
 198
 199       subroutine evolve(u0, u1, twiddle, d1, d2, d3)
 200
 201 c---------------------------------------------------------------------
 202 c---------------------------------------------------------------------
 203
 204 c---------------------------------------------------------------------
 205 c evolve u0 -> u1 (t time steps) in fourier space
 206 c---------------------------------------------------------------------
 207
 208       implicit none
 209       include 'global.h'
 210       integer d1, d2, d3
 211       double precision exi
 212       double complex u0(d1,d2,d3)
 213       double complex u1(d1,d2,d3)
 214       double precision twiddle(d1,d2,d3)
 215       integer i, j, k
 216
 217       do k = 1, d3
 218          do j = 1, d2
 219             do i = 1, d1
 220                u0(i,j,k) = u0(i,j,k)*(twiddle(i,j,k))
 221                u1(i,j,k) = u0(i,j,k)
 222             end do
 223          end do
 224       end do
 225
 226       return
 227       end
 228
 229
 230 c---------------------------------------------------------------------
 231 c---------------------------------------------------------------------
 232
 233       subroutine compute_initial_conditions(u0, d1, d2, d3)
 234
 235 c---------------------------------------------------------------------
 236 c---------------------------------------------------------------------
 237
 238 c---------------------------------------------------------------------
 239 c Fill in array u0 with initial conditions from
 240 c random number generator
 241 c---------------------------------------------------------------------
 242       implicit none
 243       include 'global.h'
 244       integer d1, d2, d3
 245       double complex u0(d1, d2, d3)
 246       integer k
 247       double precision x0, start, an, dummy
 248
 249 c---------------------------------------------------------------------
 250 c 0-D and 1-D layouts are easy because each processor gets a contiguous
 251 c chunk of the array, in the Fortran ordering sense.
 252 c For a 2-D layout, it's a bit more complicated. We always
 253 c have entire x-lines (contiguous) in processor.
 254 c We can do ny/np1 of them at a time since we have
 255 c ny/np1 contiguous in y-direction. But then we jump
 256 c by z-planes (nz/np2 of them, total).
 257 c For the 0-D and 1-D layouts we could do larger chunks, but
 258 c this turns out to have no measurable impact on performance.
 259 c---------------------------------------------------------------------
 260
 261
 262       start = seed
 263 c---------------------------------------------------------------------
 264 c Jump to the starting element for our first plane.
 265 c---------------------------------------------------------------------
 266       call ipow46(a, 2*nx, (zstart(1)-1)*ny + (ystart(1)-1), an)
 267       dummy = randlc(start, an)
 268       call ipow46(a, 2*nx, ny, an)
 269
 270 c---------------------------------------------------------------------
 271 c Go through by z planes filling in one square at a time.
 272 c---------------------------------------------------------------------
 273       do k = 1, dims(3, 1) ! nz/np2
 274          x0 = start
 275          call vranlc(2*nx*dims(2, 1), x0, a, u0(1, 1, k))
 276          if (k .ne. dims(3, 1)) dummy = randlc(start, an)
 277       end do
 278
 279       return
 280       end
 281
 282
 283 c---------------------------------------------------------------------
 284 c---------------------------------------------------------------------
 285
 286       subroutine ipow46(a, exp_1, exp_2, result)
 287
 288 c---------------------------------------------------------------------
 289 c---------------------------------------------------------------------
 290
 291 c---------------------------------------------------------------------
 292 c compute a^exponent mod 2^46
 293 c---------------------------------------------------------------------
 294
 295       implicit none
 296       double precision a, result, dummy, q, r
 297       integer exp_1, exp_2, n, n2, ierr
 298       external randlc
 299       double precision randlc
 300       logical  two_pow
 301 c---------------------------------------------------------------------
 302 c Use
 303 c   a^n = a^(n/2)*a^(n/2) if n even else
 304 c   a^n = a*a^(n-1)       if n odd
 305 c---------------------------------------------------------------------
 306       result = 1
 307       if (exp_2 .eq. 0 .or. exp_1 .eq. 0) return
 308       q = a
 309       r = 1
 310       n = exp_1
 311       two_pow = .true.
 312
 313       do while (two_pow)
 314          n2 = n/2
 315          if (n2 * 2 .eq. n) then
 316             dummy = randlc(q, q)
 317             n = n2
 318          else
 319             n = n * exp_2
 320             two_pow = .false.
 321          endif
 322       end do
 323
 324       do while (n .gt. 1)
 325          n2 = n/2
 326          if (n2 * 2 .eq. n) then
 327             dummy = randlc(q, q)
 328             n = n2
 329          else
 330             dummy = randlc(r, q)
 331             n = n-1
 332          endif
 333       end do
 334       dummy = randlc(r, q)
 335       result = r
 336       return
 337       end
 338
 339
 340 c---------------------------------------------------------------------
 341 c---------------------------------------------------------------------
 342
 343       subroutine setup
 344
 345 c---------------------------------------------------------------------
 346 c---------------------------------------------------------------------
 347
 348       implicit none
 349       include 'mpinpb.h'
 350       include 'global.h'
 351
 352       integer ierr, i, j, fstatus
 353       debug = .FALSE.
 354
 355       call MPI_Comm_size(MPI_COMM_WORLD, np, ierr)
 356       call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
 357
 358       if (.not. convertdouble) then
 359          dc_type = MPI_DOUBLE_COMPLEX
 360       else
 361          dc_type = MPI_COMPLEX
 362       endif
 363
 364
 365       if (me .eq. 0) then
 366          write(*, 1000)
 367          open (unit=2,file='inputft.data',status='old', iostat=fstatus)
 368
 369          if (fstatus .eq. 0) then
 370             write(*,233)
 371  233        format(' Reading from input file inputft.data')
 372             read (2,*) niter
 373             read (2,*) layout_type
 374             read (2,*) np1, np2
 375             close(2)
 376
 377 c---------------------------------------------------------------------
 378 c check to make sure input data is consistent
 379 c---------------------------------------------------------------------
 380
 381
 382 c---------------------------------------------------------------------
 383 c 1. product of processor grid dims must equal number of processors
 384 c---------------------------------------------------------------------
 385
 386             if (np1 * np2 .ne. np) then
 387                write(*, 238)
 388  238           format(' np1 and np2 given in input file are not valid.')
 389                write(*, 239) np1*np2, np
 390  239           format(' Product is ', i5, ' and should be ', i5)
 391                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
 392                stop
 393             endif
 394
 395 c---------------------------------------------------------------------
 396 c 2. layout type must be valid
 397 c---------------------------------------------------------------------
 398
 399             if (layout_type .ne. layout_0D .and.
 400      >          layout_type .ne. layout_1D .and.
 401      >          layout_type .ne. layout_2D) then
 402                write(*, 240)
 403  240           format(' Layout type specified in inputft.data is
 404      >                  invalid ')
 405                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
 406                stop
 407             endif
 408
 409 c---------------------------------------------------------------------
 410 c 3. 0D layout must be 1x1 grid
 411 c---------------------------------------------------------------------
 412
 413             if (layout_type .eq. layout_0D .and.
 414      >            (np1 .ne.1 .or. np2 .ne. 1)) then
 415                write(*, 241)
 416  241           format(' For 0D layout, both np1 and np2 must be 1 ')
 417                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
 418                stop
 419             endif
 420 c---------------------------------------------------------------------
 421 c 4. 1D layout must be 1xN grid
 422 c---------------------------------------------------------------------
 423
 424             if (layout_type .eq. layout_1D .and. np1 .ne. 1) then
 425                write(*, 242)
 426  242           format(' For 1D layout, np1 must be 1 ')
 427                call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
 428                stop
 429             endif
 430
 431          else
 432             write(*,234)
 433             niter = niter_default
 434             if (np .eq. 1) then
 435                np1 = 1
 436                np2 = 1
 437                layout_type = layout_0D
 438             else if (np .le. nz) then
 439                np1 = 1
 440                np2 = np
 441                layout_type = layout_1D
 442             else
 443                np1 = nz
 444                np2 = np/nz
 445                layout_type = layout_2D
 446             endif
 447          endif
 448
 449          if (np .lt. np_min) then
 450             write(*, 10) np_min
 451  10         format(' Error: Compiled for ', I5, ' processors. ')
 452             write(*, 11) np
 453  11         format(' Only ',  i5, ' processors found ')
 454             call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
 455             stop
 456          endif
 457
 458  234     format(' No input file inputft.data. Using compiled defaults')
 459          write(*, 1001) nx, ny, nz
 460          write(*, 1002) niter
 461          write(*, 1004) np
 462          write(*, 1005) np1, np2
 463          if (np .ne. np_min) write(*, 1006) np_min
 464
 465          if (layout_type .eq. layout_0D) then
 466             write(*, 1010) '0D'
 467          else if (layout_type .eq. layout_1D) then
 468             write(*, 1010) '1D'
 469          else
 470             write(*, 1010) '2D'
 471          endif
 472
 473  1000 format(//,' NAS Parallel Benchmarks 3.3 -- FT Benchmark',/)
 474  1001    format(' Size                : ', i4, 'x', i4, 'x', i4)
 475  1002    format(' Iterations          : ', 7x, i7)
 476  1004    format(' Number of processes : ', 7x, i7)
 477  1005    format(' Processor array     : ', 5x, i4, 'x', i4)
 478  1006    format(' WARNING: compiled for ', i5, ' processes. ',
 479      >          ' Will not verify. ')
 480  1010    format(' Layout type         : ', 9x, A5)
 481       endif
 482
 483
 484 c---------------------------------------------------------------------
 485 c Since np1, np2 and layout_type are in a common block,
 486 c this sends all three.
 487 c---------------------------------------------------------------------
 488       call MPI_BCAST(np1, 3, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
 489       call MPI_BCAST(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
 490
 491       if (np1 .eq. 1 .and. np2 .eq. 1) then
 492         layout_type = layout_0D
 493       else if (np1 .eq. 1) then
 494          layout_type = layout_1D
 495       else
 496          layout_type = layout_2D
 497       endif
 498
 499       if (layout_type .eq. layout_0D) then
 500          do i = 1, 3
 501             dims(1, i) = nx
 502             dims(2, i) = ny
 503             dims(3, i) = nz
 504          end do
 505       else if (layout_type .eq. layout_1D) then
 506          dims(1, 1) = nx
 507          dims(2, 1) = ny
 508          dims(3, 1) = nz
 509
 510          dims(1, 2) = nx
 511          dims(2, 2) = ny
 512          dims(3, 2) = nz
 513
 514          dims(1, 3) = nz
 515          dims(2, 3) = nx
 516          dims(3, 3) = ny
 517       else if (layout_type .eq. layout_2D) then
 518          dims(1, 1) = nx
 519          dims(2, 1) = ny
 520          dims(3, 1) = nz
 521
 522          dims(1, 2) = ny
 523          dims(2, 2) = nx
 524          dims(3, 2) = nz
 525
 526          dims(1, 3) = nz
 527          dims(2, 3) = nx
 528          dims(3, 3) = ny
 529
 530       endif
 531       do i = 1, 3
 532          dims(2, i) = dims(2, i) / np1
 533          dims(3, i) = dims(3, i) / np2
 534       end do
 535
 536
 537 c---------------------------------------------------------------------
 538 c Determine processor coordinates of this processor
 539 c Processor grid is np1xnp2.
 540 c Arrays are always (n1, n2/np1, n3/np2)
 541 c Processor coords are zero-based.
 542 c---------------------------------------------------------------------
 543       me2 = mod(me, np2)  ! goes from 0...np2-1
 544       me1 = me/np2        ! goes from 0...np1-1
 545 c---------------------------------------------------------------------
 546 c Communicators for rows/columns of processor grid.
 547 c commslice1 is communicator of all procs with same me1, ranked as me2
 548 c commslice2 is communicator of all procs with same me2, ranked as me1
 549 c mpi_comm_split(comm, color, key, ...)
 550 c---------------------------------------------------------------------
 551       call MPI_Comm_split(MPI_COMM_WORLD, me1, me2, commslice1, ierr)
 552       call MPI_Comm_split(MPI_COMM_WORLD, me2, me1, commslice2, ierr)
 553       if (timers_enabled) call synchup()
 554
 555       if (debug) print *, 'proc coords: ', me, me1, me2
 556
 557 c---------------------------------------------------------------------
 558 c Determine which section of the grid is owned by this
 559 c processor.
 560 c---------------------------------------------------------------------
 561       if (layout_type .eq. layout_0d) then
 562
 563          do i = 1, 3
 564             xstart(i) = 1
 565             xend(i)   = nx
 566             ystart(i) = 1
 567             yend(i)   = ny
 568             zstart(i) = 1
 569             zend(i)   = nz
 570          end do
 571
 572       else if (layout_type .eq. layout_1d) then
 573
 574          xstart(1) = 1
 575          xend(1)   = nx
 576          ystart(1) = 1
 577          yend(1)   = ny
 578          zstart(1) = 1 + me2 * nz/np2
 579          zend(1)   = (me2+1) * nz/np2
 580
 581          xstart(2) = 1
 582          xend(2)   = nx
 583          ystart(2) = 1
 584          yend(2)   = ny
 585          zstart(2) = 1 + me2 * nz/np2
 586          zend(2)   = (me2+1) * nz/np2
 587
 588          xstart(3) = 1
 589          xend(3)   = nx
 590          ystart(3) = 1 + me2 * ny/np2
 591          yend(3)   = (me2+1) * ny/np2
 592          zstart(3) = 1
 593          zend(3)   = nz
 594
 595       else if (layout_type .eq. layout_2d) then
 596
 597          xstart(1) = 1
 598          xend(1)   = nx
 599          ystart(1) = 1 + me1 * ny/np1
 600          yend(1)   = (me1+1) * ny/np1
 601          zstart(1) = 1 + me2 * nz/np2
 602          zend(1)   = (me2+1) * nz/np2
 603
 604          xstart(2) = 1 + me1 * nx/np1
 605          xend(2)   = (me1+1)*nx/np1
 606          ystart(2) = 1
 607          yend(2)   = ny
 608          zstart(2) = zstart(1)
 609          zend(2)   = zend(1)
 610
 611          xstart(3) = xstart(2)
 612          xend(3)   = xend(2)
 613          ystart(3) = 1 + me2 *ny/np2
 614          yend(3)   = (me2+1)*ny/np2
 615          zstart(3) = 1
 616          zend(3)   = nz
 617       endif
 618
 619 c---------------------------------------------------------------------
 620 c Set up info for blocking of ffts and transposes.  This improves
 621 c performance on cache-based systems. Blocking involves
 622 c working on a chunk of the problem at a time, taking chunks
 623 c along the first, second, or third dimension.
 624 c
 625 c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
 626 c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
 627
 628 c Since 1st dim is always in processor, we'll assume it's long enough
 629 c (default blocking factor is 16 so min size for 1st dim is 16)
 630 c The only case we have to worry about is cffts1 in a 2d decomposition.
 631 c so the blocking factor should not be larger than the 2nd dimension.
 632 c---------------------------------------------------------------------
 633
 634       fftblock = fftblock_default
 635       fftblockpad = fftblockpad_default
 636
 637       if (layout_type .eq. layout_2d) then
 638          if (dims(2, 1) .lt. fftblock) fftblock = dims(2, 1)
 639          if (dims(2, 2) .lt. fftblock) fftblock = dims(2, 2)
 640          if (dims(2, 3) .lt. fftblock) fftblock = dims(2, 3)
 641       endif
 642
 643       if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3
 644
 645       return
 646       end
 647
 648
 649 c---------------------------------------------------------------------
 650 c---------------------------------------------------------------------
 651
 652       subroutine compute_indexmap(twiddle, d1, d2, d3)
 653
 654 c---------------------------------------------------------------------
 655 c---------------------------------------------------------------------
 656
 657 c---------------------------------------------------------------------
 658 c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
 659 c for time evolution exponent.
 660 c---------------------------------------------------------------------
 661
 662       implicit none
 663       include 'mpinpb.h'
 664       include 'global.h'
 665       integer d1, d2, d3
 666       integer i, j, k, ii, ii2, jj, ij2, kk
 667       double precision ap, twiddle(d1, d2, d3)
 668
 669 c---------------------------------------------------------------------
 670 c this function is very different depending on whether
 671 c we are in the 0d, 1d or 2d layout. Compute separately.
 672 c basically we want to convert the fortran indices
 673 c   1 2 3 4 5 6 7 8
 674 c to
 675 c   0 1 2 3 -4 -3 -2 -1
 676 c The following magic formula does the trick:
 677 c mod(i-1+n/2, n) - n/2
 678 c---------------------------------------------------------------------
 679
 680       ap = - 4.d0 * alpha * pi *pi
 681
 682       if (layout_type .eq. layout_0d) then ! xyz layout
 683          do i = 1, dims(1,3)
 684             ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
 685             ii2 = ii*ii
 686             do j = 1, dims(2,3)
 687                jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
 688                ij2 = jj*jj+ii2
 689                do k = 1, dims(3,3)
 690                   kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
 691                   twiddle(i,j,k) = dexp(ap*dfloat(kk*kk+ij2))
 692                end do
 693             end do
 694          end do
 695       else if (layout_type .eq. layout_1d) then ! zxy layout
 696          do i = 1,dims(2,3)
 697             ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
 698             ii2 = ii*ii
 699             do j = 1,dims(3,3)
 700                jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
 701                ij2 = jj*jj+ii2
 702                do k = 1,dims(1,3)
 703                   kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
 704                   twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
 705                end do
 706             end do
 707          end do
 708       else if (layout_type .eq. layout_2d) then ! zxy layout
 709          do i = 1,dims(2,3)
 710             ii =  mod(i+xstart(3)-2+nx/2, nx) - nx/2
 711             ii2 = ii*ii
 712             do j = 1, dims(3,3)
 713                jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
 714                ij2 = jj*jj+ii2
 715                do k =1,dims(1,3)
 716                   kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
 717                   twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
 718                end do
 719             end do
 720          end do
 721       else
 722          print *, ' Unknown layout type ', layout_type
 723          stop
 724       endif
 725
 726       return
 727       end
 728
 729
 730
 731 c---------------------------------------------------------------------
 732 c---------------------------------------------------------------------
 733
 734       subroutine print_timers()
 735
 736 c---------------------------------------------------------------------
 737 c---------------------------------------------------------------------
 738
 739       implicit none
 740       integer i
 741       include 'global.h'
 742       character*25 tstrings(T_max)
 743       data tstrings / '          total ',
 744      >                '          setup ',
 745      >                '            fft ',
 746      >                '         evolve ',
 747      >                '       checksum ',
 748      >                '         fftlow ',
 749      >                '        fftcopy ',
 750      >                '      transpose ',
 751      >                ' transpose1_loc ',
 752      >                ' transpose1_glo ',
 753      >                ' transpose1_fin ',
 754      >                ' transpose2_loc ',
 755      >                ' transpose2_glo ',
 756      >                ' transpose2_fin ',
 757      >                '           sync ' /
 758
 759       if (me .ne. 0) return
 760       do i = 1, t_max
 761          if (timer_read(i) .ne. 0.0d0) then
 762             write(*, 100) i, tstrings(i), timer_read(i)
 763          endif
 764       end do
 765  100  format(' timer ', i2, '(', A16,  ') :', F10.6)
 766       return
 767       end
 768
 769
 770
 771 c---------------------------------------------------------------------
 772 c---------------------------------------------------------------------
 773
 774       subroutine fft(dir, x1, x2)
 775
 776 c---------------------------------------------------------------------
 777 c---------------------------------------------------------------------
 778
 779       implicit none
 780       include 'global.h'
 781       integer dir
 782       double complex x1(ntdivnp), x2(ntdivnp)
 783
 784       double complex scratch(fftblockpad_default*maxdim*2)
 785
 786 c---------------------------------------------------------------------
 787 c note: args x1, x2 must be different arrays
 788 c note: args for cfftsx are (direction, layout, xin, xout, scratch)
 789 c       xin/xout may be the same and it can be somewhat faster
 790 c       if they are
 791 c note: args for transpose are (layout1, layout2, xin, xout)
 792 c       xin/xout must be different
 793 c---------------------------------------------------------------------
 794
 795       if (dir .eq. 1) then
 796          if (layout_type .eq. layout_0d) then
 797             call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
 798      >                  x1, x1, scratch)
 799             call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
 800      >                  x1, x1, scratch)
 801             call cffts3(1, dims(1,3), dims(2,3), dims(3,3),
 802      >                  x1, x2, scratch)
 803          else if (layout_type .eq. layout_1d) then
 804             call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
 805      >                  x1, x1, scratch)
 806             call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
 807      >                  x1, x1, scratch)
 808             if (timers_enabled) call timer_start(T_transpose)
 809             call transpose_xy_z(2, 3, x1, x2)
 810             if (timers_enabled) call timer_stop(T_transpose)
 811             call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
 812      >                  x2, x2, scratch)
 813          else if (layout_type .eq. layout_2d) then
 814             call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
 815      >                  x1, x1, scratch)
 816             if (timers_enabled) call timer_start(T_transpose)
 817             call transpose_x_y(1, 2, x1, x2)
 818             if (timers_enabled) call timer_stop(T_transpose)
 819             call cffts1(1, dims(1,2), dims(2,2), dims(3,2),
 820      >                  x2, x2, scratch)
 821             if (timers_enabled) call timer_start(T_transpose)
 822             call transpose_x_z(2, 3, x2, x1)
 823             if (timers_enabled) call timer_stop(T_transpose)
 824             call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
 825      >                  x1, x2, scratch)
 826          endif
 827       else
 828          if (layout_type .eq. layout_0d) then
 829             call cffts3(-1, dims(1,3), dims(2,3), dims(3,3),
 830      >                  x1, x1, scratch)
 831             call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
 832      >                  x1, x1, scratch)
 833             call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
 834      >                  x1, x2, scratch)
 835          else if (layout_type .eq. layout_1d) then
 836             call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
 837      >                  x1, x1, scratch)
 838             if (timers_enabled) call timer_start(T_transpose)
 839             call transpose_x_yz(3, 2, x1, x2)
 840             if (timers_enabled) call timer_stop(T_transpose)
 841             call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
 842      >                  x2, x2, scratch)
 843             call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
 844      >                  x2, x2, scratch)
 845          else if (layout_type .eq. layout_2d) then
 846             call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
 847      >                  x1, x1, scratch)
 848             if (timers_enabled) call timer_start(T_transpose)
 849             call transpose_x_z(3, 2, x1, x2)
 850             if (timers_enabled) call timer_stop(T_transpose)
 851             call cffts1(-1, dims(1,2), dims(2,2), dims(3,2),
 852      >                  x2, x2, scratch)
 853             if (timers_enabled) call timer_start(T_transpose)
 854             call transpose_x_y(2, 1, x2, x1)
 855             if (timers_enabled) call timer_stop(T_transpose)
 856             call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
 857      >                  x1, x2, scratch)
 858          endif
 859       endif
 860       return
 861       end
 862
 863
 864
 865 c---------------------------------------------------------------------
 866 c---------------------------------------------------------------------
 867
 868       subroutine cffts1(is, d1, d2, d3, x, xout, y)
 869
 870 c---------------------------------------------------------------------
 871 c---------------------------------------------------------------------
 872
 873       implicit none
 874
 875       include 'global.h'
 876       integer is, d1, d2, d3, logd1
 877       double complex x(d1,d2,d3)
 878       double complex xout(d1,d2,d3)
 879       double complex y(fftblockpad, d1, 2)
 880       integer i, j, k, jj
 881
 882       logd1 = ilog2(d1)
 883
 884       do k = 1, d3
 885          do jj = 0, d2 - fftblock, fftblock
 886             if (timers_enabled) call timer_start(T_fftcopy)
 887             do j = 1, fftblock
 888                do i = 1, d1
 889                   y(j,i,1) = x(i,j+jj,k)
 890                enddo
 891             enddo
 892             if (timers_enabled) call timer_stop(T_fftcopy)
 893
 894             if (timers_enabled) call timer_start(T_fftlow)
 895             call cfftz (is, logd1, d1, y, y(1,1,2))
 896             if (timers_enabled) call timer_stop(T_fftlow)
 897
 898             if (timers_enabled) call timer_start(T_fftcopy)
 899             do j = 1, fftblock
 900                do i = 1, d1
 901                   xout(i,j+jj,k) = y(j,i,1)
 902                enddo
 903             enddo
 904             if (timers_enabled) call timer_stop(T_fftcopy)
 905          enddo
 906       enddo
 907
 908       return
 909       end
 910
 911
 912 c---------------------------------------------------------------------
 913 c---------------------------------------------------------------------
 914
 915       subroutine cffts2(is, d1, d2, d3, x, xout, y)
 916
 917 c---------------------------------------------------------------------
 918 c---------------------------------------------------------------------
 919
 920       implicit none
 921
 922       include 'global.h'
 923       integer is, d1, d2, d3, logd2
 924       double complex x(d1,d2,d3)
 925       double complex xout(d1,d2,d3)
 926       double complex y(fftblockpad, d2, 2)
 927       integer i, j, k, ii
 928
 929       logd2 = ilog2(d2)
 930
 931       do k = 1, d3
 932         do ii = 0, d1 - fftblock, fftblock
 933            if (timers_enabled) call timer_start(T_fftcopy)
 934            do j = 1, d2
 935               do i = 1, fftblock
 936                  y(i,j,1) = x(i+ii,j,k)
 937               enddo
 938            enddo
 939            if (timers_enabled) call timer_stop(T_fftcopy)
 940
 941            if (timers_enabled) call timer_start(T_fftlow)
 942            call cfftz (is, logd2, d2, y, y(1, 1, 2))
 943            if (timers_enabled) call timer_stop(T_fftlow)
 944
 945            if (timers_enabled) call timer_start(T_fftcopy)
 946            do j = 1, d2
 947               do i = 1, fftblock
 948                  xout(i+ii,j,k) = y(i,j,1)
 949               enddo
 950            enddo
 951            if (timers_enabled) call timer_stop(T_fftcopy)
 952         enddo
 953       enddo
 954
 955       return
 956       end
 957
 958
 959 c---------------------------------------------------------------------
 960 c---------------------------------------------------------------------
 961
 962       subroutine cffts3(is, d1, d2, d3, x, xout, y)
 963
 964 c---------------------------------------------------------------------
 965 c---------------------------------------------------------------------
 966
 967       implicit none
 968
 969       include 'global.h'
 970       integer is, d1, d2, d3, logd3
 971       double complex x(d1,d2,d3)
 972       double complex xout(d1,d2,d3)
 973       double complex y(fftblockpad, d3, 2)
 974       integer i, j, k, ii
 975
 976       logd3 = ilog2(d3)
 977
 978       do j = 1, d2
 979         do ii = 0, d1 - fftblock, fftblock
 980            if (timers_enabled) call timer_start(T_fftcopy)
 981            do k = 1, d3
 982               do i = 1, fftblock
 983                  y(i,k,1) = x(i+ii,j,k)
 984               enddo
 985            enddo
 986            if (timers_enabled) call timer_stop(T_fftcopy)
 987
 988            if (timers_enabled) call timer_start(T_fftlow)
 989            call cfftz (is, logd3, d3, y, y(1, 1, 2))
 990            if (timers_enabled) call timer_stop(T_fftlow)
 991
 992            if (timers_enabled) call timer_start(T_fftcopy)
 993            do k = 1, d3
 994               do i = 1, fftblock
 995                  xout(i+ii,j,k) = y(i,k,1)
 996               enddo
 997            enddo
 998            if (timers_enabled) call timer_stop(T_fftcopy)
 999         enddo
1000       enddo
1001
1002       return
1003       end
1004
1005
1006 c---------------------------------------------------------------------
1007 c---------------------------------------------------------------------
1008
1009       subroutine fft_init (n)
1010
1011 c---------------------------------------------------------------------
1012 c---------------------------------------------------------------------
1013
1014 c---------------------------------------------------------------------
1015 c compute the roots-of-unity array that will be used for subsequent FFTs.
1016 c---------------------------------------------------------------------
1017
1018       implicit none
1019       include 'global.h'
1020
1021       integer m,n,nu,ku,i,j,ln
1022       double precision t, ti
1023
1024
1025 c---------------------------------------------------------------------
1026 c   Initialize the U array with sines and cosines in a manner that permits
1027 c   stride one access at each FFT iteration.
1028 c---------------------------------------------------------------------
1029       nu = n
1030       m = ilog2(n)
1031       u(1) = m
1032       ku = 2
1033       ln = 1
1034
1035       do j = 1, m
1036          t = pi / ln
1037
1038          do i = 0, ln - 1
1039             ti = i * t
1040             u(i+ku) = dcmplx (cos (ti), sin(ti))
1041          enddo
1042
1043          ku = ku + ln
1044          ln = 2 * ln
1045       enddo
1046
1047       return
1048       end
1049
1050 c---------------------------------------------------------------------
1051 c---------------------------------------------------------------------
1052
1053       subroutine cfftz (is, m, n, x, y)
1054
1055 c---------------------------------------------------------------------
1056 c---------------------------------------------------------------------
1057
1058 c---------------------------------------------------------------------
1059 c   Computes NY N-point complex-to-complex FFTs of X using an algorithm due
1060 c   to Swarztrauber.  X is both the input and the output array, while Y is a
1061 c   scratch array.  It is assumed that N = 2^M.  Before calling CFFTZ to
1062 c   perform FFTs, the array U must be initialized by calling CFFTZ with IS
1063 c   set to 0 and M set to MX, where MX is the maximum value of M for any
1064 c   subsequent call.
1065 c---------------------------------------------------------------------
1066
1067       implicit none
1068       include 'global.h'
1069
1070       integer is,m,n,i,j,l,mx
1071       double complex x, y
1072
1073       dimension x(fftblockpad,n), y(fftblockpad,n)
1074
1075 c---------------------------------------------------------------------
1076 c   Check if input parameters are invalid.
1077 c---------------------------------------------------------------------
1078       mx = u(1)
1079       if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx)
1080      >  then
1081         write (*, 1)  is, m, mx
1082  1      format ('CFFTZ: Either U has not been initialized, or else'/
1083      >    'one of the input parameters is invalid', 3I5)
1084         stop
1085       endif
1086
1087 c---------------------------------------------------------------------
1088 c   Perform one variant of the Stockham FFT.
1089 c---------------------------------------------------------------------
1090       do l = 1, m, 2
1091         call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
1092         if (l .eq. m) goto 160
1093         call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
1094       enddo
1095
1096       goto 180
1097
1098 c---------------------------------------------------------------------
1099 c   Copy Y to X.
1100 c---------------------------------------------------------------------
1101  160  do j = 1, n
1102         do i = 1, fftblock
1103           x(i,j) = y(i,j)
1104         enddo
1105       enddo
1106
1107  180  continue
1108
1109       return
1110       end
1111
1112 c---------------------------------------------------------------------
1113 c---------------------------------------------------------------------
1114
1115       subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
1116
1117 c---------------------------------------------------------------------
1118 c---------------------------------------------------------------------
1119
1120 c---------------------------------------------------------------------
1121 c   Performs the L-th iteration of the second variant of the Stockham FFT.
1122 c---------------------------------------------------------------------
1123
1124       implicit none
1125
1126       integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
1127       double complex u,x,y,u1,x11,x21
1128       dimension u(n), x(ny1,n), y(ny1,n)
1129
1130
1131 c---------------------------------------------------------------------
1132 c   Set initial parameters.
1133 c---------------------------------------------------------------------
1134
1135       n1 = n / 2
1136       lk = 2 ** (l - 1)
1137       li = 2 ** (m - l)
1138       lj = 2 * lk
1139       ku = li + 1
1140
1141       do i = 0, li - 1
1142         i11 = i * lk + 1
1143         i12 = i11 + n1
1144         i21 = i * lj + 1
1145         i22 = i21 + lk
1146         if (is .ge. 1) then
1147           u1 = u(ku+i)
1148         else
1149           u1 = dconjg (u(ku+i))
1150         endif
1151
1152 c---------------------------------------------------------------------
1153 c   This loop is vectorizable.
1154 c---------------------------------------------------------------------
1155         do k = 0, lk - 1
1156           do j = 1, ny
1157             x11 = x(j,i11+k)
1158             x21 = x(j,i12+k)
1159             y(j,i21+k) = x11 + x21
1160             y(j,i22+k) = u1 * (x11 - x21)
1161           enddo
1162         enddo
1163       enddo
1164
1165       return
1166       end
1167
1168 c---------------------------------------------------------------------
1169
1170
1171 c---------------------------------------------------------------------
1172 c---------------------------------------------------------------------
1173
1174       integer function ilog2(n)
1175
1176 c---------------------------------------------------------------------
1177 c---------------------------------------------------------------------
1178
1179       implicit none
1180       integer n, nn, lg
1181       if (n .eq. 1) then
1182          ilog2=0
1183          return
1184       endif
1185       lg = 1
1186       nn = 2
1187       do while (nn .lt. n)
1188          nn = nn*2
1189          lg = lg+1
1190       end do
1191       ilog2 = lg
1192       return
1193       end
1194
1195
1196 c---------------------------------------------------------------------
1197 c---------------------------------------------------------------------
1198
1199       subroutine transpose_x_yz(l1, l2, xin, xout)
1200
1201 c---------------------------------------------------------------------
1202 c---------------------------------------------------------------------
1203
1204       implicit none
1205       include 'global.h'
1206       integer l1, l2
1207       double complex xin(ntdivnp), xout(ntdivnp)
1208
1209       call transpose2_local(dims(1,l1),dims(2, l1)*dims(3, l1),
1210      >                          xin, xout)
1211
1212       call transpose2_global(xout, xin)
1213
1214       call transpose2_finish(dims(1,l1),dims(2, l1)*dims(3, l1),
1215      >                          xin, xout)
1216
1217       return
1218       end
1219
1220
1221 c---------------------------------------------------------------------
1222 c---------------------------------------------------------------------
1223
1224       subroutine transpose_xy_z(l1, l2, xin, xout)
1225
1226 c---------------------------------------------------------------------
1227 c---------------------------------------------------------------------
1228
1229       implicit none
1230       include 'global.h'
1231       integer l1, l2
1232       double complex xin(ntdivnp), xout(ntdivnp)
1233
1234       call transpose2_local(dims(1,l1)*dims(2, l1),dims(3, l1),
1235      >                          xin, xout)
1236       call transpose2_global(xout, xin)
1237       call transpose2_finish(dims(1,l1)*dims(2, l1),dims(3, l1),
1238      >                          xin, xout)
1239
1240       return
1241       end
1242
1243
1244
1245 c---------------------------------------------------------------------
1246 c---------------------------------------------------------------------
1247
1248       subroutine transpose2_local(n1, n2, xin, xout)
1249
1250 c---------------------------------------------------------------------
1251 c---------------------------------------------------------------------
1252
1253       implicit none
1254       include 'mpinpb.h'
1255       include 'global.h'
1256       integer n1, n2
1257       double complex xin(n1, n2), xout(n2, n1)
1258
1259       double complex z(transblockpad, transblock)
1260
1261       integer i, j, ii, jj
1262
1263       if (timers_enabled) call timer_start(T_transxzloc)
1264
1265 c---------------------------------------------------------------------
1266 c If possible, block the transpose for cache memory systems.
1267 c How much does this help? Example: R8000 Power Challenge (90 MHz)
1268 c Blocked version decreases time spend in this routine
1269 c from 14 seconds to 5.2 seconds on 8 nodes class A.
1270 c---------------------------------------------------------------------
1271
1272       if (n1 .lt. transblock .or. n2 .lt. transblock) then
1273          if (n1 .ge. n2) then
1274             do j = 1, n2
1275                do i = 1, n1
1276                   xout(j, i) = xin(i, j)
1277                end do
1278             end do
1279          else
1280             do i = 1, n1
1281                do j = 1, n2
1282                   xout(j, i) = xin(i, j)
1283                end do
1284             end do
1285          endif
1286       else
1287          do j = 0, n2-1, transblock
1288             do i = 0, n1-1, transblock
1289
1290 c---------------------------------------------------------------------
1291 c Note: compiler should be able to take j+jj out of inner loop
1292 c---------------------------------------------------------------------
1293                do jj = 1, transblock
1294                   do ii = 1, transblock
1295                      z(jj,ii) = xin(i+ii, j+jj)
1296                   end do
1297                end do
1298
1299                do ii = 1, transblock
1300                   do jj = 1, transblock
1301                      xout(j+jj, i+ii) = z(jj,ii)
1302                   end do
1303                end do
1304
1305             end do
1306          end do
1307       endif
1308       if (timers_enabled) call timer_stop(T_transxzloc)
1309
1310       return
1311       end
1312
1313
1314 c---------------------------------------------------------------------
1315 c---------------------------------------------------------------------
1316
1317       subroutine transpose2_global(xin, xout)
1318
1319 c---------------------------------------------------------------------
1320 c---------------------------------------------------------------------
1321
1322       implicit none
1323       include 'global.h'
1324       include 'mpinpb.h'
1325       double complex xin(ntdivnp)
1326       double complex xout(ntdivnp)
1327       integer ierr
1328
1329       if (timers_enabled) call synchup()
1330
1331       if (timers_enabled) call timer_start(T_transxzglo)
1332       call mpi_alltoall(xin, ntdivnp/np, dc_type,
1333      >                  xout, ntdivnp/np, dc_type,
1334      >                  commslice1, ierr)
1335       if (timers_enabled) call timer_stop(T_transxzglo)
1336
1337       return
1338       end
1339
1340
1341
1342 c---------------------------------------------------------------------
1343 c---------------------------------------------------------------------
1344
1345       subroutine transpose2_finish(n1, n2, xin, xout)
1346
1347 c---------------------------------------------------------------------
1348 c---------------------------------------------------------------------
1349
1350       implicit none
1351       include 'global.h'
1352       integer n1, n2, ioff
1353       double complex xin(n2, n1/np2, 0:np2-1), xout(n2*np2, n1/np2)
1354
1355       integer i, j, p
1356
1357       if (timers_enabled) call timer_start(T_transxzfin)
1358       do p = 0, np2-1
1359          ioff = p*n2
1360          do j = 1, n1/np2
1361             do i = 1, n2
1362                xout(i+ioff, j) = xin(i, j, p)
1363             end do
1364          end do
1365       end do
1366       if (timers_enabled) call timer_stop(T_transxzfin)
1367
1368       return
1369       end
1370
1371
1372 c---------------------------------------------------------------------
1373 c---------------------------------------------------------------------
1374
1375       subroutine transpose_x_z(l1, l2, xin, xout)
1376
1377 c---------------------------------------------------------------------
1378 c---------------------------------------------------------------------
1379
1380       implicit none
1381       include 'global.h'
1382       integer l1, l2
1383       double complex xin(ntdivnp), xout(ntdivnp)
1384
1385       call transpose_x_z_local(dims(1,l1),dims(2,l1),dims(3,l1),
1386      >                         xin, xout)
1387       call transpose_x_z_global(dims(1,l1),dims(2,l1),dims(3,l1),
1388      >                          xout, xin)
1389       call transpose_x_z_finish(dims(1,l2),dims(2,l2),dims(3,l2),
1390      >                          xin, xout)
1391       return
1392       end
1393
1394
1395 c---------------------------------------------------------------------
1396 c---------------------------------------------------------------------
1397
1398       subroutine transpose_x_z_local(d1, d2, d3, xin, xout)
1399
1400 c---------------------------------------------------------------------
1401 c---------------------------------------------------------------------
1402
1403       implicit none
1404       include 'global.h'
1405       integer d1, d2, d3
1406       double complex xin(d1,d2,d3)
1407       double complex xout(d3,d2,d1)
1408       integer block1, block3
1409       integer i, j, k, kk, ii, i1, k1
1410
1411       double complex buf(transblockpad, maxdim)
1412       if (timers_enabled) call timer_start(T_transxzloc)
1413       if (d1 .lt. 32) goto 100
1414       block3 = d3
1415       if (block3 .eq. 1)  goto 100
1416       if (block3 .gt. transblock) block3 = transblock
1417       block1 = d1
1418       if (block1*block3 .gt. transblock*transblock)
1419      >          block1 = transblock*transblock/block3
1420 c---------------------------------------------------------------------
1421 c blocked transpose
1422 c---------------------------------------------------------------------
1423       do j = 1, d2
1424          do kk = 0, d3-block3, block3
1425             do ii = 0, d1-block1, block1
1426
1427                do k = 1, block3
1428                   k1 = k + kk
1429                   do i = 1, block1
1430                      buf(k, i) = xin(i+ii, j, k1)
1431                   end do
1432                end do
1433
1434                do i = 1, block1
1435                   i1 = i + ii
1436                   do k = 1, block3
1437                      xout(k+kk, j, i1) = buf(k, i)
1438                   end do
1439                end do
1440
1441             end do
1442          end do
1443       end do
1444       goto 200
1445
1446
1447 c---------------------------------------------------------------------
1448 c basic transpose
1449 c---------------------------------------------------------------------
1450  100  continue
1451
1452       do j = 1, d2
1453          do k = 1, d3
1454             do i = 1, d1
1455                xout(k, j, i) = xin(i, j, k)
1456             end do
1457          end do
1458       end do
1459
1460 c---------------------------------------------------------------------
1461 c all done
1462 c---------------------------------------------------------------------
1463  200  continue
1464
1465       if (timers_enabled) call timer_stop(T_transxzloc)
1466       return
1467       end
1468
1469
1470 c---------------------------------------------------------------------
1471 c---------------------------------------------------------------------
1472
1473       subroutine transpose_x_z_global(d1, d2, d3, xin, xout)
1474
1475 c---------------------------------------------------------------------
1476 c---------------------------------------------------------------------
1477
1478       implicit none
1479       include 'global.h'
1480       include 'mpinpb.h'
1481       integer d1, d2, d3
1482       double complex xin(d3,d2,d1)
1483       double complex xout(d3,d2,d1) ! not real layout, but right size
1484       integer ierr
1485
1486       if (timers_enabled) call synchup()
1487
1488 c---------------------------------------------------------------------
1489 c do transpose among all  processes with same 1-coord (me1)
1490 c---------------------------------------------------------------------
1491       if (timers_enabled)call timer_start(T_transxzglo)
1492       call mpi_alltoall(xin, d1*d2*d3/np2, dc_type,
1493      >                  xout, d1*d2*d3/np2, dc_type,
1494      >                  commslice1, ierr)
1495       if (timers_enabled) call timer_stop(T_transxzglo)
1496       return
1497       end
1498
1499
1500 c---------------------------------------------------------------------
1501 c---------------------------------------------------------------------
1502
1503       subroutine transpose_x_z_finish(d1, d2, d3, xin, xout)
1504
1505 c---------------------------------------------------------------------
1506 c---------------------------------------------------------------------
1507
1508       implicit none
1509       include 'global.h'
1510       integer d1, d2, d3
1511       double complex xin(d1/np2, d2, d3, 0:np2-1)
1512       double complex xout(d1,d2,d3)
1513       integer i, j, k, p, ioff
1514       if (timers_enabled) call timer_start(T_transxzfin)
1515 c---------------------------------------------------------------------
1516 c this is the most straightforward way of doing it. the
1517 c calculation in the inner loop doesn't help.
1518 c      do i = 1, d1/np2
1519 c         do j = 1, d2
1520 c            do k = 1, d3
1521 c               do p = 0, np2-1
1522 c                  ii = i + p*d1/np2
1523 c                  xout(ii, j, k) = xin(i, j, k, p)
1524 c               end do
1525 c            end do
1526 c         end do
1527 c      end do
1528 c---------------------------------------------------------------------
1529
1530       do p = 0, np2-1
1531          ioff = p*d1/np2
1532          do k = 1, d3
1533             do j = 1, d2
1534                do i = 1, d1/np2
1535                   xout(i+ioff, j, k) = xin(i, j, k, p)
1536                end do
1537             end do
1538          end do
1539       end do
1540       if (timers_enabled) call timer_stop(T_transxzfin)
1541       return
1542       end
1543
1544
1545 c---------------------------------------------------------------------
1546 c---------------------------------------------------------------------
1547
1548       subroutine transpose_x_y(l1, l2, xin, xout)
1549
1550 c---------------------------------------------------------------------
1551 c---------------------------------------------------------------------
1552
1553       implicit none
1554       include 'global.h'
1555       integer l1, l2
1556       double complex xin(ntdivnp), xout(ntdivnp)
1557
1558 c---------------------------------------------------------------------
1559 c xy transpose is a little tricky, since we don't want
1560 c to touch 3rd axis. But alltoall must involve 3rd axis (most
1561 c slowly varying) to be efficient. So we do
1562 c (nx, ny/np1, nz/np2) -> (ny/np1, nz/np2, nx) (local)
1563 c (ny/np1, nz/np2, nx) -> ((ny/np1*nz/np2)*np1, nx/np1) (global)
1564 c then local finish.
1565 c---------------------------------------------------------------------
1566
1567
1568       call transpose_x_y_local(dims(1,l1),dims(2,l1),dims(3,l1),
1569      >                         xin, xout)
1570       call transpose_x_y_global(dims(1,l1),dims(2,l1),dims(3,l1),
1571      >                          xout, xin)
1572       call transpose_x_y_finish(dims(1,l2),dims(2,l2),dims(3,l2),
1573      >                          xin, xout)
1574
1575       return
1576       end
1577
1578
1579 c---------------------------------------------------------------------
1580 c---------------------------------------------------------------------
1581
1582       subroutine transpose_x_y_local(d1, d2, d3, xin, xout)
1583
1584 c---------------------------------------------------------------------
1585 c---------------------------------------------------------------------
1586
1587       implicit none
1588       include 'global.h'
1589       integer d1, d2, d3
1590       double complex xin(d1, d2, d3)
1591       double complex xout(d2, d3, d1)
1592       integer i, j, k
1593       if (timers_enabled) call timer_start(T_transxyloc)
1594
1595       do k = 1, d3
1596          do i = 1, d1
1597             do j = 1, d2
1598                xout(j,k,i)=xin(i,j,k)
1599             end do
1600          end do
1601       end do
1602       if (timers_enabled) call timer_stop(T_transxyloc)
1603       return
1604       end
1605
1606
1607 c---------------------------------------------------------------------
1608 c---------------------------------------------------------------------
1609
1610       subroutine transpose_x_y_global(d1, d2, d3, xin, xout)
1611
1612 c---------------------------------------------------------------------
1613 c---------------------------------------------------------------------
1614
1615       implicit none
1616       include 'global.h'
1617       include 'mpinpb.h'
1618       integer d1, d2, d3
1619 c---------------------------------------------------------------------
1620 c array is in form (ny/np1, nz/np2, nx)
1621 c---------------------------------------------------------------------
1622       double complex xin(d2,d3,d1)
1623       double complex xout(d2,d3,d1) ! not real layout but right size
1624       integer ierr
1625
1626       if (timers_enabled) call synchup()
1627
1628 c---------------------------------------------------------------------
1629 c do transpose among all processes with same 1-coord (me1)
1630 c---------------------------------------------------------------------
1631       if (timers_enabled) call timer_start(T_transxyglo)
1632       call mpi_alltoall(xin, d1*d2*d3/np1, dc_type,
1633      >                  xout, d1*d2*d3/np1, dc_type,
1634      >                  commslice2, ierr)
1635       if (timers_enabled) call timer_stop(T_transxyglo)
1636
1637       return
1638       end
1639
1640
1641 c---------------------------------------------------------------------
1642 c---------------------------------------------------------------------
1643
1644       subroutine transpose_x_y_finish(d1, d2, d3, xin, xout)
1645
1646 c---------------------------------------------------------------------
1647 c---------------------------------------------------------------------
1648
1649       implicit none
1650       include 'global.h'
1651       integer d1, d2, d3
1652       double complex xin(d1/np1, d3, d2, 0:np1-1)
1653       double complex xout(d1,d2,d3)
1654       integer i, j, k, p, ioff
1655       if (timers_enabled) call timer_start(T_transxyfin)
1656 c---------------------------------------------------------------------
1657 c this is the most straightforward way of doing it. the
1658 c calculation in the inner loop doesn't help.
1659 c      do i = 1, d1/np1
1660 c         do j = 1, d2
1661 c            do k = 1, d3
1662 c               do p = 0, np1-1
1663 c                  ii = i + p*d1/np1
1664 c note order is screwy bcz we have (ny/np1, nz/np2, nx) -> (ny, nx/np1, nz/np2)
1665 c                  xout(ii, j, k) = xin(i, k, j, p)
1666 c               end do
1667 c            end do
1668 c         end do
1669 c      end do
1670 c---------------------------------------------------------------------
1671
1672       do p = 0, np1-1
1673          ioff = p*d1/np1
1674          do k = 1, d3
1675             do j = 1, d2
1676                do i = 1, d1/np1
1677                   xout(i+ioff, j, k) = xin(i, k, j, p)
1678                end do
1679             end do
1680          end do
1681       end do
1682       if (timers_enabled) call timer_stop(T_transxyfin)
1683       return
1684       end
1685
1686
1687 c---------------------------------------------------------------------
1688 c---------------------------------------------------------------------
1689
1690       subroutine checksum(i, u1, d1, d2, d3)
1691
1692 c---------------------------------------------------------------------
1693 c---------------------------------------------------------------------
1694
1695       implicit none
1696       include 'global.h'
1697       include 'mpinpb.h'
1698       integer i, d1, d2, d3
1699       double complex u1(d1, d2, d3)
1700       integer j, q,r,s, ierr
1701       double complex chk,allchk
1702       chk = (0.0,0.0)
1703
1704       do j=1,1024
1705          q = mod(j, nx)+1
1706          if (q .ge. xstart(1) .and. q .le. xend(1)) then
1707             r = mod(3*j,ny)+1
1708             if (r .ge. ystart(1) .and. r .le. yend(1)) then
1709                s = mod(5*j,nz)+1
1710                if (s .ge. zstart(1) .and. s .le. zend(1)) then
1711                   chk=chk+u1(q-xstart(1)+1,r-ystart(1)+1,s-zstart(1)+1)
1712                end if
1713             end if
1714          end if
1715       end do
1716       chk = chk/ntotal_f
1717
1718       call MPI_Reduce(chk, allchk, 1, dc_type, MPI_SUM,
1719      >                0, MPI_COMM_WORLD, ierr)
1720       if (me .eq. 0) then
1721             write (*, 30) i, allchk
1722  30         format (' T =',I5,5X,'Checksum =',1P2D22.12)
1723       endif
1724
1725 c      sums(i) = allchk
1726 c     If we compute the checksum for diagnostic purposes, we let i be
1727 c     negative, so the result will not be stored in an array
1728       if (i .gt. 0) sums(i) = allchk
1729
1730       return
1731       end
1732
1733 c---------------------------------------------------------------------
1734 c---------------------------------------------------------------------
1735
1736       subroutine synchup
1737
1738 c---------------------------------------------------------------------
1739 c---------------------------------------------------------------------
1740
1741       implicit none
1742       include 'global.h'
1743       include 'mpinpb.h'
1744       integer ierr
1745       call timer_start(T_synch)
1746       call mpi_barrier(MPI_COMM_WORLD, ierr)
1747       call timer_stop(T_synch)
1748       return
1749       end
1750
1751
1752 c---------------------------------------------------------------------
1753 c---------------------------------------------------------------------
1754
1755       subroutine verify (d1, d2, d3, nt, verified, class)
1756
1757 c---------------------------------------------------------------------
1758 c---------------------------------------------------------------------
1759
1760       implicit none
1761       include 'global.h'
1762       include 'mpinpb.h'
1763       integer d1, d2, d3, nt
1764       character class
1765       logical verified
1766       integer ierr, size, i
1767       double precision err, epsilon
1768
1769 c---------------------------------------------------------------------
1770 c   Reference checksums
1771 c---------------------------------------------------------------------
1772       double complex csum_ref(25)
1773
1774
1775       class = 'U'
1776
1777       if (me .ne. 0) return
1778
1779       epsilon = 1.0d-12
1780       verified = .FALSE.
1781
1782       if (d1 .eq. 64 .and.
1783      >    d2 .eq. 64 .and.
1784      >    d3 .eq. 64 .and.
1785      >    nt .eq. 6) then
1786 c---------------------------------------------------------------------
1787 c   Sample size reference checksums
1788 c---------------------------------------------------------------------
1789          class = 'S'
1790          csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
1791          csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
1792          csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
1793          csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
1794          csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
1795          csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
1796
1797       else if (d1 .eq. 128 .and.
1798      >    d2 .eq. 128 .and.
1799      >    d3 .eq. 32 .and.
1800      >    nt .eq. 6) then
1801 c---------------------------------------------------------------------
1802 c   Class W size reference checksums
1803 c---------------------------------------------------------------------
1804          class = 'W'
1805          csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
1806          csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
1807          csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
1808          csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
1809          csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
1810          csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
1811
1812       else if (d1 .eq. 256 .and.
1813      >    d2 .eq. 256 .and.
1814      >    d3 .eq. 128 .and.
1815      >    nt .eq. 6) then
1816 c---------------------------------------------------------------------
1817 c   Class A size reference checksums
1818 c---------------------------------------------------------------------
1819          class = 'A'
1820          csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
1821          csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
1822          csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
1823          csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
1824          csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
1825          csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
1826
1827       else if (d1 .eq. 512 .and.
1828      >    d2 .eq. 256 .and.
1829      >    d3 .eq. 256 .and.
1830      >    nt .eq. 20) then
1831 c---------------------------------------------------------------------
1832 c   Class B size reference checksums
1833 c---------------------------------------------------------------------
1834          class = 'B'
1835          csum_ref(1)  = dcmplx(5.177643571579D+02, 5.077803458597D+02)
1836          csum_ref(2)  = dcmplx(5.154521291263D+02, 5.088249431599D+02)
1837          csum_ref(3)  = dcmplx(5.146409228649D+02, 5.096208912659D+02)
1838          csum_ref(4)  = dcmplx(5.142378756213D+02, 5.101023387619D+02)
1839          csum_ref(5)  = dcmplx(5.139626667737D+02, 5.103976610617D+02)
1840          csum_ref(6)  = dcmplx(5.137423460082D+02, 5.105948019802D+02)
1841          csum_ref(7)  = dcmplx(5.135547056878D+02, 5.107404165783D+02)
1842          csum_ref(8)  = dcmplx(5.133910925466D+02, 5.108576573661D+02)
1843          csum_ref(9)  = dcmplx(5.132470705390D+02, 5.109577278523D+02)
1844          csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
1845          csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
1846          csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
1847          csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
1848          csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
1849          csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
1850          csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
1851          csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
1852          csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
1853          csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
1854          csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
1855
1856       else if (d1 .eq. 512 .and.
1857      >    d2 .eq. 512 .and.
1858      >    d3 .eq. 512 .and.
1859      >    nt .eq. 20) then
1860 c---------------------------------------------------------------------
1861 c   Class C size reference checksums
1862 c---------------------------------------------------------------------
1863          class = 'C'
1864          csum_ref(1)  = dcmplx(5.195078707457D+02, 5.149019699238D+02)
1865          csum_ref(2)  = dcmplx(5.155422171134D+02, 5.127578201997D+02)
1866          csum_ref(3)  = dcmplx(5.144678022222D+02, 5.122251847514D+02)
1867          csum_ref(4)  = dcmplx(5.140150594328D+02, 5.121090289018D+02)
1868          csum_ref(5)  = dcmplx(5.137550426810D+02, 5.121143685824D+02)
1869          csum_ref(6)  = dcmplx(5.135811056728D+02, 5.121496764568D+02)
1870          csum_ref(7)  = dcmplx(5.134569343165D+02, 5.121870921893D+02)
1871          csum_ref(8)  = dcmplx(5.133651975661D+02, 5.122193250322D+02)
1872          csum_ref(9)  = dcmplx(5.132955192805D+02, 5.122454735794D+02)
1873          csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
1874          csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
1875          csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
1876          csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
1877          csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
1878          csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
1879          csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
1880          csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
1881          csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
1882          csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
1883          csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
1884
1885       else if (d1 .eq. 2048 .and.
1886      >    d2 .eq. 1024 .and.
1887      >    d3 .eq. 1024 .and.
1888      >    nt .eq. 25) then
1889 c---------------------------------------------------------------------
1890 c   Class D size reference checksums
1891 c---------------------------------------------------------------------
1892          class = 'D'
1893          csum_ref(1)  = dcmplx(5.122230065252D+02, 5.118534037109D+02)
1894          csum_ref(2)  = dcmplx(5.120463975765D+02, 5.117061181082D+02)
1895          csum_ref(3)  = dcmplx(5.119865766760D+02, 5.117096364601D+02)
1896          csum_ref(4)  = dcmplx(5.119518799488D+02, 5.117373863950D+02)
1897          csum_ref(5)  = dcmplx(5.119269088223D+02, 5.117680347632D+02)
1898          csum_ref(6)  = dcmplx(5.119082416858D+02, 5.117967875532D+02)
1899          csum_ref(7)  = dcmplx(5.118943814638D+02, 5.118225281841D+02)
1900          csum_ref(8)  = dcmplx(5.118842385057D+02, 5.118451629348D+02)
1901          csum_ref(9)  = dcmplx(5.118769435632D+02, 5.118649119387D+02)
1902          csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
1903          csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
1904          csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
1905          csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
1906          csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
1907          csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
1908          csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
1909          csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
1910          csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
1911          csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
1912          csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
1913          csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
1914          csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
1915          csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
1916          csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
1917          csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
1918
1919       else if (d1 .eq. 4096 .and.
1920      >    d2 .eq. 2048 .and.
1921      >    d3 .eq. 2048 .and.
1922      >    nt .eq. 25) then
1923 c---------------------------------------------------------------------
1924 c   Class E size reference checksums
1925 c---------------------------------------------------------------------
1926          class = 'E'
1927          csum_ref(1)  = dcmplx(5.121601045346D+02, 5.117395998266D+02)
1928          csum_ref(2)  = dcmplx(5.120905403678D+02, 5.118614716182D+02)
1929          csum_ref(3)  = dcmplx(5.120623229306D+02, 5.119074203747D+02)
1930          csum_ref(4)  = dcmplx(5.120438418997D+02, 5.119345900733D+02)
1931          csum_ref(5)  = dcmplx(5.120311521872D+02, 5.119551325550D+02)
1932          csum_ref(6)  = dcmplx(5.120226088809D+02, 5.119720179919D+02)
1933          csum_ref(7)  = dcmplx(5.120169296534D+02, 5.119861371665D+02)
1934          csum_ref(8)  = dcmplx(5.120131225172D+02, 5.119979364402D+02)
1935          csum_ref(9)  = dcmplx(5.120104767108D+02, 5.120077674092D+02)
1936          csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
1937          csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
1938          csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
1939          csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
1940          csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
1941          csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
1942          csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
1943          csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
1944          csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
1945          csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
1946          csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
1947          csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
1948          csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
1949          csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
1950          csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
1951          csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
1952
1953       endif
1954
1955
1956       if (class .ne. 'U') then
1957
1958          do i = 1, nt
1959             err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
1960             if (.not.(err .le. epsilon)) goto 100
1961          end do
1962          verified = .TRUE.
1963  100     continue
1964
1965       endif
1966
1967       call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
1968       if (size .ne. np) then
1969          write(*, 4010) np
1970          write(*, 4011)
1971          write(*, 4012)
1972 c---------------------------------------------------------------------
1973 c multiple statements because some Fortran compilers have
1974 c problems with long strings.
1975 c---------------------------------------------------------------------
1976  4010    format( ' Warning: benchmark was compiled for ', i5,
1977      >           'processors')
1978  4011    format( ' Must be run on this many processors for official',
1979      >           ' verification')
1980  4012    format( ' so memory access is repeatable')
1981          verified = .false.
1982       endif
1983
1984       if (class .ne. 'U') then
1985          if (verified) then
1986             write(*,2000)
1987  2000       format(' Result verification successful')
1988          else
1989             write(*,2001)
1990  2001       format(' Result verification failed')
1991          endif
1992       endif
1993       print *, 'class = ', class
1994
1995       return
1996       end
1997
1998