examples/smpi/NAS/MG/mg.f

   1 !-------------------------------------------------------------------------!
   2 !                                                                         !
   3 !        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
   4 !                                                                         !
   5 !                                   M G                                   !
   6 !                                                                         !
   7 !-------------------------------------------------------------------------!
   8 !                                                                         !
   9 !    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
  10 !    It is described in NAS Technical Reports 95-020 and 02-007           !
  11 !                                                                         !
  12 !    Permission to use, copy, distribute and modify this software         !
  13 !    for any purpose with or without fee is hereby granted.  We           !
  14 !    request, however, that all derived work reference the NAS            !
  15 !    Parallel Benchmarks 3.3. This software is provided "as is"           !
  16 !    without express or implied warranty.                                 !
  17 !                                                                         !
  18 !    Information on NPB 3.3, including the technical report, the          !
  19 !    original specifications, source code, results and information        !
  20 !    on how to submit new results, is available at:                       !
  21 !                                                                         !
  22 !           http://www.nas.nasa.gov/Software/NPB/                         !
  23 !                                                                         !
  24 !    Send comments or suggestions to  npb@nas.nasa.gov                    !
  25 !                                                                         !
  26 !          NAS Parallel Benchmarks Group                                  !
  27 !          NASA Ames Research Center                                      !
  28 !          Mail Stop: T27A-1                                              !
  29 !          Moffett Field, CA   94035-1000                                 !
  30 !                                                                         !
  31 !          E-mail:  npb@nas.nasa.gov                                      !
  32 !          Fax:     (650) 604-3957                                        !
  33 !                                                                         !
  34 !-------------------------------------------------------------------------!
  35
  36
  37 c---------------------------------------------------------------------
  38 c
  39 c Authors: E. Barszcz
  40 c          P. Frederickson
  41 c          A. Woo
  42 c          M. Yarrow
  43 c          R. F. Van der Wijngaart
  44 c
  45 c---------------------------------------------------------------------
  46
  47
  48 c---------------------------------------------------------------------
  49       program mg_mpi
  50 c---------------------------------------------------------------------
  51
  52       implicit none
  53
  54       include 'mpinpb.h'
  55       include 'globals.h'
  56
  57 c---------------------------------------------------------------------------c
  58 c k is the current level. It is passed down through subroutine args
  59 c and is NOT global. it is the current iteration
  60 c---------------------------------------------------------------------------c
  61
  62       integer k, it
  63
  64       external timer_read
  65       double precision t, t0, tinit, mflops, timer_read
  66
  67 c---------------------------------------------------------------------------c
  68 c These arrays are in common because they are quite large
  69 c and probably shouldn't be allocated on the stack. They
  70 c are always passed as subroutine args.
  71 c---------------------------------------------------------------------------c
  72
  73       double precision u(nr),v(nv),r(nr),a(0:3),c(0:3)
  74       common /noautom/ u,v,r
  75
  76       double precision rnm2, rnmu, old2, oldu, epsilon
  77       integer n1, n2, n3, nit
  78       double precision nn, verify_value, err
  79       logical verified
  80
  81       integer ierr,i, fstatus
  82       integer T_bench, T_init
  83       parameter (T_bench=1, T_init=2)
  84
  85       call mpi_init(ierr)
  86       call mpi_comm_rank(mpi_comm_world, me, ierr)
  87       call mpi_comm_size(mpi_comm_world, nprocs, ierr)
  88
  89       root = 0
  90       if (nprocs_compiled .gt. maxprocs) then
  91          if (me .eq. root) write(*,20) nprocs_compiled, maxprocs
  92  20      format(' ERROR: compiled for ',i8,' processes'//
  93      &          ' The maximum size allowed for this benchmark is ',i6)
  94          call mpi_abort(MPI_COMM_WORLD, ierr)
  95          stop
  96       endif
  97
  98       if (.not. convertdouble) then
  99          dp_type = MPI_DOUBLE_PRECISION
 100       else
 101          dp_type = MPI_REAL
 102       endif
 103
 104
 105       call timer_clear(T_bench)
 106       call timer_clear(T_init)
 107
 108       call mpi_barrier(MPI_COMM_WORLD, ierr)
 109
 110       call timer_start(T_init)
 111
 112
 113 c---------------------------------------------------------------------
 114 c Read in and broadcast input data
 115 c---------------------------------------------------------------------
 116
 117       if( me .eq. root )then
 118          write (*, 1000)
 119
 120          open(unit=7,file="mg.input", status="old", iostat=fstatus)
 121          if (fstatus .eq. 0) then
 122             write(*,50)
 123  50         format(' Reading from input file mg.input')
 124             read(7,*) lt
 125             read(7,*) nx(lt), ny(lt), nz(lt)
 126             read(7,*) nit
 127             read(7,*) (debug_vec(i),i=0,7)
 128          else
 129             write(*,51)
 130  51         format(' No input file. Using compiled defaults ')
 131             lt = lt_default
 132             nit = nit_default
 133             nx(lt) = nx_default
 134             ny(lt) = ny_default
 135             nz(lt) = nz_default
 136             do i = 0,7
 137                debug_vec(i) = debug_default
 138             end do
 139          endif
 140       endif
 141
 142       call mpi_bcast(lt, 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
 143       call mpi_bcast(nit, 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
 144       call mpi_bcast(nx(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
 145       call mpi_bcast(ny(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
 146       call mpi_bcast(nz(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
 147       call mpi_bcast(debug_vec(0), 8, MPI_INTEGER, 0,
 148      >               mpi_comm_world, ierr)
 149
 150       if ( (nx(lt) .ne. ny(lt)) .or. (nx(lt) .ne. nz(lt)) ) then
 151          Class = 'U'
 152       else if( nx(lt) .eq. 32 .and. nit .eq. 4 ) then
 153          Class = 'S'
 154       else if( nx(lt) .eq. 128 .and. nit .eq. 4 ) then
 155          Class = 'W'
 156       else if( nx(lt) .eq. 256 .and. nit .eq. 4 ) then
 157          Class = 'A'
 158       else if( nx(lt) .eq. 256 .and. nit .eq. 20 ) then
 159          Class = 'B'
 160       else if( nx(lt) .eq. 512 .and. nit .eq. 20 ) then
 161          Class = 'C'
 162       else if( nx(lt) .eq. 1024 .and. nit .eq. 50 ) then
 163          Class = 'D'
 164       else if( nx(lt) .eq. 2048 .and. nit .eq. 50 ) then
 165          Class = 'E'
 166       else
 167          Class = 'U'
 168       endif
 169
 170 c---------------------------------------------------------------------
 171 c  Use these for debug info:
 172 c---------------------------------------------------------------------
 173 c     debug_vec(0) = 1 !=> report all norms
 174 c     debug_vec(1) = 1 !=> some setup information
 175 c     debug_vec(1) = 2 !=> more setup information
 176 c     debug_vec(2) = k => at level k or below, show result of resid
 177 c     debug_vec(3) = k => at level k or below, show result of psinv
 178 c     debug_vec(4) = k => at level k or below, show result of rprj
 179 c     debug_vec(5) = k => at level k or below, show result of interp
 180 c     debug_vec(6) = 1 => (unused)
 181 c     debug_vec(7) = 1 => (unused)
 182 c---------------------------------------------------------------------
 183       a(0) = -8.0D0/3.0D0
 184       a(1) =  0.0D0
 185       a(2) =  1.0D0/6.0D0
 186       a(3) =  1.0D0/12.0D0
 187
 188       if(Class .eq. 'A' .or. Class .eq. 'S'.or. Class .eq.'W') then
 189 c---------------------------------------------------------------------
 190 c     Coefficients for the S(a) smoother
 191 c---------------------------------------------------------------------
 192          c(0) =  -3.0D0/8.0D0
 193          c(1) =  +1.0D0/32.0D0
 194          c(2) =  -1.0D0/64.0D0
 195          c(3) =   0.0D0
 196       else
 197 c---------------------------------------------------------------------
 198 c     Coefficients for the S(b) smoother
 199 c---------------------------------------------------------------------
 200          c(0) =  -3.0D0/17.0D0
 201          c(1) =  +1.0D0/33.0D0
 202          c(2) =  -1.0D0/61.0D0
 203          c(3) =   0.0D0
 204       endif
 205       lb = 1
 206       k  = lt
 207
 208       call setup(n1,n2,n3,k)
 209       call zero3(u,n1,n2,n3)
 210       call zran3(v,n1,n2,n3,nx(lt),ny(lt),k)
 211
 212       call norm2u3(v,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
 213
 214       if( me .eq. root )then
 215          write (*, 1001) nx(lt),ny(lt),nz(lt), Class
 216          write (*, 1002) nit
 217
 218  1000 format(//,' NAS Parallel Benchmarks 3.3 -- MG Benchmark', /)
 219  1001    format(' Size: ', i4, 'x', i4, 'x', i4, '  (class ', A, ')' )
 220  1002    format(' Iterations: ', i4)
 221  1003    format(' Number of processes: ', i6)
 222          if (nprocs .ne. nprocs_compiled) then
 223            write (*, 1004) nprocs_compiled
 224            write (*, 1005) nprocs
 225  1004      format(' WARNING: compiled for ', i6, ' processes ')
 226  1005      format(' Number of active processes: ', i6, /)
 227          else
 228            write (*, 1003) nprocs
 229          endif
 230       endif
 231
 232       call resid(u,v,r,n1,n2,n3,a,k)
 233       call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
 234       old2 = rnm2
 235       oldu = rnmu
 236
 237 c---------------------------------------------------------------------
 238 c     One iteration for startup
 239 c---------------------------------------------------------------------
 240       call mg3P(u,v,r,a,c,n1,n2,n3,k)
 241       call resid(u,v,r,n1,n2,n3,a,k)
 242       call setup(n1,n2,n3,k)
 243       call zero3(u,n1,n2,n3)
 244       call zran3(v,n1,n2,n3,nx(lt),ny(lt),k)
 245
 246       call timer_stop(T_init)
 247       if( me .eq. root )then
 248          tinit = timer_read(T_init)
 249          write( *,'(/A,F15.3,A/)' )
 250      >        ' Initialization time: ',tinit, ' seconds'
 251       endif
 252
 253       call mpi_barrier(mpi_comm_world,ierr)
 254
 255       call timer_start(T_bench)
 256
 257       call resid(u,v,r,n1,n2,n3,a,k)
 258       call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
 259       old2 = rnm2
 260       oldu = rnmu
 261
 262       do  it=1,nit
 263          if (it.eq.1 .or. it.eq.nit .or. mod(it,5).eq.0) then
 264             if (me .eq. root) write(*,80) it
 265    80       format('  iter ',i4)
 266          endif
 267          call mg3P(u,v,r,a,c,n1,n2,n3,k)
 268          call resid(u,v,r,n1,n2,n3,a,k)
 269       enddo
 270
 271
 272       call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
 273
 274       call timer_stop(T_bench)
 275
 276       t0 = timer_read(T_bench)
 277
 278       call mpi_reduce(t0,t,1,dp_type,
 279      >     mpi_max,root,mpi_comm_world,ierr)
 280       verified = .FALSE.
 281       verify_value = 0.0
 282       if( me .eq. root )then
 283          write(*,100)
 284  100     format(/' Benchmark completed ')
 285
 286          epsilon = 1.d-8
 287          if (Class .ne. 'U') then
 288             if(Class.eq.'S') then
 289                verify_value = 0.5307707005734d-04
 290             elseif(Class.eq.'W') then
 291                verify_value = 0.6467329375339d-05
 292             elseif(Class.eq.'A') then
 293                verify_value = 0.2433365309069d-05
 294             elseif(Class.eq.'B') then
 295                verify_value = 0.1800564401355d-05
 296             elseif(Class.eq.'C') then
 297                verify_value = 0.5706732285740d-06
 298             elseif(Class.eq.'D') then
 299                verify_value = 0.1583275060440d-09
 300             elseif(Class.eq.'E') then
 301                verify_value = 0.5630442584711d-10
 302             endif
 303
 304             err = abs( rnm2 - verify_value ) / verify_value
 305             if( err .le. epsilon ) then
 306                verified = .TRUE.
 307                write(*, 200)
 308                write(*, 201) rnm2
 309                write(*, 202) err
 310  200           format(' VERIFICATION SUCCESSFUL ')
 311  201           format(' L2 Norm is ', E20.13)
 312  202           format(' Error is   ', E20.13)
 313             else
 314                verified = .FALSE.
 315                write(*, 300)
 316                write(*, 301) rnm2
 317                write(*, 302) verify_value
 318  300           format(' VERIFICATION FAILED')
 319  301           format(' L2 Norm is             ', E20.13)
 320  302           format(' The correct L2 Norm is ', E20.13)
 321             endif
 322          else
 323             verified = .FALSE.
 324             write (*, 400)
 325             write (*, 401)
 326             write (*, 201) rnm2
 327  400        format(' Problem size unknown')
 328  401        format(' NO VERIFICATION PERFORMED')
 329          endif
 330
 331          nn = 1.0d0*nx(lt)*ny(lt)*nz(lt)
 332
 333          if( t .ne. 0. ) then
 334             mflops = 58.*1.0D-6*nit*nn / t
 335          else
 336             mflops = 0.0
 337          endif
 338
 339          call print_results('MG', class, nx(lt), ny(lt), nz(lt),
 340      >                      nit, nprocs_compiled, nprocs, t,
 341      >                      mflops, '          floating point',
 342      >                      verified, npbversion, compiletime,
 343      >                      cs1, cs2, cs3, cs4, cs5, cs6, cs7)
 344
 345
 346       endif
 347
 348
 349       call mpi_finalize(ierr)
 350       end
 351
 352 c---------------------------------------------------------------------
 353 c---------------------------------------------------------------------
 354
 355       subroutine setup(n1,n2,n3,k)
 356
 357 c---------------------------------------------------------------------
 358 c---------------------------------------------------------------------
 359
 360       implicit none
 361
 362       include 'mpinpb.h'
 363       include 'globals.h'
 364
 365       integer  is1, is2, is3, ie1, ie2, ie3
 366       common /grid/ is1,is2,is3,ie1,ie2,ie3
 367
 368       integer n1,n2,n3,k
 369       integer dx, dy, log_p, d, i, j
 370
 371       integer ax, next(3),mi(3,maxlevel),mip(3,maxlevel)
 372       integer ng(3,maxlevel)
 373       integer idi(3), pi(3), idin(3,-1:1)
 374       integer s, dir,ierr
 375
 376       do  j=-1,1,1
 377          do  d=1,3
 378             msg_type(d,j) = 100*(j+2+10*d)
 379          enddo
 380       enddo
 381
 382       ng(1,lt) = nx(lt)
 383       ng(2,lt) = ny(lt)
 384       ng(3,lt) = nz(lt)
 385       do  ax=1,3
 386          next(ax) = 1
 387          do  k=lt-1,1,-1
 388             ng(ax,k) = ng(ax,k+1)/2
 389          enddo
 390       enddo
 391  61   format(10i4)
 392       do  k=lt,1,-1
 393          nx(k) = ng(1,k)
 394          ny(k) = ng(2,k)
 395          nz(k) = ng(3,k)
 396       enddo
 397
 398       log_p  = log(float(nprocs)+0.0001)/log(2.0)
 399       dx     = log_p/3
 400       pi(1)  = 2**dx
 401       idi(1) = mod(me,pi(1))
 402
 403       dy     = (log_p-dx)/2
 404       pi(2)  = 2**dy
 405       idi(2) = mod((me/pi(1)),pi(2))
 406
 407       pi(3)  = nprocs/(pi(1)*pi(2))
 408       idi(3) = me/(pi(1)*pi(2))
 409
 410       do  k = lt,1,-1
 411          dead(k) = .false.
 412          do  ax = 1,3
 413             take_ex(ax,k) = .false.
 414             give_ex(ax,k) = .false.
 415
 416             mi(ax,k) = 2 +
 417      >           ((idi(ax)+1)*ng(ax,k))/pi(ax) -
 418      >           ((idi(ax)+0)*ng(ax,k))/pi(ax)
 419             mip(ax,k) = 2 +
 420      >           ((next(ax)+idi(ax)+1)*ng(ax,k))/pi(ax) -
 421      >           ((next(ax)+idi(ax)+0)*ng(ax,k))/pi(ax)
 422
 423             if(mip(ax,k).eq.2.or.mi(ax,k).eq.2)then
 424                next(ax) = 2*next(ax)
 425             endif
 426
 427             if( k+1 .le. lt )then
 428                if((mip(ax,k).eq.2).and.(mi(ax,k).eq.3))then
 429                   give_ex(ax,k+1) = .true.
 430                endif
 431                if((mip(ax,k).eq.3).and.(mi(ax,k).eq.2))then
 432                   take_ex(ax,k+1) = .true.
 433                endif
 434             endif
 435          enddo
 436
 437          if( mi(1,k).eq.2 .or.
 438      >        mi(2,k).eq.2 .or.
 439      >        mi(3,k).eq.2      )then
 440             dead(k) = .true.
 441          endif
 442          m1(k) = mi(1,k)
 443          m2(k) = mi(2,k)
 444          m3(k) = mi(3,k)
 445
 446          do  ax=1,3
 447             idin(ax,+1) = mod( idi(ax) + next(ax) + pi(ax) , pi(ax) )
 448             idin(ax,-1) = mod( idi(ax) - next(ax) + pi(ax) , pi(ax) )
 449          enddo
 450          do  dir = 1,-1,-2
 451             nbr(1,dir,k) = idin(1,dir) + pi(1)
 452      >           *(idi(2)      + pi(2)
 453      >           * idi(3))
 454             nbr(2,dir,k) = idi(1)      + pi(1)
 455      >           *(idin(2,dir) + pi(2)
 456      >           * idi(3))
 457             nbr(3,dir,k) = idi(1)      + pi(1)
 458      >           *(idi(2)      + pi(2)
 459      >           * idin(3,dir))
 460          enddo
 461       enddo
 462
 463       k = lt
 464       is1 = 2 + ng(1,k) - ((pi(1)  -idi(1))*ng(1,lt))/pi(1)
 465       ie1 = 1 + ng(1,k) - ((pi(1)-1-idi(1))*ng(1,lt))/pi(1)
 466       n1 = 3 + ie1 - is1
 467       is2 = 2 + ng(2,k) - ((pi(2)  -idi(2))*ng(2,lt))/pi(2)
 468       ie2 = 1 + ng(2,k) - ((pi(2)-1-idi(2))*ng(2,lt))/pi(2)
 469       n2 = 3 + ie2 - is2
 470       is3 = 2 + ng(3,k) - ((pi(3)  -idi(3))*ng(3,lt))/pi(3)
 471       ie3 = 1 + ng(3,k) - ((pi(3)-1-idi(3))*ng(3,lt))/pi(3)
 472       n3 = 3 + ie3 - is3
 473
 474
 475       ir(lt)=1
 476       do  j = lt-1, 1, -1
 477          ir(j)=ir(j+1)+m1(j+1)*m2(j+1)*m3(j+1)
 478       enddo
 479
 480
 481       if( debug_vec(1) .ge. 1 )then
 482          if( me .eq. root )write(*,*)' in setup, '
 483          if( me .eq. root )write(*,*)' me   k  lt  nx  ny  nz ',
 484      >        ' n1  n2  n3 is1 is2 is3 ie1 ie2 ie3'
 485          do  i=0,nprocs-1
 486             if( me .eq. i )then
 487                write(*,9) me,k,lt,ng(1,k),ng(2,k),ng(3,k),
 488      >              n1,n2,n3,is1,is2,is3,ie1,ie2,ie3
 489  9             format(15i4)
 490             endif
 491             call mpi_barrier(mpi_comm_world,ierr)
 492          enddo
 493       endif
 494       if( debug_vec(1) .ge. 2 )then
 495          do  i=0,nprocs-1
 496             if( me .eq. i )then
 497                write(*,*)' '
 498                write(*,*)' processor =',me
 499                do  k=lt,1,-1
 500                   write(*,7)k,idi(1),idi(2),idi(3),
 501      >                 ((nbr(d,j,k),j=-1,1,2),d=1,3),
 502      >                 (mi(d,k),d=1,3)
 503                enddo
 504  7             format(i4,'idi=',3i4,'nbr=',3(2i4,'  '),'mi=',3i4,' ')
 505                write(*,*)'idi(s) = ',(idi(s),s=1,3)
 506                write(*,*)'dead(2), dead(1) = ',dead(2),dead(1)
 507                do  ax=1,3
 508                   write(*,*)'give_ex(ax,2)= ',give_ex(ax,2)
 509                   write(*,*)'take_ex(ax,2)= ',take_ex(ax,2)
 510                enddo
 511             endif
 512             call mpi_barrier(mpi_comm_world,ierr)
 513          enddo
 514       endif
 515
 516       k = lt
 517
 518       return
 519       end
 520
 521 c---------------------------------------------------------------------
 522 c---------------------------------------------------------------------
 523
 524       subroutine mg3P(u,v,r,a,c,n1,n2,n3,k)
 525
 526 c---------------------------------------------------------------------
 527 c---------------------------------------------------------------------
 528
 529 c---------------------------------------------------------------------
 530 c     multigrid V-cycle routine
 531 c---------------------------------------------------------------------
 532       implicit none
 533
 534       include 'mpinpb.h'
 535       include 'globals.h'
 536
 537       integer n1, n2, n3, k
 538       double precision u(nr),v(nv),r(nr)
 539       double precision a(0:3),c(0:3)
 540
 541       integer j
 542
 543 c---------------------------------------------------------------------
 544 c     down cycle.
 545 c     restrict the residual from the find grid to the coarse
 546 c---------------------------------------------------------------------
 547
 548       do  k= lt, lb+1 , -1
 549          j = k-1
 550          call rprj3(r(ir(k)),m1(k),m2(k),m3(k),
 551      >        r(ir(j)),m1(j),m2(j),m3(j),k)
 552       enddo
 553
 554       k = lb
 555 c---------------------------------------------------------------------
 556 c     compute an approximate solution on the coarsest grid
 557 c---------------------------------------------------------------------
 558       call zero3(u(ir(k)),m1(k),m2(k),m3(k))
 559       call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k)
 560
 561       do  k = lb+1, lt-1
 562           j = k-1
 563 c---------------------------------------------------------------------
 564 c        prolongate from level k-1  to k
 565 c---------------------------------------------------------------------
 566          call zero3(u(ir(k)),m1(k),m2(k),m3(k))
 567          call interp(u(ir(j)),m1(j),m2(j),m3(j),
 568      >               u(ir(k)),m1(k),m2(k),m3(k),k)
 569 c---------------------------------------------------------------------
 570 c        compute residual for level k
 571 c---------------------------------------------------------------------
 572          call resid(u(ir(k)),r(ir(k)),r(ir(k)),m1(k),m2(k),m3(k),a,k)
 573 c---------------------------------------------------------------------
 574 c        apply smoother
 575 c---------------------------------------------------------------------
 576          call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k)
 577       enddo
 578  200  continue
 579       j = lt - 1
 580       k = lt
 581       call interp(u(ir(j)),m1(j),m2(j),m3(j),u,n1,n2,n3,k)
 582       call resid(u,v,r,n1,n2,n3,a,k)
 583       call psinv(r,u,n1,n2,n3,c,k)
 584
 585       return
 586       end
 587
 588 c---------------------------------------------------------------------
 589 c---------------------------------------------------------------------
 590
 591       subroutine psinv( r,u,n1,n2,n3,c,k)
 592
 593 c---------------------------------------------------------------------
 594 c---------------------------------------------------------------------
 595
 596 c---------------------------------------------------------------------
 597 c     psinv applies an approximate inverse as smoother:  u = u + Cr
 598 c
 599 c     This  implementation costs  15A + 4M per result, where
 600 c     A and M denote the costs of Addition and Multiplication.
 601 c     Presuming coefficient c(3) is zero (the NPB assumes this,
 602 c     but it is thus not a general case), 2A + 1M may be eliminated,
 603 c     resulting in 13A + 3M.
 604 c     Note that this vectorizes, and is also fine for cache
 605 c     based machines.
 606 c---------------------------------------------------------------------
 607       implicit none
 608
 609       include 'globals.h'
 610
 611       integer n1,n2,n3,k
 612       double precision u(n1,n2,n3),r(n1,n2,n3),c(0:3)
 613       integer i3, i2, i1
 614
 615       double precision r1(m), r2(m)
 616
 617       do i3=2,n3-1
 618          do i2=2,n2-1
 619             do i1=1,n1
 620                r1(i1) = r(i1,i2-1,i3) + r(i1,i2+1,i3)
 621      >                + r(i1,i2,i3-1) + r(i1,i2,i3+1)
 622                r2(i1) = r(i1,i2-1,i3-1) + r(i1,i2+1,i3-1)
 623      >                + r(i1,i2-1,i3+1) + r(i1,i2+1,i3+1)
 624             enddo
 625             do i1=2,n1-1
 626                u(i1,i2,i3) = u(i1,i2,i3)
 627      >                     + c(0) * r(i1,i2,i3)
 628      >                     + c(1) * ( r(i1-1,i2,i3) + r(i1+1,i2,i3)
 629      >                              + r1(i1) )
 630      >                     + c(2) * ( r2(i1) + r1(i1-1) + r1(i1+1) )
 631 c---------------------------------------------------------------------
 632 c  Assume c(3) = 0    (Enable line below if c(3) not= 0)
 633 c---------------------------------------------------------------------
 634 c    >                     + c(3) * ( r2(i1-1) + r2(i1+1) )
 635 c---------------------------------------------------------------------
 636             enddo
 637          enddo
 638       enddo
 639
 640 c---------------------------------------------------------------------
 641 c     exchange boundary points
 642 c---------------------------------------------------------------------
 643       call comm3(u,n1,n2,n3,k)
 644
 645       if( debug_vec(0) .ge. 1 )then
 646          call rep_nrm(u,n1,n2,n3,'   psinv',k)
 647       endif
 648
 649       if( debug_vec(3) .ge. k )then
 650          call showall(u,n1,n2,n3)
 651       endif
 652
 653       return
 654       end
 655
 656 c---------------------------------------------------------------------
 657 c---------------------------------------------------------------------
 658
 659       subroutine resid( u,v,r,n1,n2,n3,a,k )
 660
 661 c---------------------------------------------------------------------
 662 c---------------------------------------------------------------------
 663
 664 c---------------------------------------------------------------------
 665 c     resid computes the residual:  r = v - Au
 666 c
 667 c     This  implementation costs  15A + 4M per result, where
 668 c     A and M denote the costs of Addition (or Subtraction) and
 669 c     Multiplication, respectively.
 670 c     Presuming coefficient a(1) is zero (the NPB assumes this,
 671 c     but it is thus not a general case), 3A + 1M may be eliminated,
 672 c     resulting in 12A + 3M.
 673 c     Note that this vectorizes, and is also fine for cache
 674 c     based machines.
 675 c---------------------------------------------------------------------
 676       implicit none
 677
 678       include 'globals.h'
 679
 680       integer n1,n2,n3,k
 681       double precision u(n1,n2,n3),v(n1,n2,n3),r(n1,n2,n3),a(0:3)
 682       integer i3, i2, i1
 683       double precision u1(m), u2(m)
 684
 685       do i3=2,n3-1
 686          do i2=2,n2-1
 687             do i1=1,n1
 688                u1(i1) = u(i1,i2-1,i3) + u(i1,i2+1,i3)
 689      >                + u(i1,i2,i3-1) + u(i1,i2,i3+1)
 690                u2(i1) = u(i1,i2-1,i3-1) + u(i1,i2+1,i3-1)
 691      >                + u(i1,i2-1,i3+1) + u(i1,i2+1,i3+1)
 692             enddo
 693             do i1=2,n1-1
 694                r(i1,i2,i3) = v(i1,i2,i3)
 695      >                     - a(0) * u(i1,i2,i3)
 696 c---------------------------------------------------------------------
 697 c  Assume a(1) = 0      (Enable 2 lines below if a(1) not= 0)
 698 c---------------------------------------------------------------------
 699 c    >                     - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3)
 700 c    >                              + u1(i1) )
 701 c---------------------------------------------------------------------
 702      >                     - a(2) * ( u2(i1) + u1(i1-1) + u1(i1+1) )
 703      >                     - a(3) * ( u2(i1-1) + u2(i1+1) )
 704             enddo
 705          enddo
 706       enddo
 707
 708 c---------------------------------------------------------------------
 709 c     exchange boundary data
 710 c---------------------------------------------------------------------
 711       call comm3(r,n1,n2,n3,k)
 712
 713       if( debug_vec(0) .ge. 1 )then
 714          call rep_nrm(r,n1,n2,n3,'   resid',k)
 715       endif
 716
 717       if( debug_vec(2) .ge. k )then
 718          call showall(r,n1,n2,n3)
 719       endif
 720
 721       return
 722       end
 723
 724 c---------------------------------------------------------------------
 725 c---------------------------------------------------------------------
 726
 727       subroutine rprj3( r,m1k,m2k,m3k,s,m1j,m2j,m3j,k )
 728
 729 c---------------------------------------------------------------------
 730 c---------------------------------------------------------------------
 731
 732 c---------------------------------------------------------------------
 733 c     rprj3 projects onto the next coarser grid,
 734 c     using a trilinear Finite Element projection:  s = r' = P r
 735 c
 736 c     This  implementation costs  20A + 4M per result, where
 737 c     A and M denote the costs of Addition and Multiplication.
 738 c     Note that this vectorizes, and is also fine for cache
 739 c     based machines.
 740 c---------------------------------------------------------------------
 741       implicit none
 742
 743       include 'mpinpb.h'
 744       include 'globals.h'
 745
 746       integer m1k, m2k, m3k, m1j, m2j, m3j,k
 747       double precision r(m1k,m2k,m3k), s(m1j,m2j,m3j)
 748       integer j3, j2, j1, i3, i2, i1, d1, d2, d3, j
 749
 750       double precision x1(m), y1(m), x2,y2
 751
 752
 753       if(m1k.eq.3)then
 754         d1 = 2
 755       else
 756         d1 = 1
 757       endif
 758
 759       if(m2k.eq.3)then
 760         d2 = 2
 761       else
 762         d2 = 1
 763       endif
 764
 765       if(m3k.eq.3)then
 766         d3 = 2
 767       else
 768         d3 = 1
 769       endif
 770
 771       do  j3=2,m3j-1
 772          i3 = 2*j3-d3
 773 C        i3 = 2*j3-1
 774          do  j2=2,m2j-1
 775             i2 = 2*j2-d2
 776 C           i2 = 2*j2-1
 777
 778             do j1=2,m1j
 779               i1 = 2*j1-d1
 780 C             i1 = 2*j1-1
 781               x1(i1-1) = r(i1-1,i2-1,i3  ) + r(i1-1,i2+1,i3  )
 782      >                 + r(i1-1,i2,  i3-1) + r(i1-1,i2,  i3+1)
 783               y1(i1-1) = r(i1-1,i2-1,i3-1) + r(i1-1,i2-1,i3+1)
 784      >                 + r(i1-1,i2+1,i3-1) + r(i1-1,i2+1,i3+1)
 785             enddo
 786
 787             do  j1=2,m1j-1
 788               i1 = 2*j1-d1
 789 C             i1 = 2*j1-1
 790               y2 = r(i1,  i2-1,i3-1) + r(i1,  i2-1,i3+1)
 791      >           + r(i1,  i2+1,i3-1) + r(i1,  i2+1,i3+1)
 792               x2 = r(i1,  i2-1,i3  ) + r(i1,  i2+1,i3  )
 793      >           + r(i1,  i2,  i3-1) + r(i1,  i2,  i3+1)
 794               s(j1,j2,j3) =
 795      >               0.5D0 * r(i1,i2,i3)
 796      >             + 0.25D0 * ( r(i1-1,i2,i3) + r(i1+1,i2,i3) + x2)
 797      >             + 0.125D0 * ( x1(i1-1) + x1(i1+1) + y2)
 798      >             + 0.0625D0 * ( y1(i1-1) + y1(i1+1) )
 799             enddo
 800
 801          enddo
 802       enddo
 803
 804
 805       j = k-1
 806       call comm3(s,m1j,m2j,m3j,j)
 807
 808       if( debug_vec(0) .ge. 1 )then
 809          call rep_nrm(s,m1j,m2j,m3j,'   rprj3',k-1)
 810       endif
 811
 812       if( debug_vec(4) .ge. k )then
 813          call showall(s,m1j,m2j,m3j)
 814       endif
 815
 816       return
 817       end
 818
 819 c---------------------------------------------------------------------
 820 c---------------------------------------------------------------------
 821
 822       subroutine interp( z,mm1,mm2,mm3,u,n1,n2,n3,k )
 823
 824 c---------------------------------------------------------------------
 825 c---------------------------------------------------------------------
 826
 827 c---------------------------------------------------------------------
 828 c     interp adds the trilinear interpolation of the correction
 829 c     from the coarser grid to the current approximation:  u = u + Qu'
 830 c
 831 c     Observe that this  implementation costs  16A + 4M, where
 832 c     A and M denote the costs of Addition and Multiplication.
 833 c     Note that this vectorizes, and is also fine for cache
 834 c     based machines.  Vector machines may get slightly better
 835 c     performance however, with 8 separate "do i1" loops, rather than 4.
 836 c---------------------------------------------------------------------
 837       implicit none
 838
 839       include 'mpinpb.h'
 840       include 'globals.h'
 841
 842       integer mm1, mm2, mm3, n1, n2, n3,k
 843       double precision z(mm1,mm2,mm3),u(n1,n2,n3)
 844       integer i3, i2, i1, d1, d2, d3, t1, t2, t3
 845
 846 c note that m = 1037 in globals.h but for this only need to be
 847 c 535 to handle up to 1024^3
 848 c      integer m
 849 c      parameter( m=535 )
 850       double precision z1(m),z2(m),z3(m)
 851
 852
 853       if( n1 .ne. 3 .and. n2 .ne. 3 .and. n3 .ne. 3 ) then
 854
 855          do  i3=1,mm3-1
 856             do  i2=1,mm2-1
 857
 858                do i1=1,mm1
 859                   z1(i1) = z(i1,i2+1,i3) + z(i1,i2,i3)
 860                   z2(i1) = z(i1,i2,i3+1) + z(i1,i2,i3)
 861                   z3(i1) = z(i1,i2+1,i3+1) + z(i1,i2,i3+1) + z1(i1)
 862                enddo
 863
 864                do  i1=1,mm1-1
 865                   u(2*i1-1,2*i2-1,2*i3-1)=u(2*i1-1,2*i2-1,2*i3-1)
 866      >                 +z(i1,i2,i3)
 867                   u(2*i1,2*i2-1,2*i3-1)=u(2*i1,2*i2-1,2*i3-1)
 868      >                 +0.5d0*(z(i1+1,i2,i3)+z(i1,i2,i3))
 869                enddo
 870                do i1=1,mm1-1
 871                   u(2*i1-1,2*i2,2*i3-1)=u(2*i1-1,2*i2,2*i3-1)
 872      >                 +0.5d0 * z1(i1)
 873                   u(2*i1,2*i2,2*i3-1)=u(2*i1,2*i2,2*i3-1)
 874      >                 +0.25d0*( z1(i1) + z1(i1+1) )
 875                enddo
 876                do i1=1,mm1-1
 877                   u(2*i1-1,2*i2-1,2*i3)=u(2*i1-1,2*i2-1,2*i3)
 878      >                 +0.5d0 * z2(i1)
 879                   u(2*i1,2*i2-1,2*i3)=u(2*i1,2*i2-1,2*i3)
 880      >                 +0.25d0*( z2(i1) + z2(i1+1) )
 881                enddo
 882                do i1=1,mm1-1
 883                   u(2*i1-1,2*i2,2*i3)=u(2*i1-1,2*i2,2*i3)
 884      >                 +0.25d0* z3(i1)
 885                   u(2*i1,2*i2,2*i3)=u(2*i1,2*i2,2*i3)
 886      >                 +0.125d0*( z3(i1) + z3(i1+1) )
 887                enddo
 888             enddo
 889          enddo
 890
 891       else
 892
 893          if(n1.eq.3)then
 894             d1 = 2
 895             t1 = 1
 896          else
 897             d1 = 1
 898             t1 = 0
 899          endif
 900
 901          if(n2.eq.3)then
 902             d2 = 2
 903             t2 = 1
 904          else
 905             d2 = 1
 906             t2 = 0
 907          endif
 908
 909          if(n3.eq.3)then
 910             d3 = 2
 911             t3 = 1
 912          else
 913             d3 = 1
 914             t3 = 0
 915          endif
 916
 917          do  i3=d3,mm3-1
 918             do  i2=d2,mm2-1
 919                do  i1=d1,mm1-1
 920                   u(2*i1-d1,2*i2-d2,2*i3-d3)=u(2*i1-d1,2*i2-d2,2*i3-d3)
 921      >                 +z(i1,i2,i3)
 922                enddo
 923                do  i1=1,mm1-1
 924                   u(2*i1-t1,2*i2-d2,2*i3-d3)=u(2*i1-t1,2*i2-d2,2*i3-d3)
 925      >                 +0.5D0*(z(i1+1,i2,i3)+z(i1,i2,i3))
 926                enddo
 927             enddo
 928             do  i2=1,mm2-1
 929                do  i1=d1,mm1-1
 930                   u(2*i1-d1,2*i2-t2,2*i3-d3)=u(2*i1-d1,2*i2-t2,2*i3-d3)
 931      >                 +0.5D0*(z(i1,i2+1,i3)+z(i1,i2,i3))
 932                enddo
 933                do  i1=1,mm1-1
 934                   u(2*i1-t1,2*i2-t2,2*i3-d3)=u(2*i1-t1,2*i2-t2,2*i3-d3)
 935      >                 +0.25D0*(z(i1+1,i2+1,i3)+z(i1+1,i2,i3)
 936      >                 +z(i1,  i2+1,i3)+z(i1,  i2,i3))
 937                enddo
 938             enddo
 939          enddo
 940
 941          do  i3=1,mm3-1
 942             do  i2=d2,mm2-1
 943                do  i1=d1,mm1-1
 944                   u(2*i1-d1,2*i2-d2,2*i3-t3)=u(2*i1-d1,2*i2-d2,2*i3-t3)
 945      >                 +0.5D0*(z(i1,i2,i3+1)+z(i1,i2,i3))
 946                enddo
 947                do  i1=1,mm1-1
 948                   u(2*i1-t1,2*i2-d2,2*i3-t3)=u(2*i1-t1,2*i2-d2,2*i3-t3)
 949      >                 +0.25D0*(z(i1+1,i2,i3+1)+z(i1,i2,i3+1)
 950      >                 +z(i1+1,i2,i3  )+z(i1,i2,i3  ))
 951                enddo
 952             enddo
 953             do  i2=1,mm2-1
 954                do  i1=d1,mm1-1
 955                   u(2*i1-d1,2*i2-t2,2*i3-t3)=u(2*i1-d1,2*i2-t2,2*i3-t3)
 956      >                 +0.25D0*(z(i1,i2+1,i3+1)+z(i1,i2,i3+1)
 957      >                 +z(i1,i2+1,i3  )+z(i1,i2,i3  ))
 958                enddo
 959                do  i1=1,mm1-1
 960                   u(2*i1-t1,2*i2-t2,2*i3-t3)=u(2*i1-t1,2*i2-t2,2*i3-t3)
 961      >                 +0.125D0*(z(i1+1,i2+1,i3+1)+z(i1+1,i2,i3+1)
 962      >                 +z(i1  ,i2+1,i3+1)+z(i1  ,i2,i3+1)
 963      >                 +z(i1+1,i2+1,i3  )+z(i1+1,i2,i3  )
 964      >                 +z(i1  ,i2+1,i3  )+z(i1  ,i2,i3  ))
 965                enddo
 966             enddo
 967          enddo
 968
 969       endif
 970
 971       call comm3_ex(u,n1,n2,n3,k)
 972
 973       if( debug_vec(0) .ge. 1 )then
 974          call rep_nrm(z,mm1,mm2,mm3,'z: inter',k-1)
 975          call rep_nrm(u,n1,n2,n3,'u: inter',k)
 976       endif
 977
 978       if( debug_vec(5) .ge. k )then
 979          call showall(z,mm1,mm2,mm3)
 980          call showall(u,n1,n2,n3)
 981       endif
 982
 983       return
 984       end
 985
 986 c---------------------------------------------------------------------
 987 c---------------------------------------------------------------------
 988
 989       subroutine norm2u3(r,n1,n2,n3,rnm2,rnmu,nx,ny,nz)
 990
 991 c---------------------------------------------------------------------
 992 c---------------------------------------------------------------------
 993
 994 c---------------------------------------------------------------------
 995 c     norm2u3 evaluates approximations to the L2 norm and the
 996 c     uniform (or L-infinity or Chebyshev) norm, under the
 997 c     assumption that the boundaries are periodic or zero.  Add the
 998 c     boundaries in with half weight (quarter weight on the edges
 999 c     and eighth weight at the corners) for inhomogeneous boundaries.
1000 c---------------------------------------------------------------------
1001       implicit none
1002
1003       include 'mpinpb.h'
1004
1005       integer n1, n2, n3, nx, ny, nz
1006       double precision rnm2, rnmu, r(n1,n2,n3)
1007       double precision s, a, ss
1008       integer i3, i2, i1, ierr
1009
1010       double precision dn
1011
1012       dn = 1.0d0*nx*ny*nz
1013
1014       s=0.0D0
1015       rnmu = 0.0D0
1016       do  i3=2,n3-1
1017          do  i2=2,n2-1
1018             do  i1=2,n1-1
1019                s=s+r(i1,i2,i3)**2
1020                a=abs(r(i1,i2,i3))
1021                if(a.gt.rnmu)rnmu=a
1022             enddo
1023          enddo
1024       enddo
1025
1026       call mpi_allreduce(rnmu,ss,1,dp_type,
1027      >     mpi_max,mpi_comm_world,ierr)
1028       rnmu = ss
1029       call mpi_allreduce(s, ss, 1, dp_type,
1030      >     mpi_sum,mpi_comm_world,ierr)
1031       s = ss
1032       rnm2=sqrt( s / dn )
1033
1034       return
1035       end
1036
1037 c---------------------------------------------------------------------
1038 c---------------------------------------------------------------------
1039
1040       subroutine rep_nrm(u,n1,n2,n3,title,kk)
1041
1042 c---------------------------------------------------------------------
1043 c---------------------------------------------------------------------
1044
1045 c---------------------------------------------------------------------
1046 c     report on norm
1047 c---------------------------------------------------------------------
1048       implicit none
1049
1050       include 'mpinpb.h'
1051       include 'globals.h'
1052
1053       integer n1, n2, n3, kk
1054       double precision u(n1,n2,n3)
1055       character*8 title
1056
1057       double precision rnm2, rnmu
1058
1059
1060       call norm2u3(u,n1,n2,n3,rnm2,rnmu,nx(kk),ny(kk),nz(kk))
1061       if( me .eq. root )then
1062          write(*,7)kk,title,rnm2,rnmu
1063  7       format(' Level',i2,' in ',a8,': norms =',D21.14,D21.14)
1064       endif
1065
1066       return
1067       end
1068
1069 c---------------------------------------------------------------------
1070 c---------------------------------------------------------------------
1071
1072       subroutine comm3(u,n1,n2,n3,kk)
1073
1074 c---------------------------------------------------------------------
1075 c---------------------------------------------------------------------
1076
1077 c---------------------------------------------------------------------
1078 c     comm3 organizes the communication on all borders
1079 c---------------------------------------------------------------------
1080       implicit none
1081
1082       include 'mpinpb.h'
1083       include 'globals.h'
1084
1085       integer n1, n2, n3, kk
1086       double precision u(n1,n2,n3)
1087       integer axis
1088
1089       if( .not. dead(kk) )then
1090          do  axis = 1, 3
1091             if( nprocs .ne. 1) then
1092
1093                call ready( axis, -1, kk )
1094                call ready( axis, +1, kk )
1095
1096                call give3( axis, +1, u, n1, n2, n3, kk )
1097                call give3( axis, -1, u, n1, n2, n3, kk )
1098
1099                call take3( axis, -1, u, n1, n2, n3 )
1100                call take3( axis, +1, u, n1, n2, n3 )
1101
1102             else
1103                call comm1p( axis, u, n1, n2, n3, kk )
1104             endif
1105          enddo
1106       else
1107          call zero3(u,n1,n2,n3)
1108       endif
1109       return
1110       end
1111
1112 c---------------------------------------------------------------------
1113 c---------------------------------------------------------------------
1114
1115       subroutine comm3_ex(u,n1,n2,n3,kk)
1116
1117 c---------------------------------------------------------------------
1118 c---------------------------------------------------------------------
1119
1120 c---------------------------------------------------------------------
1121 c     comm3_ex  communicates to expand the number of processors
1122 c---------------------------------------------------------------------
1123       implicit none
1124
1125       include 'mpinpb.h'
1126       include 'globals.h'
1127
1128       integer n1, n2, n3, kk
1129       double precision u(n1,n2,n3)
1130       integer axis
1131
1132       do  axis = 1, 3
1133          if( nprocs .ne. 1 ) then
1134             if( take_ex( axis, kk ) )then
1135                call ready( axis, -1, kk )
1136                call ready( axis, +1, kk )
1137                call take3_ex( axis, -1, u, n1, n2, n3 )
1138                call take3_ex( axis, +1, u, n1, n2, n3 )
1139             endif
1140
1141             if( give_ex( axis, kk ) )then
1142                call give3_ex( axis, +1, u, n1, n2, n3, kk )
1143                call give3_ex( axis, -1, u, n1, n2, n3, kk )
1144             endif
1145          else
1146             call comm1p_ex( axis, u, n1, n2, n3, kk )
1147          endif
1148       enddo
1149
1150       return
1151       end
1152
1153 c---------------------------------------------------------------------
1154 c---------------------------------------------------------------------
1155
1156       subroutine ready( axis, dir, k )
1157
1158 c---------------------------------------------------------------------
1159 c---------------------------------------------------------------------
1160
1161 c---------------------------------------------------------------------
1162 c     ready allocates a buffer to take in a message
1163 c---------------------------------------------------------------------
1164       implicit none
1165
1166       include 'mpinpb.h'
1167       include 'globals.h'
1168
1169       integer axis, dir, k
1170       integer buff_id,buff_len,i,ierr
1171
1172       buff_id = 3 + dir
1173       buff_len = nm2
1174
1175       do  i=1,nm2
1176          buff(i,buff_id) = 0.0D0
1177       enddo
1178
1179
1180 c---------------------------------------------------------------------
1181 c     fake message request type
1182 c---------------------------------------------------------------------
1183       msg_id(axis,dir,1) = msg_type(axis,dir) +1000*me
1184
1185       call mpi_irecv( buff(1,buff_id), buff_len,
1186      >     dp_type, nbr(axis,-dir,k), msg_type(axis,dir),
1187      >     mpi_comm_world, msg_id(axis,dir,1), ierr)
1188       return
1189       end
1190
1191
1192 c---------------------------------------------------------------------
1193 c---------------------------------------------------------------------
1194
1195       subroutine give3( axis, dir, u, n1, n2, n3, k )
1196
1197 c---------------------------------------------------------------------
1198 c---------------------------------------------------------------------
1199
1200 c---------------------------------------------------------------------
1201 c     give3 sends border data out in the requested direction
1202 c---------------------------------------------------------------------
1203       implicit none
1204
1205       include 'mpinpb.h'
1206       include 'globals.h'
1207
1208       integer axis, dir, n1, n2, n3, k, ierr
1209       double precision u( n1, n2, n3 )
1210
1211       integer i3, i2, i1, buff_len,buff_id
1212
1213       buff_id = 2 + dir
1214       buff_len = 0
1215
1216       if( axis .eq.  1 )then
1217          if( dir .eq. -1 )then
1218
1219             do  i3=2,n3-1
1220                do  i2=2,n2-1
1221                   buff_len = buff_len + 1
1222                   buff(buff_len,buff_id ) = u( 2,  i2,i3)
1223                enddo
1224             enddo
1225
1226             call mpi_send(
1227      >           buff(1, buff_id ), buff_len,dp_type,
1228      >           nbr( axis, dir, k ), msg_type(axis,dir),
1229      >           mpi_comm_world, ierr)
1230
1231          else if( dir .eq. +1 ) then
1232
1233             do  i3=2,n3-1
1234                do  i2=2,n2-1
1235                   buff_len = buff_len + 1
1236                   buff(buff_len, buff_id ) = u( n1-1, i2,i3)
1237                enddo
1238             enddo
1239
1240             call mpi_send(
1241      >           buff(1, buff_id ), buff_len,dp_type,
1242      >           nbr( axis, dir, k ), msg_type(axis,dir),
1243      >           mpi_comm_world, ierr)
1244
1245          endif
1246       endif
1247
1248       if( axis .eq.  2 )then
1249          if( dir .eq. -1 )then
1250
1251             do  i3=2,n3-1
1252                do  i1=1,n1
1253                   buff_len = buff_len + 1
1254                   buff(buff_len, buff_id ) = u( i1,  2,i3)
1255                enddo
1256             enddo
1257
1258             call mpi_send(
1259      >           buff(1, buff_id ), buff_len,dp_type,
1260      >           nbr( axis, dir, k ), msg_type(axis,dir),
1261      >           mpi_comm_world, ierr)
1262
1263          else if( dir .eq. +1 ) then
1264
1265             do  i3=2,n3-1
1266                do  i1=1,n1
1267                   buff_len = buff_len + 1
1268                   buff(buff_len,  buff_id )= u( i1,n2-1,i3)
1269                enddo
1270             enddo
1271
1272             call mpi_send(
1273      >           buff(1, buff_id ), buff_len,dp_type,
1274      >           nbr( axis, dir, k ), msg_type(axis,dir),
1275      >           mpi_comm_world, ierr)
1276
1277          endif
1278       endif
1279
1280       if( axis .eq.  3 )then
1281          if( dir .eq. -1 )then
1282
1283             do  i2=1,n2
1284                do  i1=1,n1
1285                   buff_len = buff_len + 1
1286                   buff(buff_len, buff_id ) = u( i1,i2,2)
1287                enddo
1288             enddo
1289
1290             call mpi_send(
1291      >           buff(1, buff_id ), buff_len,dp_type,
1292      >           nbr( axis, dir, k ), msg_type(axis,dir),
1293      >           mpi_comm_world, ierr)
1294
1295          else if( dir .eq. +1 ) then
1296
1297             do  i2=1,n2
1298                do  i1=1,n1
1299                   buff_len = buff_len + 1
1300                   buff(buff_len, buff_id ) = u( i1,i2,n3-1)
1301                enddo
1302             enddo
1303
1304             call mpi_send(
1305      >           buff(1, buff_id ), buff_len,dp_type,
1306      >           nbr( axis, dir, k ), msg_type(axis,dir),
1307      >           mpi_comm_world, ierr)
1308
1309          endif
1310       endif
1311
1312       return
1313       end
1314
1315 c---------------------------------------------------------------------
1316 c---------------------------------------------------------------------
1317
1318       subroutine take3( axis, dir, u, n1, n2, n3 )
1319
1320 c---------------------------------------------------------------------
1321 c---------------------------------------------------------------------
1322
1323 c---------------------------------------------------------------------
1324 c     take3 copies in border data from the requested direction
1325 c---------------------------------------------------------------------
1326       implicit none
1327
1328       include 'mpinpb.h'
1329       include 'globals.h'
1330
1331       integer axis, dir, n1, n2, n3
1332       double precision u( n1, n2, n3 )
1333
1334       integer buff_id, indx
1335
1336       integer status(mpi_status_size), ierr
1337
1338       integer i3, i2, i1
1339
1340       call mpi_wait( msg_id( axis, dir, 1 ),status,ierr)
1341       buff_id = 3 + dir
1342       indx = 0
1343
1344       if( axis .eq.  1 )then
1345          if( dir .eq. -1 )then
1346
1347             do  i3=2,n3-1
1348                do  i2=2,n2-1
1349                   indx = indx + 1
1350                   u(n1,i2,i3) = buff(indx, buff_id )
1351                enddo
1352             enddo
1353
1354          else if( dir .eq. +1 ) then
1355
1356             do  i3=2,n3-1
1357                do  i2=2,n2-1
1358                   indx = indx + 1
1359                   u(1,i2,i3) = buff(indx, buff_id )
1360                enddo
1361             enddo
1362
1363          endif
1364       endif
1365
1366       if( axis .eq.  2 )then
1367          if( dir .eq. -1 )then
1368
1369             do  i3=2,n3-1
1370                do  i1=1,n1
1371                   indx = indx + 1
1372                   u(i1,n2,i3) = buff(indx, buff_id )
1373                enddo
1374             enddo
1375
1376          else if( dir .eq. +1 ) then
1377
1378             do  i3=2,n3-1
1379                do  i1=1,n1
1380                   indx = indx + 1
1381                   u(i1,1,i3) = buff(indx, buff_id )
1382                enddo
1383             enddo
1384
1385          endif
1386       endif
1387
1388       if( axis .eq.  3 )then
1389          if( dir .eq. -1 )then
1390
1391             do  i2=1,n2
1392                do  i1=1,n1
1393                   indx = indx + 1
1394                   u(i1,i2,n3) = buff(indx, buff_id )
1395                enddo
1396             enddo
1397
1398          else if( dir .eq. +1 ) then
1399
1400             do  i2=1,n2
1401                do  i1=1,n1
1402                   indx = indx + 1
1403                   u(i1,i2,1) = buff(indx, buff_id )
1404                enddo
1405             enddo
1406
1407          endif
1408       endif
1409
1410       return
1411       end
1412
1413 c---------------------------------------------------------------------
1414 c---------------------------------------------------------------------
1415
1416       subroutine give3_ex( axis, dir, u, n1, n2, n3, k )
1417
1418 c---------------------------------------------------------------------
1419 c---------------------------------------------------------------------
1420
1421 c---------------------------------------------------------------------
1422 c     give3_ex sends border data out to expand number of processors
1423 c---------------------------------------------------------------------
1424       implicit none
1425
1426       include 'mpinpb.h'
1427       include 'globals.h'
1428
1429       integer axis, dir, n1, n2, n3, k, ierr
1430       double precision u( n1, n2, n3 )
1431
1432       integer i3, i2, i1, buff_len, buff_id
1433
1434       buff_id = 2 + dir
1435       buff_len = 0
1436
1437       if( axis .eq.  1 )then
1438          if( dir .eq. -1 )then
1439
1440             do  i3=1,n3
1441                do  i2=1,n2
1442                   buff_len = buff_len + 1
1443                   buff(buff_len,buff_id ) = u( 2,  i2,i3)
1444                enddo
1445             enddo
1446
1447             call mpi_send(
1448      >           buff(1, buff_id ), buff_len,dp_type,
1449      >           nbr( axis, dir, k ), msg_type(axis,dir),
1450      >           mpi_comm_world, ierr)
1451
1452          else if( dir .eq. +1 ) then
1453
1454             do  i3=1,n3
1455                do  i2=1,n2
1456                   do  i1=n1-1,n1
1457                      buff_len = buff_len + 1
1458                      buff(buff_len,buff_id)= u(i1,i2,i3)
1459                   enddo
1460                enddo
1461             enddo
1462
1463             call mpi_send(
1464      >           buff(1, buff_id ), buff_len,dp_type,
1465      >           nbr( axis, dir, k ), msg_type(axis,dir),
1466      >           mpi_comm_world, ierr)
1467
1468          endif
1469       endif
1470
1471       if( axis .eq.  2 )then
1472          if( dir .eq. -1 )then
1473
1474             do  i3=1,n3
1475                do  i1=1,n1
1476                   buff_len = buff_len + 1
1477                   buff(buff_len, buff_id ) = u( i1,  2,i3)
1478                enddo
1479             enddo
1480
1481             call mpi_send(
1482      >           buff(1, buff_id ), buff_len,dp_type,
1483      >           nbr( axis, dir, k ), msg_type(axis,dir),
1484      >           mpi_comm_world, ierr)
1485
1486          else if( dir .eq. +1 ) then
1487
1488             do  i3=1,n3
1489                do  i2=n2-1,n2
1490                   do  i1=1,n1
1491                      buff_len = buff_len + 1
1492                      buff(buff_len,buff_id )= u(i1,i2,i3)
1493                   enddo
1494                enddo
1495             enddo
1496
1497             call mpi_send(
1498      >           buff(1, buff_id ), buff_len,dp_type,
1499      >           nbr( axis, dir, k ), msg_type(axis,dir),
1500      >           mpi_comm_world, ierr)
1501
1502          endif
1503       endif
1504
1505       if( axis .eq.  3 )then
1506          if( dir .eq. -1 )then
1507
1508             do  i2=1,n2
1509                do  i1=1,n1
1510                   buff_len = buff_len + 1
1511                   buff(buff_len, buff_id ) = u( i1,i2,2)
1512                enddo
1513             enddo
1514
1515             call mpi_send(
1516      >           buff(1, buff_id ), buff_len,dp_type,
1517      >           nbr( axis, dir, k ), msg_type(axis,dir),
1518      >           mpi_comm_world, ierr)
1519
1520          else if( dir .eq. +1 ) then
1521
1522             do  i3=n3-1,n3
1523                do  i2=1,n2
1524                   do  i1=1,n1
1525                      buff_len = buff_len + 1
1526                      buff(buff_len, buff_id ) = u( i1,i2,i3)
1527                   enddo
1528                enddo
1529             enddo
1530
1531             call mpi_send(
1532      >           buff(1, buff_id ), buff_len,dp_type,
1533      >           nbr( axis, dir, k ), msg_type(axis,dir),
1534      >           mpi_comm_world, ierr)
1535
1536          endif
1537       endif
1538
1539       return
1540       end
1541
1542 c---------------------------------------------------------------------
1543 c---------------------------------------------------------------------
1544
1545       subroutine take3_ex( axis, dir, u, n1, n2, n3 )
1546
1547 c---------------------------------------------------------------------
1548 c---------------------------------------------------------------------
1549
1550 c---------------------------------------------------------------------
1551 c     take3_ex copies in border data to expand number of processors
1552 c---------------------------------------------------------------------
1553       implicit none
1554
1555       include 'mpinpb.h'
1556       include 'globals.h'
1557
1558       integer axis, dir, n1, n2, n3
1559       double precision u( n1, n2, n3 )
1560
1561       integer buff_id, indx
1562
1563       integer status(mpi_status_size) , ierr
1564
1565       integer i3, i2, i1
1566
1567       call mpi_wait( msg_id( axis, dir, 1 ),status,ierr)
1568       buff_id = 3 + dir
1569       indx = 0
1570
1571       if( axis .eq.  1 )then
1572          if( dir .eq. -1 )then
1573
1574             do  i3=1,n3
1575                do  i2=1,n2
1576                   indx = indx + 1
1577                   u(n1,i2,i3) = buff(indx, buff_id )
1578                enddo
1579             enddo
1580
1581          else if( dir .eq. +1 ) then
1582
1583             do  i3=1,n3
1584                do  i2=1,n2
1585                   do  i1=1,2
1586                      indx = indx + 1
1587                      u(i1,i2,i3) = buff(indx,buff_id)
1588                   enddo
1589                enddo
1590             enddo
1591
1592          endif
1593       endif
1594
1595       if( axis .eq.  2 )then
1596          if( dir .eq. -1 )then
1597
1598             do  i3=1,n3
1599                do  i1=1,n1
1600                   indx = indx + 1
1601                   u(i1,n2,i3) = buff(indx, buff_id )
1602                enddo
1603             enddo
1604
1605          else if( dir .eq. +1 ) then
1606
1607             do  i3=1,n3
1608                do  i2=1,2
1609                   do  i1=1,n1
1610                      indx = indx + 1
1611                      u(i1,i2,i3) = buff(indx,buff_id)
1612                   enddo
1613                enddo
1614             enddo
1615
1616          endif
1617       endif
1618
1619       if( axis .eq.  3 )then
1620          if( dir .eq. -1 )then
1621
1622             do  i2=1,n2
1623                do  i1=1,n1
1624                   indx = indx + 1
1625                   u(i1,i2,n3) = buff(indx, buff_id )
1626                enddo
1627             enddo
1628
1629          else if( dir .eq. +1 ) then
1630
1631             do  i3=1,2
1632                do  i2=1,n2
1633                   do  i1=1,n1
1634                      indx = indx + 1
1635                      u(i1,i2,i3) = buff(indx,buff_id)
1636                   enddo
1637                enddo
1638             enddo
1639
1640          endif
1641       endif
1642
1643       return
1644       end
1645
1646 c---------------------------------------------------------------------
1647 c---------------------------------------------------------------------
1648
1649       subroutine comm1p( axis, u, n1, n2, n3, kk )
1650
1651 c---------------------------------------------------------------------
1652 c---------------------------------------------------------------------
1653
1654       implicit none
1655
1656       include 'mpinpb.h'
1657       include 'globals.h'
1658
1659       integer axis, dir, n1, n2, n3
1660       double precision u( n1, n2, n3 )
1661
1662       integer i3, i2, i1, buff_len,buff_id
1663       integer i, kk, indx
1664
1665       dir = -1
1666
1667       buff_id = 3 + dir
1668       buff_len = nm2
1669
1670       do  i=1,nm2
1671          buff(i,buff_id) = 0.0D0
1672       enddo
1673
1674
1675       dir = +1
1676
1677       buff_id = 3 + dir
1678       buff_len = nm2
1679
1680       do  i=1,nm2
1681          buff(i,buff_id) = 0.0D0
1682       enddo
1683
1684       dir = +1
1685
1686       buff_id = 2 + dir
1687       buff_len = 0
1688
1689       if( axis .eq.  1 )then
1690          do  i3=2,n3-1
1691             do  i2=2,n2-1
1692                buff_len = buff_len + 1
1693                buff(buff_len, buff_id ) = u( n1-1, i2,i3)
1694             enddo
1695          enddo
1696       endif
1697
1698       if( axis .eq.  2 )then
1699          do  i3=2,n3-1
1700             do  i1=1,n1
1701                buff_len = buff_len + 1
1702                buff(buff_len,  buff_id )= u( i1,n2-1,i3)
1703             enddo
1704          enddo
1705       endif
1706
1707       if( axis .eq.  3 )then
1708          do  i2=1,n2
1709             do  i1=1,n1
1710                buff_len = buff_len + 1
1711                buff(buff_len, buff_id ) = u( i1,i2,n3-1)
1712             enddo
1713          enddo
1714       endif
1715
1716       dir = -1
1717
1718       buff_id = 2 + dir
1719       buff_len = 0
1720
1721       if( axis .eq.  1 )then
1722          do  i3=2,n3-1
1723             do  i2=2,n2-1
1724                buff_len = buff_len + 1
1725                buff(buff_len,buff_id ) = u( 2,  i2,i3)
1726             enddo
1727          enddo
1728       endif
1729
1730       if( axis .eq.  2 )then
1731          do  i3=2,n3-1
1732             do  i1=1,n1
1733                buff_len = buff_len + 1
1734                buff(buff_len, buff_id ) = u( i1,  2,i3)
1735             enddo
1736          enddo
1737       endif
1738
1739       if( axis .eq.  3 )then
1740          do  i2=1,n2
1741             do  i1=1,n1
1742                buff_len = buff_len + 1
1743                buff(buff_len, buff_id ) = u( i1,i2,2)
1744             enddo
1745          enddo
1746       endif
1747
1748       do  i=1,nm2
1749          buff(i,4) = buff(i,3)
1750          buff(i,2) = buff(i,1)
1751       enddo
1752
1753       dir = -1
1754
1755       buff_id = 3 + dir
1756       indx = 0
1757
1758       if( axis .eq.  1 )then
1759          do  i3=2,n3-1
1760             do  i2=2,n2-1
1761                indx = indx + 1
1762                u(n1,i2,i3) = buff(indx, buff_id )
1763             enddo
1764          enddo
1765       endif
1766
1767       if( axis .eq.  2 )then
1768          do  i3=2,n3-1
1769             do  i1=1,n1
1770                indx = indx + 1
1771                u(i1,n2,i3) = buff(indx, buff_id )
1772             enddo
1773          enddo
1774       endif
1775
1776       if( axis .eq.  3 )then
1777          do  i2=1,n2
1778             do  i1=1,n1
1779                indx = indx + 1
1780                u(i1,i2,n3) = buff(indx, buff_id )
1781             enddo
1782          enddo
1783       endif
1784
1785
1786       dir = +1
1787
1788       buff_id = 3 + dir
1789       indx = 0
1790
1791       if( axis .eq.  1 )then
1792          do  i3=2,n3-1
1793             do  i2=2,n2-1
1794                indx = indx + 1
1795                u(1,i2,i3) = buff(indx, buff_id )
1796             enddo
1797          enddo
1798       endif
1799
1800       if( axis .eq.  2 )then
1801          do  i3=2,n3-1
1802             do  i1=1,n1
1803                indx = indx + 1
1804                u(i1,1,i3) = buff(indx, buff_id )
1805             enddo
1806          enddo
1807       endif
1808
1809       if( axis .eq.  3 )then
1810          do  i2=1,n2
1811             do  i1=1,n1
1812                indx = indx + 1
1813                u(i1,i2,1) = buff(indx, buff_id )
1814             enddo
1815          enddo
1816       endif
1817
1818       return
1819       end
1820
1821 c---------------------------------------------------------------------
1822 c---------------------------------------------------------------------
1823
1824       subroutine comm1p_ex( axis, u, n1, n2, n3, kk )
1825
1826 c---------------------------------------------------------------------
1827 c---------------------------------------------------------------------
1828
1829       implicit none
1830
1831       include 'mpinpb.h'
1832       include 'globals.h'
1833
1834       integer axis, dir, n1, n2, n3
1835       double precision u( n1, n2, n3 )
1836
1837       integer i3, i2, i1, buff_len,buff_id
1838       integer i, kk, indx
1839
1840       if( take_ex( axis, kk ) ) then
1841
1842          dir = -1
1843
1844          buff_id = 3 + dir
1845          buff_len = nm2
1846
1847          do  i=1,nm2
1848             buff(i,buff_id) = 0.0D0
1849          enddo
1850
1851
1852          dir = +1
1853
1854          buff_id = 3 + dir
1855          buff_len = nm2
1856
1857          do  i=1,nm2
1858             buff(i,buff_id) = 0.0D0
1859          enddo
1860
1861
1862          dir = -1
1863
1864          buff_id = 3 + dir
1865          indx = 0
1866
1867          if( axis .eq.  1 )then
1868             do  i3=1,n3
1869                do  i2=1,n2
1870                   indx = indx + 1
1871                   u(n1,i2,i3) = buff(indx, buff_id )
1872                enddo
1873             enddo
1874          endif
1875
1876          if( axis .eq.  2 )then
1877             do  i3=1,n3
1878                do  i1=1,n1
1879                   indx = indx + 1
1880                   u(i1,n2,i3) = buff(indx, buff_id )
1881                enddo
1882             enddo
1883          endif
1884
1885          if( axis .eq.  3 )then
1886             do  i2=1,n2
1887                do  i1=1,n1
1888                   indx = indx + 1
1889                   u(i1,i2,n3) = buff(indx, buff_id )
1890                enddo
1891             enddo
1892          endif
1893
1894          dir = +1
1895
1896          buff_id = 3 + dir
1897          indx = 0
1898
1899          if( axis .eq.  1 )then
1900             do  i3=1,n3
1901                do  i2=1,n2
1902                   do  i1=1,2
1903                      indx = indx + 1
1904                      u(i1,i2,i3) = buff(indx,buff_id)
1905                   enddo
1906                enddo
1907             enddo
1908          endif
1909
1910          if( axis .eq.  2 )then
1911             do  i3=1,n3
1912                do  i2=1,2
1913                   do  i1=1,n1
1914                      indx = indx + 1
1915                      u(i1,i2,i3) = buff(indx,buff_id)
1916                   enddo
1917                enddo
1918             enddo
1919          endif
1920
1921          if( axis .eq.  3 )then
1922             do  i3=1,2
1923                do  i2=1,n2
1924                   do  i1=1,n1
1925                      indx = indx + 1
1926                      u(i1,i2,i3) = buff(indx,buff_id)
1927                   enddo
1928                enddo
1929             enddo
1930          endif
1931
1932       endif
1933
1934       if( give_ex( axis, kk ) )then
1935
1936          dir = +1
1937
1938          buff_id = 2 + dir
1939          buff_len = 0
1940
1941          if( axis .eq.  1 )then
1942             do  i3=1,n3
1943                do  i2=1,n2
1944                   do  i1=n1-1,n1
1945                      buff_len = buff_len + 1
1946                      buff(buff_len,buff_id)= u(i1,i2,i3)
1947                   enddo
1948                enddo
1949             enddo
1950          endif
1951
1952          if( axis .eq.  2 )then
1953             do  i3=1,n3
1954                do  i2=n2-1,n2
1955                   do  i1=1,n1
1956                      buff_len = buff_len + 1
1957                      buff(buff_len,buff_id )= u(i1,i2,i3)
1958                   enddo
1959                enddo
1960             enddo
1961          endif
1962
1963          if( axis .eq.  3 )then
1964             do  i3=n3-1,n3
1965                do  i2=1,n2
1966                   do  i1=1,n1
1967                      buff_len = buff_len + 1
1968                      buff(buff_len, buff_id ) = u( i1,i2,i3)
1969                   enddo
1970                enddo
1971             enddo
1972          endif
1973
1974          dir = -1
1975
1976          buff_id = 2 + dir
1977          buff_len = 0
1978
1979          if( axis .eq.  1 )then
1980             do  i3=1,n3
1981                do  i2=1,n2
1982                   buff_len = buff_len + 1
1983                   buff(buff_len,buff_id ) = u( 2,  i2,i3)
1984                enddo
1985             enddo
1986          endif
1987
1988          if( axis .eq.  2 )then
1989             do  i3=1,n3
1990                do  i1=1,n1
1991                   buff_len = buff_len + 1
1992                   buff(buff_len, buff_id ) = u( i1,  2,i3)
1993                enddo
1994             enddo
1995          endif
1996
1997          if( axis .eq.  3 )then
1998             do  i2=1,n2
1999                do  i1=1,n1
2000                   buff_len = buff_len + 1
2001                   buff(buff_len, buff_id ) = u( i1,i2,2)
2002                enddo
2003             enddo
2004          endif
2005
2006       endif
2007
2008       do  i=1,nm2
2009          buff(i,4) = buff(i,3)
2010          buff(i,2) = buff(i,1)
2011       enddo
2012
2013       return
2014       end
2015
2016 c---------------------------------------------------------------------
2017 c---------------------------------------------------------------------
2018
2019       subroutine zran3(z,n1,n2,n3,nx,ny,k)
2020
2021 c---------------------------------------------------------------------
2022 c---------------------------------------------------------------------
2023
2024 c---------------------------------------------------------------------
2025 c     zran3  loads +1 at ten randomly chosen points,
2026 c     loads -1 at a different ten random points,
2027 c     and zero elsewhere.
2028 c---------------------------------------------------------------------
2029       implicit none
2030
2031       include 'mpinpb.h'
2032
2033       integer  is1, is2, is3, ie1, ie2, ie3
2034       common /grid/ is1,is2,is3,ie1,ie2,ie3
2035
2036       integer n1, n2, n3, k, nx, ny, ierr, i0, m0, m1
2037       double precision z(n1,n2,n3)
2038
2039       integer mm, i1, i2, i3, d1, e1, e2, e3
2040       double precision x, a
2041       double precision xx, x0, x1, a1, a2, ai, power
2042       parameter( mm = 10,  a = 5.D0 ** 13, x = 314159265.D0)
2043       double precision ten( mm, 0:1 ), temp, best
2044       integer i, j1( mm, 0:1 ), j2( mm, 0:1 ), j3( mm, 0:1 )
2045       integer jg( 0:3, mm, 0:1 ), jg_temp(4)
2046
2047       external randlc
2048       double precision randlc, rdummy
2049
2050       a1 = power( a, nx, 1, 0 )
2051       a2 = power( a, nx, ny, 0 )
2052
2053       call zero3(z,n1,n2,n3)
2054
2055 c      i = is1-2+nx*(is2-2+ny*(is3-2))
2056
2057       ai = power( a, nx, is2-2+ny*(is3-2), is1-2 )
2058       d1 = ie1 - is1 + 1
2059       e1 = ie1 - is1 + 2
2060       e2 = ie2 - is2 + 2
2061       e3 = ie3 - is3 + 2
2062       x0 = x
2063       rdummy = randlc( x0, ai )
2064       do  i3 = 2, e3
2065          x1 = x0
2066          do  i2 = 2, e2
2067             xx = x1
2068             call vranlc( d1, xx, a, z( 2, i2, i3 ))
2069             rdummy = randlc( x1, a1 )
2070          enddo
2071          rdummy = randlc( x0, a2 )
2072       enddo
2073
2074 c---------------------------------------------------------------------
2075 c       call comm3(z,n1,n2,n3)
2076 c       call showall(z,n1,n2,n3)
2077 c---------------------------------------------------------------------
2078
2079 c---------------------------------------------------------------------
2080 c     each processor looks for twenty candidates
2081 c---------------------------------------------------------------------
2082       do  i=1,mm
2083          ten( i, 1 ) = 0.0D0
2084          j1( i, 1 ) = 0
2085          j2( i, 1 ) = 0
2086          j3( i, 1 ) = 0
2087          ten( i, 0 ) = 1.0D0
2088          j1( i, 0 ) = 0
2089          j2( i, 0 ) = 0
2090          j3( i, 0 ) = 0
2091       enddo
2092
2093       do  i3=2,n3-1
2094          do  i2=2,n2-1
2095             do  i1=2,n1-1
2096                if( z(i1,i2,i3) .gt. ten( 1, 1 ) )then
2097                   ten(1,1) = z(i1,i2,i3)
2098                   j1(1,1) = i1
2099                   j2(1,1) = i2
2100                   j3(1,1) = i3
2101                   call bubble( ten, j1, j2, j3, mm, 1 )
2102                endif
2103                if( z(i1,i2,i3) .lt. ten( 1, 0 ) )then
2104                   ten(1,0) = z(i1,i2,i3)
2105                   j1(1,0) = i1
2106                   j2(1,0) = i2
2107                   j3(1,0) = i3
2108                   call bubble( ten, j1, j2, j3, mm, 0 )
2109                endif
2110             enddo
2111          enddo
2112       enddo
2113
2114       call mpi_barrier(mpi_comm_world,ierr)
2115
2116 c---------------------------------------------------------------------
2117 c     Now which of these are globally best?
2118 c---------------------------------------------------------------------
2119       i1 = mm
2120       i0 = mm
2121       do  i=mm,1,-1
2122
2123          best = z( j1(i1,1), j2(i1,1), j3(i1,1) )
2124          call mpi_allreduce(best,temp,1,dp_type,
2125      >        mpi_max,mpi_comm_world,ierr)
2126          best = temp
2127          if(best.eq.z(j1(i1,1),j2(i1,1),j3(i1,1)))then
2128             jg( 0, i, 1) = me
2129             jg( 1, i, 1) = is1 - 2 + j1( i1, 1 )
2130             jg( 2, i, 1) = is2 - 2 + j2( i1, 1 )
2131             jg( 3, i, 1) = is3 - 2 + j3( i1, 1 )
2132             i1 = i1-1
2133          else
2134             jg( 0, i, 1) = 0
2135             jg( 1, i, 1) = 0
2136             jg( 2, i, 1) = 0
2137             jg( 3, i, 1) = 0
2138          endif
2139          ten( i, 1 ) = best
2140          call mpi_allreduce(jg(0,i,1), jg_temp,4,MPI_INTEGER,
2141      >        mpi_max,mpi_comm_world,ierr)
2142          jg( 0, i, 1) =  jg_temp(1)
2143          jg( 1, i, 1) =  jg_temp(2)
2144          jg( 2, i, 1) =  jg_temp(3)
2145          jg( 3, i, 1) =  jg_temp(4)
2146
2147          best = z( j1(i0,0), j2(i0,0), j3(i0,0) )
2148          call mpi_allreduce(best,temp,1,dp_type,
2149      >        mpi_min,mpi_comm_world,ierr)
2150          best = temp
2151          if(best.eq.z(j1(i0,0),j2(i0,0),j3(i0,0)))then
2152             jg( 0, i, 0) = me
2153             jg( 1, i, 0) = is1 - 2 + j1( i0, 0 )
2154             jg( 2, i, 0) = is2 - 2 + j2( i0, 0 )
2155             jg( 3, i, 0) = is3 - 2 + j3( i0, 0 )
2156             i0 = i0-1
2157          else
2158             jg( 0, i, 0) = 0
2159             jg( 1, i, 0) = 0
2160             jg( 2, i, 0) = 0
2161             jg( 3, i, 0) = 0
2162          endif
2163          ten( i, 0 ) = best
2164          call mpi_allreduce(jg(0,i,0), jg_temp,4,MPI_INTEGER,
2165      >        mpi_max,mpi_comm_world,ierr)
2166          jg( 0, i, 0) =  jg_temp(1)
2167          jg( 1, i, 0) =  jg_temp(2)
2168          jg( 2, i, 0) =  jg_temp(3)
2169          jg( 3, i, 0) =  jg_temp(4)
2170
2171       enddo
2172       m1 = i1+1
2173       m0 = i0+1
2174
2175 c      if( me .eq. root) then
2176 c         write(*,*)' '
2177 c         write(*,*)' negative charges at'
2178 c         write(*,9)(jg(1,i,0),jg(2,i,0),jg(3,i,0),i=1,mm)
2179 c         write(*,*)' positive charges at'
2180 c         write(*,9)(jg(1,i,1),jg(2,i,1),jg(3,i,1),i=1,mm)
2181 c         write(*,*)' small random numbers were'
2182 c         write(*,8)(ten( i,0),i=mm,1,-1)
2183 c         write(*,*)' and they were found on processor number'
2184 c         write(*,7)(jg(0,i,0),i=mm,1,-1)
2185 c         write(*,*)' large random numbers were'
2186 c         write(*,8)(ten( i,1),i=mm,1,-1)
2187 c         write(*,*)' and they were found on processor number'
2188 c         write(*,7)(jg(0,i,1),i=mm,1,-1)
2189 c      endif
2190 c 9    format(5(' (',i3,2(',',i3),')'))
2191 c 8    format(5D15.8)
2192 c 7    format(10i4)
2193       call mpi_barrier(mpi_comm_world,ierr)
2194       do  i3=1,n3
2195          do  i2=1,n2
2196             do  i1=1,n1
2197                z(i1,i2,i3) = 0.0D0
2198             enddo
2199          enddo
2200       enddo
2201       do  i=mm,m0,-1
2202          z( j1(i,0), j2(i,0), j3(i,0) ) = -1.0D0
2203       enddo
2204       do  i=mm,m1,-1
2205          z( j1(i,1), j2(i,1), j3(i,1) ) = +1.0D0
2206       enddo
2207       call comm3(z,n1,n2,n3,k)
2208
2209 c---------------------------------------------------------------------
2210 c          call showall(z,n1,n2,n3)
2211 c---------------------------------------------------------------------
2212
2213       return
2214       end
2215
2216 c---------------------------------------------------------------------
2217 c---------------------------------------------------------------------
2218
2219       subroutine show_l(z,n1,n2,n3)
2220
2221 c---------------------------------------------------------------------
2222 c---------------------------------------------------------------------
2223
2224       implicit none
2225
2226       include 'mpinpb.h'
2227
2228       integer n1,n2,n3,i1,i2,i3,ierr
2229       double precision z(n1,n2,n3)
2230       integer m1, m2, m3,i
2231
2232       m1 = min(n1,18)
2233       m2 = min(n2,14)
2234       m3 = min(n3,18)
2235
2236       write(*,*)'  '
2237       do  i=0,nprocs-1
2238          if( me .eq. i )then
2239             write(*,*)' id = ', me
2240             do  i3=1,m3
2241                do  i1=1,m1
2242                   write(*,6)(z(i1,i2,i3),i2=1,m2)
2243                enddo
2244                write(*,*)' - - - - - - - '
2245             enddo
2246             write(*,*)'  '
2247  6          format(6f15.11)
2248          endif
2249          call mpi_barrier(mpi_comm_world,ierr)
2250       enddo
2251
2252       return
2253       end
2254
2255 c---------------------------------------------------------------------
2256 c---------------------------------------------------------------------
2257
2258       subroutine showall(z,n1,n2,n3)
2259
2260 c---------------------------------------------------------------------
2261 c---------------------------------------------------------------------
2262
2263       implicit none
2264
2265       include 'mpinpb.h'
2266
2267       integer n1,n2,n3,i1,i2,i3,i,ierr
2268       double precision z(n1,n2,n3)
2269       integer m1, m2, m3
2270
2271       m1 = min(n1,18)
2272       m2 = min(n2,14)
2273       m3 = min(n3,18)
2274
2275       write(*,*)'  '
2276       do  i=0,nprocs-1
2277          if( me .eq. i )then
2278             write(*,*)' id = ', me
2279             do  i3=1,m3
2280                do  i1=1,m1
2281                   write(*,6)(z(i1,i2,i3),i2=1,m2)
2282                enddo
2283                write(*,*)' - - - - - - - '
2284             enddo
2285             write(*,*)'  '
2286  6          format(15f6.3)
2287          endif
2288          call mpi_barrier(mpi_comm_world,ierr)
2289       enddo
2290
2291       return
2292       end
2293
2294 c---------------------------------------------------------------------
2295 c---------------------------------------------------------------------
2296
2297       subroutine show(z,n1,n2,n3)
2298
2299 c---------------------------------------------------------------------
2300 c---------------------------------------------------------------------
2301
2302       implicit none
2303
2304       include 'mpinpb.h'
2305       integer n1,n2,n3,i1,i2,i3,ierr,i
2306       double precision z(n1,n2,n3)
2307
2308       write(*,*)'  '
2309       do  i=0,nprocs-1
2310          if( me .eq. i )then
2311             write(*,*)' id = ', me
2312             do  i3=2,n3-1
2313                do  i1=2,n1-1
2314                   write(*,6)(z(i1,i2,i3),i2=2,n1-1)
2315                enddo
2316                write(*,*)' - - - - - - - '
2317             enddo
2318             write(*,*)'  '
2319  6          format(8D10.3)
2320          endif
2321          call mpi_barrier(mpi_comm_world,ierr)
2322       enddo
2323
2324 c     call comm3(z,n1,n2,n3)
2325
2326       return
2327       end
2328
2329 c---------------------------------------------------------------------
2330 c---------------------------------------------------------------------
2331
2332       double precision function power( a, n1, n2, n3 )
2333
2334 c---------------------------------------------------------------------
2335 c---------------------------------------------------------------------
2336
2337 c---------------------------------------------------------------------
2338 c     power  raises an integer, disguised as a double
2339 c     precision real, to an integer power.
2340 c     This version tries to avoid integer overflow by treating
2341 c     it as expressed in a form of "n1*n2+n3".
2342 c---------------------------------------------------------------------
2343       implicit none
2344
2345       double precision a, aj
2346       integer n1, n2, n3
2347
2348       integer n1j, n2j, nj
2349       external randlc
2350       double precision randlc, rdummy
2351
2352       power = 1.0d0
2353       aj = a
2354       nj = n3
2355       n1j = n1
2356       n2j = n2
2357  100  continue
2358
2359       if( n2j .gt. 0 ) then
2360          if( mod(n2j,2) .eq. 1 ) nj = nj + n1j
2361          n2j = n2j/2
2362       else if( nj .eq. 0 ) then
2363          go to 200
2364       endif
2365       if( mod(nj,2) .eq. 1 ) rdummy =  randlc( power, aj )
2366       rdummy = randlc( aj, aj )
2367       nj = nj/2
2368       go to 100
2369
2370  200  continue
2371       return
2372       end
2373
2374 c---------------------------------------------------------------------
2375 c---------------------------------------------------------------------
2376
2377       subroutine bubble( ten, j1, j2, j3, m, ind )
2378
2379 c---------------------------------------------------------------------
2380 c---------------------------------------------------------------------
2381
2382 c---------------------------------------------------------------------
2383 c     bubble        does a bubble sort in direction dir
2384 c---------------------------------------------------------------------
2385       implicit none
2386
2387       include 'mpinpb.h'
2388
2389       integer m, ind, j1( m, 0:1 ), j2( m, 0:1 ), j3( m, 0:1 )
2390       double precision ten( m, 0:1 )
2391       double precision temp
2392       integer i, j_temp
2393
2394       if( ind .eq. 1 )then
2395
2396          do  i=1,m-1
2397             if( ten(i,ind) .gt. ten(i+1,ind) )then
2398
2399                temp = ten( i+1, ind )
2400                ten( i+1, ind ) = ten( i, ind )
2401                ten( i, ind ) = temp
2402
2403                j_temp           = j1( i+1, ind )
2404                j1( i+1, ind ) = j1( i,   ind )
2405                j1( i,   ind ) = j_temp
2406
2407                j_temp           = j2( i+1, ind )
2408                j2( i+1, ind ) = j2( i,   ind )
2409                j2( i,   ind ) = j_temp
2410
2411                j_temp           = j3( i+1, ind )
2412                j3( i+1, ind ) = j3( i,   ind )
2413                j3( i,   ind ) = j_temp
2414
2415             else
2416                return
2417             endif
2418          enddo
2419
2420       else
2421
2422          do  i=1,m-1
2423             if( ten(i,ind) .lt. ten(i+1,ind) )then
2424
2425                temp = ten( i+1, ind )
2426                ten( i+1, ind ) = ten( i, ind )
2427                ten( i, ind ) = temp
2428
2429                j_temp           = j1( i+1, ind )
2430                j1( i+1, ind ) = j1( i,   ind )
2431                j1( i,   ind ) = j_temp
2432
2433                j_temp           = j2( i+1, ind )
2434                j2( i+1, ind ) = j2( i,   ind )
2435                j2( i,   ind ) = j_temp
2436
2437                j_temp           = j3( i+1, ind )
2438                j3( i+1, ind ) = j3( i,   ind )
2439                j3( i,   ind ) = j_temp
2440
2441             else
2442                return
2443             endif
2444          enddo
2445
2446       endif
2447
2448       return
2449       end
2450
2451 c---------------------------------------------------------------------
2452 c---------------------------------------------------------------------
2453
2454       subroutine zero3(z,n1,n2,n3)
2455
2456 c---------------------------------------------------------------------
2457 c---------------------------------------------------------------------
2458
2459       implicit none
2460
2461       include 'mpinpb.h'
2462
2463       integer n1, n2, n3
2464       double precision z(n1,n2,n3)
2465       integer i1, i2, i3
2466
2467       do  i3=1,n3
2468          do  i2=1,n2
2469             do  i1=1,n1
2470                z(i1,i2,i3)=0.0D0
2471             enddo
2472          enddo
2473       enddo
2474
2475       return
2476       end
2477
2478
2479 c----- end of program ------------------------------------------------