c**********************************************************************
c   twod.f - a solution to the Poisson problem by using Jacobi 
c   interation on a 2-d decomposition
c     
c   .... the rest of this is from pi3.f to show the style ...
c
c   Each node: 
c    1) receives the number of rectangles used in the approximation.
c    2) calculates the areas of it's rectangles.
c    3) Synchronizes for a global summation.
c   Node 0 prints the result.
c
c  Variables:
c
c    pi  the calculated result
c    n   number of points of integration.  
c    x           midpoint of each rectangle's interval
c    f           function to integrate
c    sum,pi      area of rectangles
c    tmp         temporary scratch space for global summation
c    i           do loop index
c
c     This code is included (without the prints) because one version of
c     MPICH SEGV'ed (probably because of errors in handling send/recv of
c     MPI_PROC_NULL source/destination).
c
c****************************************************************************
      program main
      include "mpif.h"
      integer maxn
      parameter (maxn = 128)
      double precision  a(maxn,maxn), b(maxn,maxn), f(maxn,maxn)
      integer nx, ny
      integer myid, numprocs, it, rc, comm2d, ierr, stride
      integer nbrleft, nbrright, nbrtop, nbrbottom
      integer sx, ex, sy, ey
      integer dims(2)
      logical periods(2)
      double precision diff2d, diffnorm, dwork
      double precision t1, t2
      external diff2d
      data periods/2*.false./

      call MPI_INIT( ierr )
      call MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )
      call MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
c      print *, "Process ", myid, " of ", numprocs, " is alive"
      if (myid .eq. 0) then
c
c         Get the size of the problem
c
c          print *, 'Enter nx'
c          read *, nx
           nx = 10
      endif
c      print *, 'About to do bcast on ', myid
      call MPI_BCAST(nx,1,MPI_INTEGER,0,MPI_COMM_WORLD,ierr)
      ny   = nx
c
c Get a new communicator for a decomposition of the domain.  Let MPI
c find a "good" decomposition
c
      dims(1) = 0
      dims(2) = 0
      call MPI_DIMS_CREATE( numprocs, 2, dims, ierr )
      call MPI_CART_CREATE( MPI_COMM_WORLD, 2, dims, periods, .true.,
     *                    comm2d, ierr )
c
c Get my position in this communicator
c 
      call MPI_COMM_RANK( comm2d, myid, ierr )
c      print *, "Process ", myid, " of ", numprocs, " is alive"
c
c My neighbors are now +/- 1 with my rank.  Handle the case of the 
c boundaries by using MPI_PROCNULL.
      call fnd2dnbrs( comm2d, nbrleft, nbrright, nbrtop, nbrbottom )
c      print *, "Process ", myid, ":", 
c     *     nbrleft, nbrright, nbrtop, nbrbottom
c
c Compute the decomposition
c     
      call fnd2ddecomp( comm2d, nx, sx, ex, sy, ey )
c      print *, "Process ", myid, ":", sx, ex, sy, ey
c
c Create a new, "strided" datatype for the exchange in the "non-contiguous"
c direction
c
      call mpi_Type_vector( ey-sy+1, 1, ex-sx+3, 
     $                      MPI_DOUBLE_PRECISION, stride, ierr )
      call mpi_Type_commit( stride, ierr )
c
c
c Initialize the right-hand-side (f) and the initial solution guess (a)
c
      call twodinit( a, b, f, nx, sx, ex, sy, ey )
c
c Actually do the computation.  Note the use of a collective operation to
c check for convergence, and a do-loop to bound the number of iterations.
c
      call MPI_BARRIER( MPI_COMM_WORLD, ierr )
      t1 = MPI_WTIME()
      do 10 it=1, 100
        call exchng2( a, b, sx, ex, sy, ey, comm2d, stride, 
     $                nbrleft, nbrright, nbrtop, nbrbottom )
        call sweep2d( b, f, nx, sx, ex, sy, ey, a )
        call exchng2( b, a, sx, ex, sy, ey, comm2d, stride, 
     $                nbrleft, nbrright, nbrtop, nbrbottom )
        call sweep2d( a, f, nx, sx, ex, sy, ey, b )
        dwork = diff2d( a, b, nx, sx, ex, sy, ey )
        call MPI_Allreduce( dwork, diffnorm, 1, MPI_DOUBLE_PRECISION, 
     $                     MPI_SUM, comm2d, ierr )
        if (diffnorm .lt. 1.0e-5) goto 20
c        if (myid .eq. 0) print *, 2*it, ' Difference is ', diffnorm
10     continue
      if (myid .eq. 0) print *, 'Failed to converge'
20    continue
      t2 = MPI_WTIME()
c      if (myid .eq. 0) then
c         print *, 'Converged after ', 2*it, ' Iterations in ', t2 - t1,
c     $        ' secs '
c      endif
c
c
      call MPI_Type_free( stride, ierr )
      call MPI_Comm_free( comm2d, ierr )
      if (myid .eq. 0) then
         print *, ' No Errors'
      endif
      call MPI_FINALIZE(rc)
      end
c
c Perform a Jacobi sweep for a 2-d decomposition
c
      subroutine sweep2d( a, f, n, sx, ex, sy, ey, b )
      integer n, sx, ex, sy, ey
      double precision a(sx-1:ex+1, sy-1:ey+1), f(sx-1:ex+1, sy-1:ey+1),
     +                 b(sx-1:ex+1, sy-1:ey+1)
c
      integer i, j
      double precision h
c
      h = 1.0d0 / dble(n+1)
      do 10 i=sx, ex
         do 10 j=sy, ey
            b(i,j) = 0.25 * (a(i-1,j)+a(i,j+1)+a(i,j-1)+a(i+1,j)) - 
     +               h * h * f(i,j)
 10   continue
      return
      end

       subroutine exchng2( a, b, sx, ex, sy, ey, 
     $                     comm2d, stride, 
     $                     nbrleft, nbrright, nbrtop, nbrbottom  )
       include "mpif.h"
       integer sx, ex, sy, ey, stride
       double precision a(sx-1:ex+1, sy-1:ey+1), 
     $                  b(sx-1:ex+1, sy-1:ey+1)
       integer nbrleft, nbrright, nbrtop, nbrbottom, comm2d
       integer status(MPI_STATUS_SIZE), ierr, nx
c
       nx = ex - sx + 1
c  These are just like the 1-d versions, except for less data
        call MPI_SENDRECV( b(ex,sy),  nx, MPI_DOUBLE_PRECISION, 
     $                    nbrtop, 0,
     $                    a(sx-1,sy), nx, MPI_DOUBLE_PRECISION, 
     $                    nbrbottom, 0, comm2d, status, ierr )
        call MPI_SENDRECV( b(sx,sy),  nx, MPI_DOUBLE_PRECISION,
     $                    nbrbottom, 1,
     $                    a(ex+1,sy), nx, MPI_DOUBLE_PRECISION, 
     $                    nbrtop, 1, comm2d, status, ierr )
c
c This uses the "strided" datatype
        call MPI_SENDRECV( b(sx,ey),  1, stride, nbrright,  0,
     $                     a(sx,sy-1), 1, stride, nbrleft,  0, 
     $                     comm2d, status, ierr )
        call MPI_SENDRECV( b(sx,sy),  1, stride, nbrleft,   1,
     $                     a(sx,ey+1), 1, stride, nbrright, 1, 
     $                     comm2d, status, ierr )
        return
        end

c
c  The rest of the 2-d program
c
      double precision function diff2d( a, b, nx, sx, ex, sy, ey )
      integer nx, sx, ex, sy, ey
      double precision a(sx-1:ex+1, sy-1:ey+1), b(sx-1:ex+1, sy-1:ey+1)
c
      double precision sum
      integer i, j
c
      sum = 0.0d0
      do 10 j=sy,ey
         do 10 i=sx,ex
            sum = sum + (a(i,j) - b(i,j)) ** 2
 10      continue
c      
      diff2d = sum
      return
      end
      subroutine twodinit( a, b, f, nx, sx, ex, sy, ey )
      integer nx, sx, ex, sy, ey
      double precision a(sx-1:ex+1, sy-1:ey+1), b(sx-1:ex+1, sy-1:ey+1),
     &                 f(sx-1:ex+1, sy-1:ey+1)
c
      integer i, j
c
      do 10 j=sy-1,ey+1
         do 10 i=sx-1,ex+1
            a(i,j) = 0.0d0
            b(i,j) = 0.0d0
            f(i,j) = 0.0d0
 10      continue
c      
c    Handle boundary conditions
c
      if (sx .eq. 1) then 
         do 20 j=sy,ey
            a(0,j) = 1.0d0
            b(0,j) = 1.0d0
 20      continue
      endif
      if (ex .eq. nx) then
         do 21 j=sy,ey
            a(nx+1,j) = 0.0d0
            b(nx+1,j) = 0.0d0
 21      continue
      endif 
      if (sy .eq. 1) then
         do 30 i=sx,ex
            a(i,0) = 1.0d0
            b(i,0) = 1.0d0
 30      continue 
      endif
c
      return
      end

c
c  This file contains a routine for producing a decomposition of a 1-d array
c  when given a number of processors.  It may be used in "direct" product
c  decomposition.  The values returned assume a "global" domain in [1:n]
c
      subroutine MPE_DECOMP1D( n, numprocs, myid, s, e )
      integer n, numprocs, myid, s, e
      integer nlocal
      integer deficit
c
      nlocal  = n / numprocs
      s       = myid * nlocal + 1
      deficit = mod(n,numprocs)
      s       = s + min(myid,deficit)
      if (myid .lt. deficit) then
          nlocal = nlocal + 1
      endif
      e = s + nlocal - 1
      if (e .gt. n .or. myid .eq. numprocs-1) e = n
      return
      end
c
c This routine show how to determine the neighbors in a 2-d decomposition of
c the domain. This assumes that MPI_Cart_create has already been called 
c
      subroutine fnd2dnbrs( comm2d, 
     $                      nbrleft, nbrright, nbrtop, nbrbottom )
      integer comm2d, nbrleft, nbrright, nbrtop, nbrbottom
c
      integer ierr
c
      call MPI_Cart_shift( comm2d, 0,  1, nbrleft,   nbrright, ierr )
      call MPI_Cart_shift( comm2d, 1,  1, nbrbottom, nbrtop,   ierr )
c
      return
      end
c
c Note: THIS IS A TEST PROGRAM.  THE ACTUAL VALUES MOVED ARE NOT
c CORRECT FOR A POISSON SOLVER.
c
      subroutine fnd2ddecomp( comm2d, n, sx, ex, sy, ey )
      integer comm2d
      integer n, sx, ex, sy, ey
      integer dims(2), coords(2), ierr
      logical periods(2)
c
      call MPI_Cart_get( comm2d, 2, dims, periods, coords, ierr )

      call MPE_DECOMP1D( n, dims(1), coords(1), sx, ex )
      call MPE_DECOMP1D( n, dims(2), coords(2), sy, ey )
c
      return
      end