c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine z_solve c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c this function performs the solution of the approximate factorization c step in the z-direction for all five matrix components c simultaneously. The Thomas algorithm is employed to solve the c systems for the z-lines. Boundary conditions are non-periodic c--------------------------------------------------------------------- include 'header.h' include 'mpinpb.h' integer i, j, k, stage, ip, jp, n, isize, jsize, kend, k1, k2, > buffer_size, c, m, p, kstart, error, > requests(2), statuses(MPI_STATUS_SIZE, 2) double precision r1, r2, d, e, s(5), sm1, sm2, > fac1, fac2 c--------------------------------------------------------------------- c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells c on this node in the direction of increasing i for the forward sweep, c and after that reversing the direction for the backsubstitution c--------------------------------------------------------------------- c--------------------------------------------------------------------- c FORWARD ELIMINATION c--------------------------------------------------------------------- do stage = 1, ncells c = slice(3,stage) kstart = 0 kend = cell_size(3,c)-1 isize = cell_size(1,c) jsize = cell_size(2,c) ip = cell_coord(1,c)-1 jp = cell_coord(2,c)-1 buffer_size = (isize-start(1,c)-end(1,c)) * > (jsize-start(2,c)-end(2,c)) if (stage .ne. 1) then c--------------------------------------------------------------------- c if this is not the first processor in this row of cells, c receive data from predecessor containing the right hand c sides and the upper diagonal elements of the previous two rows c--------------------------------------------------------------------- call mpi_irecv(in_buffer, 22*buffer_size, > dp_type, predecessor(3), > DEFAULT_TAG, comm_solve, > requests(1), error) c--------------------------------------------------------------------- c communication has already been started. c compute the left hand side while waiting for the msg c--------------------------------------------------------------------- call lhsz(c) c--------------------------------------------------------------------- c wait for pending communication to complete c--------------------------------------------------------------------- call mpi_waitall(2, requests, statuses, error) c--------------------------------------------------------------------- c unpack the buffer c--------------------------------------------------------------------- k = kstart k1 = kstart + 1 n = 0 c--------------------------------------------------------------------- c create a running pointer c--------------------------------------------------------------------- p = 0 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - > in_buffer(p+1) * lhs(i,j,k,n+1,c) lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - > in_buffer(p+2) * lhs(i,j,k,n+1,c) do m = 1, 3 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - > in_buffer(p+2+m) * lhs(i,j,k,n+1,c) end do d = in_buffer(p+6) e = in_buffer(p+7) do m = 1, 3 s(m) = in_buffer(p+7+m) end do r1 = lhs(i,j,k,n+2,c) lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 do m = 1, 3 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 end do r2 = lhs(i,j,k1,n+1,c) lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2 lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2 do m = 1, 3 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2 end do p = p + 10 end do end do do m = 4, 5 n = (m-3)*5 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - > in_buffer(p+1) * lhs(i,j,k,n+1,c) lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - > in_buffer(p+2) * lhs(i,j,k,n+1,c) rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - > in_buffer(p+3) * lhs(i,j,k,n+1,c) d = in_buffer(p+4) e = in_buffer(p+5) s(m) = in_buffer(p+6) r1 = lhs(i,j,k,n+2,c) lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 r2 = lhs(i,j,k1,n+1,c) lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2 lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2 p = p + 6 end do end do end do else c--------------------------------------------------------------------- c if this IS the first cell, we still compute the lhs c--------------------------------------------------------------------- call lhsz(c) endif c--------------------------------------------------------------------- c perform the Thomas algorithm; first, FORWARD ELIMINATION c--------------------------------------------------------------------- n = 0 do k = kstart, kend-2 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 k1 = k + 1 k2 = k + 2 fac1 = 1.d0/lhs(i,j,k,n+3,c) lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) do m = 1, 3 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) end do lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) do m = 1, 3 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) end do lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) - > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c) lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) - > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c) do m = 1, 3 rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) - > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c) end do end do end do end do c--------------------------------------------------------------------- c The last two rows in this grid block are a bit different, c since they do not have two more rows available for the c elimination of off-diagonal entries c--------------------------------------------------------------------- k = kend - 1 k1 = kend do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 fac1 = 1.d0/lhs(i,j,k,n+3,c) lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) do m = 1, 3 rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) end do lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) do m = 1, 3 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) end do c--------------------------------------------------------------------- c scale the last row immediately (some of this is c overkill in case this is the last cell) c--------------------------------------------------------------------- fac2 = 1.d0/lhs(i,j,k1,n+3,c) lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c) lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c) do m = 1, 3 rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c) end do end do end do c--------------------------------------------------------------------- c do the u+c and the u-c factors c--------------------------------------------------------------------- do m = 4, 5 n = (m-3)*5 do k = kstart, kend-2 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 k1 = k + 1 k2 = k + 2 fac1 = 1.d0/lhs(i,j,k,n+3,c) lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) - > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c) lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) - > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c) rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) - > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c) end do end do end do c--------------------------------------------------------------------- c And again the last two rows separately c--------------------------------------------------------------------- k = kend - 1 k1 = kend do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 fac1 = 1.d0/lhs(i,j,k,n+3,c) lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) c--------------------------------------------------------------------- c Scale the last row immediately (some of this is overkill c if this is the last cell) c--------------------------------------------------------------------- fac2 = 1.d0/lhs(i,j,k1,n+3,c) lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c) lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c) rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c) end do end do end do c--------------------------------------------------------------------- c send information to the next processor, except when this c is the last grid block, c--------------------------------------------------------------------- if (stage .ne. ncells) then c--------------------------------------------------------------------- c create a running pointer for the send buffer c--------------------------------------------------------------------- p = 0 n = 0 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 do k = kend-1, kend out_buffer(p+1) = lhs(i,j,k,n+4,c) out_buffer(p+2) = lhs(i,j,k,n+5,c) do m = 1, 3 out_buffer(p+2+m) = rhs(i,j,k,m,c) end do p = p+5 end do end do end do do m = 4, 5 n = (m-3)*5 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 do k = kend-1, kend out_buffer(p+1) = lhs(i,j,k,n+4,c) out_buffer(p+2) = lhs(i,j,k,n+5,c) out_buffer(p+3) = rhs(i,j,k,m,c) p = p + 3 end do end do end do end do call mpi_isend(out_buffer, 22*buffer_size, > dp_type, successor(3), > DEFAULT_TAG, comm_solve, > requests(2), error) endif end do c--------------------------------------------------------------------- c now go in the reverse direction c--------------------------------------------------------------------- c--------------------------------------------------------------------- c BACKSUBSTITUTION c--------------------------------------------------------------------- do stage = ncells, 1, -1 c = slice(3,stage) kstart = 0 kend = cell_size(3,c)-1 isize = cell_size(1,c) jsize = cell_size(2,c) ip = cell_coord(1,c)-1 jp = cell_coord(2,c)-1 buffer_size = (isize-start(1,c)-end(1,c)) * > (jsize-start(2,c)-end(2,c)) if (stage .ne. ncells) then c--------------------------------------------------------------------- c if this is not the starting cell in this row of cells, c wait for a message to be received, containing the c solution of the previous two stations c--------------------------------------------------------------------- call mpi_irecv(in_buffer, 10*buffer_size, > dp_type, successor(3), > DEFAULT_TAG, comm_solve, > requests(1), error) c--------------------------------------------------------------------- c communication has already been started c while waiting, do the block-diagonal inversion for the c cell that was just finished c--------------------------------------------------------------------- call tzetar(slice(3,stage+1)) c--------------------------------------------------------------------- c wait for pending communication to complete c--------------------------------------------------------------------- call mpi_waitall(2, requests, statuses, error) c--------------------------------------------------------------------- c unpack the buffer for the first three factors c--------------------------------------------------------------------- n = 0 p = 0 k = kend k1 = k - 1 do m = 1, 3 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 sm1 = in_buffer(p+1) sm2 = in_buffer(p+2) rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - > lhs(i,j,k,n+4,c)*sm1 - > lhs(i,j,k,n+5,c)*sm2 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - > lhs(i,j,k1,n+5,c) * sm1 p = p + 2 end do end do end do c--------------------------------------------------------------------- c now unpack the buffer for the remaining two factors c--------------------------------------------------------------------- do m = 4, 5 n = (m-3)*5 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 sm1 = in_buffer(p+1) sm2 = in_buffer(p+2) rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - > lhs(i,j,k,n+4,c)*sm1 - > lhs(i,j,k,n+5,c)*sm2 rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - > lhs(i,j,k1,n+5,c) * sm1 p = p + 2 end do end do end do else c--------------------------------------------------------------------- c now we know this is the first grid block on the back sweep, c so we don't need a message to start the substitution. c--------------------------------------------------------------------- k = kend - 1 k1 = kend n = 0 do m = 1, 3 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) end do end do end do do m = 4, 5 n = (m-3)*5 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) end do end do end do endif c--------------------------------------------------------------------- c Whether or not this is the last processor, we always have c to complete the back-substitution c--------------------------------------------------------------------- c--------------------------------------------------------------------- c The first three factors c--------------------------------------------------------------------- n = 0 do m = 1, 3 do k = kend-2, kstart, -1 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 k1 = k + 1 k2 = k + 2 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) - > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c) end do end do end do end do c--------------------------------------------------------------------- c And the remaining two c--------------------------------------------------------------------- do m = 4, 5 n = (m-3)*5 do k = kend-2, kstart, -1 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 k1 = k + 1 k2 = k + 2 rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) - > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c) end do end do end do end do c--------------------------------------------------------------------- c send on information to the previous processor, if needed c--------------------------------------------------------------------- if (stage .ne. 1) then k = kstart k1 = kstart + 1 p = 0 do m = 1, 5 do j = start(2,c), jsize-end(2,c)-1 do i = start(1,c), isize-end(1,c)-1 out_buffer(p+1) = rhs(i,j,k,m,c) out_buffer(p+2) = rhs(i,j,k1,m,c) p = p + 2 end do end do end do call mpi_isend(out_buffer, 10*buffer_size, > dp_type, predecessor(3), > DEFAULT_TAG, comm_solve, > requests(2), error) endif c--------------------------------------------------------------------- c If this was the last stage, do the block-diagonal inversion c--------------------------------------------------------------------- if (stage .eq. 1) call tzetar(c) end do return end