--- /dev/null
+SHELL=/bin/sh
+BENCHMARK=bt
+BENCHMARKU=BT
+VEC=
+
+include ../config/make.def
+
+
+OBJS = bt.o make_set.o initialize.o exact_solution.o exact_rhs.o \
+ set_constants.o adi.o define.o copy_faces.o rhs.o solve_subs.o \
+ x_solve$(VEC).o y_solve$(VEC).o z_solve$(VEC).o add.o error.o \
+ verify.o setup_mpi.o \
+ ${COMMON}/print_results.o ${COMMON}/timers.o
+
+include ../sys/make.common
+
+# npbparams.h is included by header.h
+# The following rule should do the trick but many make programs (not gmake)
+# will do the wrong thing and rebuild the world every time (because the
+# mod time on header.h is not changed. One solution would be to
+# touch header.h but this might cause confusion if someone has
+# accidentally deleted it. Instead, make the dependency on npbparams.h
+# explicit in all the lines below (even though dependence is indirect).
+
+# header.h: npbparams.h
+
+${PROGRAM}: config
+ @if [ x$(VERSION) = xvec ] ; then \
+ ${MAKE} VEC=_vec exec; \
+ elif [ x$(VERSION) = xVEC ] ; then \
+ ${MAKE} VEC=_vec exec; \
+ else \
+ ${MAKE} exec; \
+ fi
+
+exec: $(OBJS)
+ @if [ x$(SUBTYPE) = xfull ] ; then \
+ ${MAKE} bt-full; \
+ elif [ x$(SUBTYPE) = xFULL ] ; then \
+ ${MAKE} bt-full; \
+ elif [ x$(SUBTYPE) = xsimple ] ; then \
+ ${MAKE} bt-simple; \
+ elif [ x$(SUBTYPE) = xSIMPLE ] ; then \
+ ${MAKE} bt-simple; \
+ elif [ x$(SUBTYPE) = xfortran ] ; then \
+ ${MAKE} bt-fortran; \
+ elif [ x$(SUBTYPE) = xFORTRAN ] ; then \
+ ${MAKE} bt-fortran; \
+ elif [ x$(SUBTYPE) = xepio ] ; then \
+ ${MAKE} bt-epio; \
+ elif [ x$(SUBTYPE) = xEPIO ] ; then \
+ ${MAKE} bt-epio; \
+ else \
+ ${MAKE} bt-bt; \
+ fi
+
+bt-bt: ${OBJS} btio.o
+ ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} btio.o ${FMPI_LIB}
+
+bt-full: ${OBJS} full_mpiio.o btio_common.o
+ ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_full ${OBJS} btio_common.o full_mpiio.o ${FMPI_LIB}
+
+bt-simple: ${OBJS} simple_mpiio.o btio_common.o
+ ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_simple ${OBJS} btio_common.o simple_mpiio.o ${FMPI_LIB}
+
+bt-fortran: ${OBJS} fortran_io.o btio_common.o
+ ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.fortran_io ${OBJS} btio_common.o fortran_io.o ${FMPI_LIB}
+
+bt-epio: ${OBJS} epio.o btio_common.o
+ ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.ep_io ${OBJS} btio_common.o epio.o ${FMPI_LIB}
+
+.f.o:
+ ${FCOMPILE} $<
+
+.c.o:
+ ${CCOMPILE} $<
+
+
+bt.o: bt.f header.h npbparams.h mpinpb.h
+make_set.o: make_set.f header.h npbparams.h mpinpb.h
+initialize.o: initialize.f header.h npbparams.h
+exact_solution.o: exact_solution.f header.h npbparams.h
+exact_rhs.o: exact_rhs.f header.h npbparams.h
+set_constants.o: set_constants.f header.h npbparams.h
+adi.o: adi.f header.h npbparams.h
+define.o: define.f header.h npbparams.h
+copy_faces.o: copy_faces.f header.h npbparams.h mpinpb.h
+rhs.o: rhs.f header.h npbparams.h
+x_solve$(VEC).o: x_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h
+y_solve$(VEC).o: y_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h
+z_solve$(VEC).o: z_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h
+solve_subs.o: solve_subs.f npbparams.h
+add.o: add.f header.h npbparams.h
+error.o: error.f header.h npbparams.h mpinpb.h
+verify.o: verify.f header.h npbparams.h mpinpb.h
+setup_mpi.o: setup_mpi.f mpinpb.h npbparams.h
+btio.o: btio.f header.h npbparams.h
+btio_common.o: btio_common.f mpinpb.h npbparams.h
+fortran_io.o: fortran_io.f mpinpb.h npbparams.h
+simple_mpiio.o: simple_mpiio.f mpinpb.h npbparams.h
+full_mpiio.o: full_mpiio.f mpinpb.h npbparams.h
+epio.o: epio.f mpinpb.h npbparams.h
+
+clean:
+ - rm -f *.o *~ mputil*
+ - rm -f npbparams.h core
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine add
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c addition of update to the vector u
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer c, i, j, k, m
+
+ do c = 1, ncells
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ u(m,i,j,k,c) = u(m,i,j,k,c) + rhs(m,i,j,k,c)
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ return
+ end
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine adi
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ call copy_faces
+
+ call x_solve
+
+ call y_solve
+
+ call z_solve
+
+ call add
+
+ return
+ end
+
--- /dev/null
+!-------------------------------------------------------------------------!
+! !
+! N A S P A R A L L E L B E N C H M A R K S 3.3 !
+! !
+! B T !
+! !
+!-------------------------------------------------------------------------!
+! !
+! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
+! It is described in NAS Technical Reports 95-020 and 02-007. !
+! !
+! Permission to use, copy, distribute and modify this software !
+! for any purpose with or without fee is hereby granted. We !
+! request, however, that all derived work reference the NAS !
+! Parallel Benchmarks 3.3. This software is provided "as is" !
+! without express or implied warranty. !
+! !
+! Information on NPB 3.3, including the technical report, the !
+! original specifications, source code, results and information !
+! on how to submit new results, is available at: !
+! !
+! http://www.nas.nasa.gov/Software/NPB/ !
+! !
+! Send comments or suggestions to npb@nas.nasa.gov !
+! !
+! NAS Parallel Benchmarks Group !
+! NASA Ames Research Center !
+! Mail Stop: T27A-1 !
+! Moffett Field, CA 94035-1000 !
+! !
+! E-mail: npb@nas.nasa.gov !
+! Fax: (650) 604-3957 !
+! !
+!-------------------------------------------------------------------------!
+
+c---------------------------------------------------------------------
+c
+c Authors: R. F. Van der Wijngaart
+c T. Harris
+c M. Yarrow
+c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+ program MPBT
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i, niter, step, c, error, fstatus
+ double precision navg, mflops, mbytes, n3
+
+ external timer_read
+ double precision t, tmax, tiominv, tpc, timer_read
+ logical verified
+ character class, cbuff*40
+
+ integer wr_interval
+
+ call setup_mpi
+ if (.not. active) goto 999
+
+c---------------------------------------------------------------------
+c Root node reads input file (if it exists) else takes
+c defaults from parameters
+c---------------------------------------------------------------------
+ if (node .eq. root) then
+
+ write(*, 1000)
+ open (unit=2,file='inputbt.data',status='old', iostat=fstatus)
+c
+ rd_interval = 0
+ if (fstatus .eq. 0) then
+ write(*,233)
+ 233 format(' Reading from input file inputbt.data')
+ read (2,*) niter
+ read (2,*) dt
+ read (2,*) grid_points(1), grid_points(2), grid_points(3)
+ if (iotype .ne. 0) then
+ read (2,'(A)') cbuff
+ read (cbuff,*,iostat=i) wr_interval, rd_interval
+ if (i .ne. 0) rd_interval = 0
+ if (wr_interval .le. 0) wr_interval = wr_default
+ endif
+ if (iotype .eq. 1) then
+ read (2,*) collbuf_nodes, collbuf_size
+ write(*,*) 'collbuf_nodes ', collbuf_nodes
+ write(*,*) 'collbuf_size ', collbuf_size
+ endif
+ close(2)
+ else
+ write(*,234)
+ niter = niter_default
+ dt = dt_default
+ grid_points(1) = problem_size
+ grid_points(2) = problem_size
+ grid_points(3) = problem_size
+ wr_interval = wr_default
+ if (iotype .eq. 1) then
+c set number of nodes involved in collective buffering to 4,
+c unless total number of nodes is smaller than that.
+c set buffer size for collective buffering to 1MB per node
+c collbuf_nodes = min(4,no_nodes)
+c set default to No-File-Hints with a value of 0
+ collbuf_nodes = 0
+ collbuf_size = 1000000
+ endif
+ endif
+ 234 format(' No input file inputbt.data. Using compiled defaults')
+
+ write(*, 1001) grid_points(1), grid_points(2), grid_points(3)
+ write(*, 1002) niter, dt
+ if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes
+ if (no_nodes .ne. maxcells*maxcells)
+ > write(*, 1005) maxcells*maxcells
+ write(*, 1003) no_nodes
+
+ if (iotype .eq. 1) write(*, 1006) 'FULL MPI-IO', wr_interval
+ if (iotype .eq. 2) write(*, 1006) 'SIMPLE MPI-IO', wr_interval
+ if (iotype .eq. 3) write(*, 1006) 'EPIO', wr_interval
+ if (iotype .eq. 4) write(*, 1006) 'FORTRAN IO', wr_interval
+
+ 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- BT Benchmark ',/)
+ 1001 format(' Size: ', i4, 'x', i4, 'x', i4)
+ 1002 format(' Iterations: ', i4, ' dt: ', F11.7)
+ 1004 format(' Total number of processes: ', i5)
+ 1005 format(' WARNING: compiled for ', i5, ' processes ')
+ 1003 format(' Number of active processes: ', i5, /)
+ 1006 format(' BTIO -- ', A, ' write interval: ', i3 /)
+
+ endif
+
+ call mpi_bcast(niter, 1, MPI_INTEGER,
+ > root, comm_setup, error)
+
+ call mpi_bcast(dt, 1, dp_type,
+ > root, comm_setup, error)
+
+ call mpi_bcast(grid_points(1), 3, MPI_INTEGER,
+ > root, comm_setup, error)
+
+ call mpi_bcast(wr_interval, 1, MPI_INTEGER,
+ > root, comm_setup, error)
+
+ call mpi_bcast(rd_interval, 1, MPI_INTEGER,
+ > root, comm_setup, error)
+
+ call make_set
+
+ do c = 1, maxcells
+ if ( (cell_size(1,c) .gt. IMAX) .or.
+ > (cell_size(2,c) .gt. JMAX) .or.
+ > (cell_size(3,c) .gt. KMAX) ) then
+ print *,node, c, (cell_size(i,c),i=1,3)
+ print *,' Problem size too big for compiled array sizes'
+ goto 999
+ endif
+ end do
+
+ call set_constants
+
+ call initialize
+
+ call setup_btio
+ idump = 0
+
+ call lhsinit
+
+ call exact_rhs
+
+ call compute_buffer_size(5)
+
+c---------------------------------------------------------------------
+c do one time step to touch all code, and reinitialize
+c---------------------------------------------------------------------
+ call adi
+ call initialize
+
+ call timer_clear(2)
+
+c---------------------------------------------------------------------
+c Synchronize before placing time stamp
+c---------------------------------------------------------------------
+ call mpi_barrier(comm_setup, error)
+
+ call timer_clear(1)
+ call timer_start(1)
+
+ do step = 1, niter
+
+ if (node .eq. root) then
+ if (mod(step, 20) .eq. 0 .or. step .eq. niter .or.
+ > step .eq. 1) then
+ write(*, 200) step
+ 200 format(' Time step ', i4)
+ endif
+ endif
+
+ call adi
+
+ if (iotype .ne. 0) then
+ call timer_start(2)
+ if (mod(step, wr_interval).eq.0 .or. step .eq. niter) then
+ if (node .eq. root) then
+ print *, 'Writing data set, time step', step
+ endif
+ if (step .eq. niter .and. rd_interval .gt. 1) then
+ rd_interval = 1
+ endif
+ call output_timestep
+ idump = idump + 1
+ endif
+ call timer_stop(2)
+ endif
+ end do
+
+ call btio_cleanup
+
+ call timer_stop(1)
+ t = timer_read(1)
+
+ call verify(niter, class, verified)
+
+ call mpi_reduce(t, tmax, 1,
+ > dp_type, MPI_MAX,
+ > root, comm_setup, error)
+
+ if (iotype .ne. 0) then
+ t = timer_read(2)
+ if (t .ne. 0.d0) t = 1.0d0 / t
+ call mpi_reduce(t, tiominv, 1,
+ > dp_type, MPI_SUM,
+ > root, comm_setup, error)
+ endif
+
+ if( node .eq. root ) then
+ n3 = 1.0d0*grid_points(1)*grid_points(2)*grid_points(3)
+ navg = (grid_points(1)+grid_points(2)+grid_points(3))/3.0
+ if( tmax .ne. 0. ) then
+ mflops = 1.0e-6*float(niter)*
+ > (3478.8*n3-17655.7*navg**2+28023.7*navg)
+ > / tmax
+ else
+ mflops = 0.0
+ endif
+
+ if (iotype .ne. 0) then
+ mbytes = n3 * 40.0 * idump * 1.0d-6
+ tiominv = tiominv / no_nodes
+ t = 0.0
+ if (tiominv .ne. 0.) t = 1.d0 / tiominv
+ tpc = 0.0
+ if (tmax .ne. 0.) tpc = t * 100.0 / tmax
+ write(*,1100) t, tpc, mbytes, mbytes*tiominv
+ 1100 format(/' BTIO -- statistics:'/
+ > ' I/O timing in seconds : ', f14.2/
+ > ' I/O timing percentage : ', f14.2/
+ > ' Total data written (MB) : ', f14.2/
+ > ' I/O data rate (MB/sec) : ', f14.2)
+ endif
+
+ call print_results('BT', class, grid_points(1),
+ > grid_points(2), grid_points(3), niter, maxcells*maxcells,
+ > total_nodes, tmax, mflops, ' floating point',
+ > verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5,
+ > cs6, '(none)')
+ endif
+
+ 999 continue
+ call mpi_barrier(MPI_COMM_WORLD, error)
+ call mpi_finalize(error)
+
+ end
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine btio_verify(verified)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ logical verified
+
+ verified = .true.
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ double precision xce_acc(5)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine checksum_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ return
+ end
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine clear_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer cio, kio, jio, ix
+
+ do cio=1,ncells
+ do kio=0, cell_size(3,cio)-1
+ do jio=0, cell_size(2,cio)-1
+ do ix=0,cell_size(1,cio)-1
+ u(1,ix, jio,kio,cio) = 0
+ u(2,ix, jio,kio,cio) = 0
+ u(3,ix, jio,kio,cio) = 0
+ u(4,ix, jio,kio,cio) = 0
+ u(5,ix, jio,kio,cio) = 0
+ enddo
+ enddo
+ enddo
+ enddo
+
+ return
+ end
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine copy_faces
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c This function copies the face values of a variable defined on a set
+c of cells to the overlap locations of the adjacent sets of cells.
+c Because a set of cells interfaces in each direction with exactly one
+c other set, we only need to fill six different buffers. We could try to
+c overlap communication with computation, by computing
+c some internal values while communicating boundary values, but this
+c adds so much overhead that it's not clearly useful.
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i, j, k, c, m, requests(0:11), p0, p1,
+ > p2, p3, p4, p5, b_size(0:5), ss(0:5),
+ > sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11)
+
+c---------------------------------------------------------------------
+c exit immediately if there are no faces to be copied
+c---------------------------------------------------------------------
+ if (no_nodes .eq. 1) then
+ call compute_rhs
+ return
+ endif
+
+ ss(0) = start_send_east
+ ss(1) = start_send_west
+ ss(2) = start_send_north
+ ss(3) = start_send_south
+ ss(4) = start_send_top
+ ss(5) = start_send_bottom
+
+ sr(0) = start_recv_east
+ sr(1) = start_recv_west
+ sr(2) = start_recv_north
+ sr(3) = start_recv_south
+ sr(4) = start_recv_top
+ sr(5) = start_recv_bottom
+
+ b_size(0) = east_size
+ b_size(1) = west_size
+ b_size(2) = north_size
+ b_size(3) = south_size
+ b_size(4) = top_size
+ b_size(5) = bottom_size
+
+c---------------------------------------------------------------------
+c because the difference stencil for the diagonalized scheme is
+c orthogonal, we do not have to perform the staged copying of faces,
+c but can send all face information simultaneously to the neighboring
+c cells in all directions
+c---------------------------------------------------------------------
+ p0 = 0
+ p1 = 0
+ p2 = 0
+ p3 = 0
+ p4 = 0
+ p5 = 0
+
+ do c = 1, ncells
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to eastern neighbors (i-dir)
+c---------------------------------------------------------------------
+ if (cell_coord(1,c) .ne. ncells) then
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = cell_size(1,c)-2, cell_size(1,c)-1
+ do m = 1, 5
+ out_buffer(ss(0)+p0) = u(m,i,j,k,c)
+ p0 = p0 + 1
+ end do
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to western neighbors
+c---------------------------------------------------------------------
+ if (cell_coord(1,c) .ne. 1) then
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, 1
+ do m = 1, 5
+ out_buffer(ss(1)+p1) = u(m,i,j,k,c)
+ p1 = p1 + 1
+ end do
+ end do
+ end do
+ end do
+
+ endif
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to northern neighbors (j_dir)
+c---------------------------------------------------------------------
+ if (cell_coord(2,c) .ne. ncells) then
+ do k = 0, cell_size(3,c)-1
+ do j = cell_size(2,c)-2, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ do m = 1, 5
+ out_buffer(ss(2)+p2) = u(m,i,j,k,c)
+ p2 = p2 + 1
+ end do
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to southern neighbors
+c---------------------------------------------------------------------
+ if (cell_coord(2,c).ne. 1) then
+ do k = 0, cell_size(3,c)-1
+ do j = 0, 1
+ do i = 0, cell_size(1,c)-1
+ do m = 1, 5
+ out_buffer(ss(3)+p3) = u(m,i,j,k,c)
+ p3 = p3 + 1
+ end do
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to top neighbors (k-dir)
+c---------------------------------------------------------------------
+ if (cell_coord(3,c) .ne. ncells) then
+ do k = cell_size(3,c)-2, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ do m = 1, 5
+ out_buffer(ss(4)+p4) = u(m,i,j,k,c)
+ p4 = p4 + 1
+ end do
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to bottom neighbors
+c---------------------------------------------------------------------
+ if (cell_coord(3,c).ne. 1) then
+ do k=0, 1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ do m = 1, 5
+ out_buffer(ss(5)+p5) = u(m,i,j,k,c)
+ p5 = p5 + 1
+ end do
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c cell loop
+c---------------------------------------------------------------------
+ end do
+
+ call mpi_irecv(in_buffer(sr(0)), b_size(0),
+ > dp_type, successor(1), WEST,
+ > comm_rhs, requests(0), error)
+ call mpi_irecv(in_buffer(sr(1)), b_size(1),
+ > dp_type, predecessor(1), EAST,
+ > comm_rhs, requests(1), error)
+ call mpi_irecv(in_buffer(sr(2)), b_size(2),
+ > dp_type, successor(2), SOUTH,
+ > comm_rhs, requests(2), error)
+ call mpi_irecv(in_buffer(sr(3)), b_size(3),
+ > dp_type, predecessor(2), NORTH,
+ > comm_rhs, requests(3), error)
+ call mpi_irecv(in_buffer(sr(4)), b_size(4),
+ > dp_type, successor(3), BOTTOM,
+ > comm_rhs, requests(4), error)
+ call mpi_irecv(in_buffer(sr(5)), b_size(5),
+ > dp_type, predecessor(3), TOP,
+ > comm_rhs, requests(5), error)
+
+ call mpi_isend(out_buffer(ss(0)), b_size(0),
+ > dp_type, successor(1), EAST,
+ > comm_rhs, requests(6), error)
+ call mpi_isend(out_buffer(ss(1)), b_size(1),
+ > dp_type, predecessor(1), WEST,
+ > comm_rhs, requests(7), error)
+ call mpi_isend(out_buffer(ss(2)), b_size(2),
+ > dp_type,successor(2), NORTH,
+ > comm_rhs, requests(8), error)
+ call mpi_isend(out_buffer(ss(3)), b_size(3),
+ > dp_type,predecessor(2), SOUTH,
+ > comm_rhs, requests(9), error)
+ call mpi_isend(out_buffer(ss(4)), b_size(4),
+ > dp_type,successor(3), TOP,
+ > comm_rhs, requests(10), error)
+ call mpi_isend(out_buffer(ss(5)), b_size(5),
+ > dp_type,predecessor(3), BOTTOM,
+ > comm_rhs,requests(11), error)
+
+
+ call mpi_waitall(12, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c unpack the data that has just been received;
+c---------------------------------------------------------------------
+ p0 = 0
+ p1 = 0
+ p2 = 0
+ p3 = 0
+ p4 = 0
+ p5 = 0
+
+ do c = 1, ncells
+
+ if (cell_coord(1,c) .ne. 1) then
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = -2, -1
+ do m = 1, 5
+ u(m,i,j,k,c) = in_buffer(sr(1)+p0)
+ p0 = p0 + 1
+ end do
+ end do
+ end do
+ end do
+ endif
+
+ if (cell_coord(1,c) .ne. ncells) then
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = cell_size(1,c), cell_size(1,c)+1
+ do m = 1, 5
+ u(m,i,j,k,c) = in_buffer(sr(0)+p1)
+ p1 = p1 + 1
+ end do
+ end do
+ end do
+ end do
+ end if
+
+ if (cell_coord(2,c) .ne. 1) then
+ do k = 0, cell_size(3,c)-1
+ do j = -2, -1
+ do i = 0, cell_size(1,c)-1
+ do m = 1, 5
+ u(m,i,j,k,c) = in_buffer(sr(3)+p2)
+ p2 = p2 + 1
+ end do
+ end do
+ end do
+ end do
+
+ endif
+
+ if (cell_coord(2,c) .ne. ncells) then
+ do k = 0, cell_size(3,c)-1
+ do j = cell_size(2,c), cell_size(2,c)+1
+ do i = 0, cell_size(1,c)-1
+ do m = 1, 5
+ u(m,i,j,k,c) = in_buffer(sr(2)+p3)
+ p3 = p3 + 1
+ end do
+ end do
+ end do
+ end do
+ endif
+
+ if (cell_coord(3,c) .ne. 1) then
+ do k = -2, -1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ do m = 1, 5
+ u(m,i,j,k,c) = in_buffer(sr(5)+p4)
+ p4 = p4 + 1
+ end do
+ end do
+ end do
+ end do
+ endif
+
+ if (cell_coord(3,c) .ne. ncells) then
+ do k = cell_size(3,c), cell_size(3,c)+1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ do m = 1, 5
+ u(m,i,j,k,c) = in_buffer(sr(4)+p5)
+ p5 = p5 + 1
+ end do
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c cells loop
+c---------------------------------------------------------------------
+ end do
+
+c---------------------------------------------------------------------
+c do the rest of the rhs that uses the copied face values
+c---------------------------------------------------------------------
+ call compute_rhs
+
+ return
+ end
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine compute_buffer_size(dim)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer c, dim, face_size
+
+ if (ncells .eq. 1) return
+
+c---------------------------------------------------------------------
+c compute the actual sizes of the buffers; note that there is
+c always one cell face that doesn't need buffer space, because it
+c is at the boundary of the grid
+c---------------------------------------------------------------------
+ west_size = 0
+ east_size = 0
+
+ do c = 1, ncells
+ face_size = cell_size(2,c) * cell_size(3,c) * dim * 2
+ if (cell_coord(1,c).ne.1) west_size = west_size + face_size
+ if (cell_coord(1,c).ne.ncells) east_size = east_size +
+ > face_size
+ end do
+
+ north_size = 0
+ south_size = 0
+ do c = 1, ncells
+ face_size = cell_size(1,c)*cell_size(3,c) * dim * 2
+ if (cell_coord(2,c).ne.1) south_size = south_size + face_size
+ if (cell_coord(2,c).ne.ncells) north_size = north_size +
+ > face_size
+ end do
+
+ top_size = 0
+ bottom_size = 0
+ do c = 1, ncells
+ face_size = cell_size(1,c) * cell_size(2,c) * dim * 2
+ if (cell_coord(3,c).ne.1) bottom_size = bottom_size +
+ > face_size
+ if (cell_coord(3,c).ne.ncells) top_size = top_size +
+ > face_size
+ end do
+
+ start_send_west = 1
+ start_send_east = start_send_west + west_size
+ start_send_south = start_send_east + east_size
+ start_send_north = start_send_south + south_size
+ start_send_bottom = start_send_north + north_size
+ start_send_top = start_send_bottom + bottom_size
+ start_recv_west = 1
+ start_recv_east = start_recv_west + west_size
+ start_recv_south = start_recv_east + east_size
+ start_recv_north = start_recv_south + south_size
+ start_recv_bottom = start_recv_north + north_size
+ start_recv_top = start_recv_bottom + bottom_size
+
+ return
+ end
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ character*(128) newfilenm
+ integer m
+
+ if (node .lt. 10000) then
+ write (newfilenm, 996) filenm,node
+ else
+ print *, 'error generating file names (> 10000 nodes)'
+ stop
+ endif
+
+996 format (a,'.',i4.4)
+
+ open (unit=99, file=newfilenm, form='unformatted',
+ $ status='unknown')
+
+ do m = 1, 5
+ xce_sub(m) = 0.d0
+ end do
+
+ idump_sub = 0
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer ix, iio, jio, kio, cio, aio
+
+ do cio=1,ncells
+ write(99)
+ $ ((((u(aio,ix, jio,kio,cio),aio=1,5),
+ $ ix=0, cell_size(1,cio)-1),
+ $ jio=0, cell_size(2,cio)-1),
+ $ kio=0, cell_size(3,cio)-1)
+ enddo
+
+ idump_sub = idump_sub + 1
+ if (rd_interval .gt. 0) then
+ if (idump_sub .ge. rd_interval) then
+
+ rewind(99)
+ call acc_sub_norms(idump+1)
+
+ rewind(99)
+ idump_sub = 0
+ endif
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine acc_sub_norms(idump_cur)
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer idump_cur
+
+ integer ix, jio, kio, cio, ii, m, ichunk
+ double precision xce_single(5)
+
+ ichunk = idump_cur - idump_sub + 1
+ do ii=0, idump_sub-1
+ do cio=1,ncells
+ read(99)
+ $ ((((u(m,ix, jio,kio,cio),m=1,5),
+ $ ix=0, cell_size(1,cio)-1),
+ $ jio=0, cell_size(2,cio)-1),
+ $ kio=0, cell_size(3,cio)-1)
+ enddo
+
+ if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+ call error_norm(xce_single)
+ do m = 1, 5
+ xce_sub(m) = xce_sub(m) + xce_single(m)
+ end do
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ close(unit=99)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ double precision xce_acc(5)
+
+ character*(128) newfilenm
+ integer m
+
+ if (rd_interval .gt. 0) goto 20
+
+ if (node .lt. 10000) then
+ write (newfilenm, 996) filenm,node
+ else
+ print *, 'error generating file names (> 10000 nodes)'
+ stop
+ endif
+
+996 format (a,'.',i4.4)
+
+ open (unit=99, file=newfilenm,
+ $ form='unformatted')
+
+c clear the last time step
+
+ call clear_timestep
+
+c read back the time steps and accumulate norms
+
+ call acc_sub_norms(idump)
+
+ close(unit=99)
+
+ 20 continue
+ do m = 1, 5
+ xce_acc(m) = xce_sub(m) / dble(idump)
+ end do
+
+ return
+ end
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine error_norm(rms)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function computes the norm of the difference between the
+c computed solution and the exact solution
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer c, i, j, k, m, ii, jj, kk, d, error
+ double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5),
+ > add
+
+ do m = 1, 5
+ rms_work(m) = 0.0d0
+ enddo
+
+ do c = 1, ncells
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ ii = 0
+ do i = cell_low(1,c), cell_high(1,c)
+ xi = dble(i) * dnxm1
+ call exact_solution(xi, eta, zeta, u_exact)
+
+ do m = 1, 5
+ add = u(m,ii,jj,kk,c)-u_exact(m)
+ rms_work(m) = rms_work(m) + add*add
+ enddo
+ ii = ii + 1
+ enddo
+ jj = jj + 1
+ enddo
+ kk = kk + 1
+ enddo
+ enddo
+
+ call mpi_allreduce(rms_work, rms, 5, dp_type,
+ > MPI_SUM, comm_setup, error)
+
+ do m = 1, 5
+ do d = 1, 3
+ rms(m) = rms(m) / dble(grid_points(d)-2)
+ enddo
+ rms(m) = dsqrt(rms(m))
+ enddo
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine rhs_norm(rms)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer c, i, j, k, d, m, error
+ double precision rms(5), rms_work(5), add
+
+ do m = 1, 5
+ rms_work(m) = 0.0d0
+ enddo
+
+ do c = 1, ncells
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ add = rhs(m,i,j,k,c)
+ rms_work(m) = rms_work(m) + add*add
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ call mpi_allreduce(rms_work, rms, 5, dp_type,
+ > MPI_SUM, comm_setup, error)
+
+ do m = 1, 5
+ do d = 1, 3
+ rms(m) = rms(m) / dble(grid_points(d)-2)
+ enddo
+ rms(m) = dsqrt(rms(m))
+ enddo
+
+ return
+ end
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine exact_rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ double precision dtemp(5), xi, eta, zeta, dtpp
+ integer c, m, i, j, k, ip1, im1, jp1,
+ > jm1, km1, kp1
+
+
+c---------------------------------------------------------------------
+c loop over all cells owned by this node
+c---------------------------------------------------------------------
+ do c = 1, ncells
+
+c---------------------------------------------------------------------
+c initialize
+c---------------------------------------------------------------------
+ do k= 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ do m = 1, 5
+ forcing(m,i,j,k,c) = 0.0d0
+ enddo
+ enddo
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c xi-direction flux differences
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ zeta = dble(k+cell_low(3,c)) * dnzm1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ eta = dble(j+cell_low(2,c)) * dnym1
+
+ do i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c)
+ xi = dble(i+cell_low(1,c)) * dnxm1
+
+ call exact_solution(xi, eta, zeta, dtemp)
+ do m = 1, 5
+ ue(i,m) = dtemp(m)
+ enddo
+
+ dtpp = 1.0d0 / dtemp(1)
+
+ do m = 2, 5
+ buf(i,m) = dtpp * dtemp(m)
+ enddo
+
+ cuf(i) = buf(i,2) * buf(i,2)
+ buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) +
+ > buf(i,4) * buf(i,4)
+ q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) +
+ > buf(i,4)*ue(i,4))
+
+ enddo
+
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ im1 = i-1
+ ip1 = i+1
+
+ forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+ > tx2*( ue(ip1,2)-ue(im1,2) )+
+ > dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1))
+
+ forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tx2 * (
+ > (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))-
+ > (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+
+ > xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+
+ > dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2))
+
+ forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tx2 * (
+ > ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+
+ > xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+
+ > dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3))
+
+ forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tx2*(
+ > ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+
+ > xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+
+ > dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4))
+
+ forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tx2*(
+ > buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))-
+ > buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+
+ > 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+
+ > buf(im1,1))+
+ > xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+
+ > xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+
+ > dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5))
+ enddo
+
+c---------------------------------------------------------------------
+c Fourth-order dissipation
+c---------------------------------------------------------------------
+ if (start(1,c) .gt. 0) then
+ do m = 1, 5
+ i = 1
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m))
+ i = 2
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) -
+ > 4.0d0*ue(i+1,m) + ue(i+2,m))
+ enddo
+ endif
+
+ do i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1
+ do m = 1, 5
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+ > (ue(i-2,m) - 4.0d0*ue(i-1,m) +
+ > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m))
+ enddo
+ enddo
+
+ if (end(1,c) .gt. 0) then
+ do m = 1, 5
+ i = cell_size(1,c)-3
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (ue(i-2,m) - 4.0d0*ue(i-1,m) +
+ > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m))
+ i = cell_size(1,c)-2
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m))
+ enddo
+ endif
+
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c eta-direction flux differences
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ zeta = dble(k+cell_low(3,c)) * dnzm1
+ do i=start(1,c), cell_size(1,c)-end(1,c)-1
+ xi = dble(i+cell_low(1,c)) * dnxm1
+
+ do j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c)
+ eta = dble(j+cell_low(2,c)) * dnym1
+
+ call exact_solution(xi, eta, zeta, dtemp)
+ do m = 1, 5
+ ue(j,m) = dtemp(m)
+ enddo
+
+ dtpp = 1.0d0/dtemp(1)
+
+ do m = 2, 5
+ buf(j,m) = dtpp * dtemp(m)
+ enddo
+
+ cuf(j) = buf(j,3) * buf(j,3)
+ buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) +
+ > buf(j,4) * buf(j,4)
+ q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) +
+ > buf(j,4)*ue(j,4))
+ enddo
+
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ jm1 = j-1
+ jp1 = j+1
+
+ forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+ > ty2*( ue(jp1,3)-ue(jm1,3) )+
+ > dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1))
+
+ forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - ty2*(
+ > ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+
+ > yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+
+ > dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2))
+
+ forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - ty2*(
+ > (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))-
+ > (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+
+ > yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+
+ > dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3))
+
+ forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - ty2*(
+ > ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+
+ > yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+
+ > dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4))
+
+ forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - ty2*(
+ > buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))-
+ > buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+
+ > 0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+
+ > buf(jm1,1))+
+ > yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+
+ > yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+
+ > dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5))
+ enddo
+
+c---------------------------------------------------------------------
+c Fourth-order dissipation
+c---------------------------------------------------------------------
+ if (start(2,c) .gt. 0) then
+ do m = 1, 5
+ j = 1
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m))
+ j = 2
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) -
+ > 4.0d0*ue(j+1,m) + ue(j+2,m))
+ enddo
+ endif
+
+ do j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1
+ do m = 1, 5
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+ > (ue(j-2,m) - 4.0d0*ue(j-1,m) +
+ > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m))
+ enddo
+ enddo
+
+ if (end(2,c) .gt. 0) then
+ do m = 1, 5
+ j = cell_size(2,c)-3
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (ue(j-2,m) - 4.0d0*ue(j-1,m) +
+ > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m))
+ j = cell_size(2,c)-2
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m))
+
+ enddo
+ endif
+
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c zeta-direction flux differences
+c---------------------------------------------------------------------
+ do j=start(2,c), cell_size(2,c)-end(2,c)-1
+ eta = dble(j+cell_low(2,c)) * dnym1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ xi = dble(i+cell_low(1,c)) * dnxm1
+
+ do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c)
+ zeta = dble(k+cell_low(3,c)) * dnzm1
+
+ call exact_solution(xi, eta, zeta, dtemp)
+ do m = 1, 5
+ ue(k,m) = dtemp(m)
+ enddo
+
+ dtpp = 1.0d0/dtemp(1)
+
+ do m = 2, 5
+ buf(k,m) = dtpp * dtemp(m)
+ enddo
+
+ cuf(k) = buf(k,4) * buf(k,4)
+ buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) +
+ > buf(k,3) * buf(k,3)
+ q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) +
+ > buf(k,4)*ue(k,4))
+ enddo
+
+ do k=start(3,c), cell_size(3,c)-end(3,c)-1
+ km1 = k-1
+ kp1 = k+1
+
+ forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+ > tz2*( ue(kp1,4)-ue(km1,4) )+
+ > dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1))
+
+ forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tz2 * (
+ > ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+
+ > zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+
+ > dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2))
+
+ forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tz2 * (
+ > ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+
+ > zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+
+ > dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3))
+
+ forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tz2 * (
+ > (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))-
+ > (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+
+ > zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+
+ > dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4))
+
+ forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tz2 * (
+ > buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))-
+ > buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+
+ > 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1)
+ > +buf(km1,1))+
+ > zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+
+ > zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+
+ > dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5))
+ enddo
+
+c---------------------------------------------------------------------
+c Fourth-order dissipation
+c---------------------------------------------------------------------
+ if (start(3,c) .gt. 0) then
+ do m = 1, 5
+ k = 1
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m))
+ k = 2
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) -
+ > 4.0d0*ue(k+1,m) + ue(k+2,m))
+ enddo
+ endif
+
+ do k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1
+ do m = 1, 5
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+ > (ue(k-2,m) - 4.0d0*ue(k-1,m) +
+ > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m))
+ enddo
+ enddo
+
+ if (end(3,c) .gt. 0) then
+ do m = 1, 5
+ k = cell_size(3,c)-3
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (ue(k-2,m) - 4.0d0*ue(k-1,m) +
+ > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m))
+ k = cell_size(3,c)-2
+ forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+ > (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m))
+ enddo
+ endif
+
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c now change the sign of the forcing function,
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ forcing(m,i,j,k,c) = -1.d0 * forcing(m,i,j,k,c)
+ enddo
+ enddo
+ enddo
+ enddo
+
+ enddo
+
+ return
+ end
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine exact_solution(xi,eta,zeta,dtemp)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function returns the exact solution at point xi, eta, zeta
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ double precision xi, eta, zeta, dtemp(5)
+ integer m
+
+ do m = 1, 5
+ dtemp(m) = ce(m,1) +
+ > xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) +
+ > eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+
+ > zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) +
+ > zeta*ce(m,13))))
+ enddo
+
+ return
+ end
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ character*(128) newfilenm
+ integer m, ierr
+
+ if (node.eq.root) record_length = 40/fortran_rec_sz
+ call mpi_bcast(record_length, 1, MPI_INTEGER,
+ > root, comm_setup, ierr)
+
+ open (unit=99, file=filenm,
+ $ form='unformatted', access='direct',
+ $ recl=record_length)
+
+ do m = 1, 5
+ xce_sub(m) = 0.d0
+ end do
+
+ idump_sub = 0
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer ix, jio, kio, cio
+
+ do cio=1,ncells
+ do kio=0, cell_size(3,cio)-1
+ do jio=0, cell_size(2,cio)-1
+ iseek=(cell_low(1,cio) +
+ $ PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+ $ PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+ $ PROBLEM_SIZE*idump_sub)))
+
+ do ix=0,cell_size(1,cio)-1
+ write(99, rec=iseek+ix+1)
+ $ u(1,ix, jio,kio,cio),
+ $ u(2,ix, jio,kio,cio),
+ $ u(3,ix, jio,kio,cio),
+ $ u(4,ix, jio,kio,cio),
+ $ u(5,ix, jio,kio,cio)
+ enddo
+ enddo
+ enddo
+ enddo
+
+ idump_sub = idump_sub + 1
+ if (rd_interval .gt. 0) then
+ if (idump_sub .ge. rd_interval) then
+
+ call acc_sub_norms(idump+1)
+
+ idump_sub = 0
+ endif
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine acc_sub_norms(idump_cur)
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer idump_cur
+
+ integer ix, jio, kio, cio, ii, m, ichunk
+ double precision xce_single(5)
+
+ ichunk = idump_cur - idump_sub + 1
+ do ii=0, idump_sub-1
+ do cio=1,ncells
+ do kio=0, cell_size(3,cio)-1
+ do jio=0, cell_size(2,cio)-1
+ iseek=(cell_low(1,cio) +
+ $ PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+ $ PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+ $ PROBLEM_SIZE*ii)))
+
+
+ do ix=0,cell_size(1,cio)-1
+ read(99, rec=iseek+ix+1)
+ $ u(1,ix, jio,kio,cio),
+ $ u(2,ix, jio,kio,cio),
+ $ u(3,ix, jio,kio,cio),
+ $ u(4,ix, jio,kio,cio),
+ $ u(5,ix, jio,kio,cio)
+ enddo
+ enddo
+ enddo
+ enddo
+
+ if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+ call error_norm(xce_single)
+ do m = 1, 5
+ xce_sub(m) = xce_sub(m) + xce_single(m)
+ end do
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ close(unit=99)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ include 'header.h'
+ include 'mpinpb.h'
+
+ double precision xce_acc(5)
+ integer m
+
+ if (rd_interval .gt. 0) goto 20
+
+ open (unit=99, file=filenm,
+ $ form='unformatted', access='direct',
+ $ recl=record_length)
+
+c clear the last time step
+
+ call clear_timestep
+
+c read back the time steps and accumulate norms
+
+ call acc_sub_norms(idump)
+
+ close(unit=99)
+
+ 20 continue
+ do m = 1, 5
+ xce_acc(m) = xce_sub(m) / dble(idump)
+ end do
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer ierr
+ integer mstatus(MPI_STATUS_SIZE)
+ integer sizes(4), starts(4), subsizes(4)
+ integer cell_btype(maxcells), cell_ftype(maxcells)
+ integer cell_blength(maxcells)
+ integer info
+ character*20 cb_nodes, cb_size
+ integer c, m
+ integer cell_disp(maxcells)
+
+ call mpi_bcast(collbuf_nodes, 1, MPI_INTEGER,
+ > root, comm_setup, ierr)
+
+ call mpi_bcast(collbuf_size, 1, MPI_INTEGER,
+ > root, comm_setup, ierr)
+
+ if (collbuf_nodes .eq. 0) then
+ info = MPI_INFO_NULL
+ else
+ write (cb_nodes,*) collbuf_nodes
+ write (cb_size,*) collbuf_size
+ call MPI_Info_create(info, ierr)
+ call MPI_Info_set(info, 'cb_nodes', cb_nodes, ierr)
+ call MPI_Info_set(info, 'cb_buffer_size', cb_size, ierr)
+ call MPI_Info_set(info, 'collective_buffering', 'true', ierr)
+ endif
+
+ call MPI_Type_contiguous(5, MPI_DOUBLE_PRECISION,
+ $ element, ierr)
+ call MPI_Type_commit(element, ierr)
+ call MPI_Type_extent(element, eltext, ierr)
+
+ do c = 1, ncells
+c
+c Outer array dimensions ar same for every cell
+c
+ sizes(1) = IMAX+4
+ sizes(2) = JMAX+4
+ sizes(3) = KMAX+4
+c
+c 4th dimension is cell number, total of maxcells cells
+c
+ sizes(4) = maxcells
+c
+c Internal dimensions of cells can differ slightly between cells
+c
+ subsizes(1) = cell_size(1, c)
+ subsizes(2) = cell_size(2, c)
+ subsizes(3) = cell_size(3, c)
+c
+c Cell is 4th dimension, 1 cell per cell type to handle varying
+c cell sub-array sizes
+c
+ subsizes(4) = 1
+
+c
+c type constructors use 0-based start addresses
+c
+ starts(1) = 2
+ starts(2) = 2
+ starts(3) = 2
+ starts(4) = c-1
+
+c
+c Create buftype for a cell
+c
+ call MPI_Type_create_subarray(4, sizes, subsizes,
+ $ starts, MPI_ORDER_FORTRAN, element,
+ $ cell_btype(c), ierr)
+c
+c block length and displacement for joining cells -
+c 1 cell buftype per block, cell buftypes have own displacment
+c generated from cell number (4th array dimension)
+c
+ cell_blength(c) = 1
+ cell_disp(c) = 0
+
+ enddo
+c
+c Create combined buftype for all cells
+c
+ call MPI_Type_struct(ncells, cell_blength, cell_disp,
+ $ cell_btype, combined_btype, ierr)
+ call MPI_Type_commit(combined_btype, ierr)
+
+ do c = 1, ncells
+c
+c Entire array size
+c
+ sizes(1) = PROBLEM_SIZE
+ sizes(2) = PROBLEM_SIZE
+ sizes(3) = PROBLEM_SIZE
+
+c
+c Size of c'th cell
+c
+ subsizes(1) = cell_size(1, c)
+ subsizes(2) = cell_size(2, c)
+ subsizes(3) = cell_size(3, c)
+
+c
+c Starting point in full array of c'th cell
+c
+ starts(1) = cell_low(1,c)
+ starts(2) = cell_low(2,c)
+ starts(3) = cell_low(3,c)
+
+ call MPI_Type_create_subarray(3, sizes, subsizes,
+ $ starts, MPI_ORDER_FORTRAN,
+ $ element, cell_ftype(c), ierr)
+ cell_blength(c) = 1
+ cell_disp(c) = 0
+ enddo
+
+ call MPI_Type_struct(ncells, cell_blength, cell_disp,
+ $ cell_ftype, combined_ftype, ierr)
+ call MPI_Type_commit(combined_ftype, ierr)
+
+ iseek=0
+ if (node .eq. root) then
+ call MPI_File_delete(filenm, MPI_INFO_NULL, ierr)
+ endif
+
+
+ call MPI_Barrier(comm_solve, ierr)
+
+ call MPI_File_open(comm_solve,
+ $ filenm,
+ $ MPI_MODE_RDWR+MPI_MODE_CREATE,
+ $ MPI_INFO_NULL, fp, ierr)
+
+ if (ierr .ne. MPI_SUCCESS) then
+ print *, 'Error opening file'
+ stop
+ endif
+
+ call MPI_File_set_view(fp, iseek, element,
+ $ combined_ftype, 'native', info, ierr)
+
+ if (ierr .ne. MPI_SUCCESS) then
+ print *, 'Error setting file view'
+ stop
+ endif
+
+ do m = 1, 5
+ xce_sub(m) = 0.d0
+ end do
+
+ idump_sub = 0
+
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer mstatus(MPI_STATUS_SIZE)
+ integer ierr
+
+ call MPI_File_write_at_all(fp, iseek, u,
+ $ 1, combined_btype, mstatus, ierr)
+ if (ierr .ne. MPI_SUCCESS) then
+ print *, 'Error writing to file'
+ stop
+ endif
+
+ call MPI_Type_size(combined_btype, iosize, ierr)
+ iseek = iseek + iosize/eltext
+
+ idump_sub = idump_sub + 1
+ if (rd_interval .gt. 0) then
+ if (idump_sub .ge. rd_interval) then
+
+ iseek = 0
+ call acc_sub_norms(idump+1)
+
+ iseek = 0
+ idump_sub = 0
+ endif
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine acc_sub_norms(idump_cur)
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer idump_cur
+
+ integer ii, m, ichunk
+ integer ierr
+ integer mstatus(MPI_STATUS_SIZE)
+ double precision xce_single(5)
+
+ ichunk = idump_cur - idump_sub + 1
+ do ii=0, idump_sub-1
+
+ call MPI_File_read_at_all(fp, iseek, u,
+ $ 1, combined_btype, mstatus, ierr)
+ if (ierr .ne. MPI_SUCCESS) then
+ print *, 'Error reading back file'
+ call MPI_File_close(fp, ierr)
+ stop
+ endif
+
+ call MPI_Type_size(combined_btype, iosize, ierr)
+ iseek = iseek + iosize/eltext
+
+ if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+ call error_norm(xce_single)
+ do m = 1, 5
+ xce_sub(m) = xce_sub(m) + xce_single(m)
+ end do
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer ierr
+
+ call MPI_File_close(fp, ierr)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+ subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ double precision xce_acc(5)
+ integer m, ierr
+
+ if (rd_interval .gt. 0) goto 20
+
+ call MPI_File_open(comm_solve,
+ $ filenm,
+ $ MPI_MODE_RDONLY,
+ $ MPI_INFO_NULL,
+ $ fp,
+ $ ierr)
+
+ iseek = 0
+ call MPI_File_set_view(fp, iseek, element, combined_ftype,
+ $ 'native', MPI_INFO_NULL, ierr)
+
+c clear the last time step
+
+ call clear_timestep
+
+c read back the time steps and accumulate norms
+
+ call acc_sub_norms(idump)
+
+ call MPI_File_close(fp, ierr)
+
+ 20 continue
+ do m = 1, 5
+ xce_acc(m) = xce_sub(m) / dble(idump)
+ end do
+
+ return
+ end
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+c
+c header.h
+c
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+c---------------------------------------------------------------------
+c The following include file is generated automatically by the
+c "setparams" utility. It defines
+c maxcells: the square root of the maximum number of processors
+c problem_size: 12, 64, 102, 162 (for class T, A, B, C)
+c dt_default: default time step for this problem size if no
+c config file
+c niter_default: default number of iterations for this problem size
+c---------------------------------------------------------------------
+
+ include 'npbparams.h'
+
+ integer aa, bb, cc, BLOCK_SIZE
+ parameter (aa=1, bb=2, cc=3, BLOCK_SIZE=5)
+
+ integer ncells, grid_points(3)
+ double precision elapsed_time
+ common /global/ elapsed_time, ncells, grid_points
+
+ double precision tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
+ > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4,
+ > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt,
+ > ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2,
+ > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+ > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+ > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+ > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1,
+ > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1,
+ > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2,
+ > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+ > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1,
+ > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6,
+ > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
+
+ common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
+ > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4,
+ > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt,
+ > ce, dxmax, dymax, dzmax, xxcon1, xxcon2,
+ > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+ > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+ > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+ > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1,
+ > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1,
+ > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2,
+ > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+ > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1,
+ > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6,
+ > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
+
+ integer EAST, WEST, NORTH, SOUTH,
+ > BOTTOM, TOP
+
+ parameter (EAST=2000, WEST=3000, NORTH=4000, SOUTH=5000,
+ > BOTTOM=6000, TOP=7000)
+
+ integer cell_coord (3,maxcells), cell_low (3,maxcells),
+ > cell_high (3,maxcells), cell_size(3,maxcells),
+ > predecessor(3), slice (3,maxcells),
+ > grid_size (3), successor(3) ,
+ > start (3,maxcells), end (3,maxcells)
+ common /partition/ cell_coord, cell_low, cell_high, cell_size,
+ > grid_size, successor, predecessor, slice,
+ > start, end
+
+ integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE
+
+ parameter (MAX_CELL_DIM = (problem_size/maxcells)+1)
+
+ parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM)
+
+ parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60+1)
+
+ double precision
+ > us ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > vs ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > ws ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > qs ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > rho_i ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > square ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > forcing (5, 0:IMAX-1, 0:JMAX-1, 0:KMAX-1, maxcells),
+ > u (5, -2:IMAX+1,-2:JMAX+1,-2:KMAX+1, maxcells),
+ > rhs (5, -1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells),
+ > lhsc (5,5,-1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells),
+ > backsub_info (5, 0:MAX_CELL_DIM, 0:MAX_CELL_DIM, maxcells),
+ > in_buffer(BUF_SIZE), out_buffer(BUF_SIZE)
+ common /fields/ u, us, vs, ws, qs, rho_i, square,
+ > rhs, forcing, lhsc, in_buffer, out_buffer,
+ > backsub_info
+
+ double precision cv(-2:MAX_CELL_DIM+1), rhon(-2:MAX_CELL_DIM+1),
+ > rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1),
+ > cuf(-2:MAX_CELL_DIM+1), q(-2:MAX_CELL_DIM+1),
+ > ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5)
+ common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf
+
+ integer west_size, east_size, bottom_size, top_size,
+ > north_size, south_size, start_send_west,
+ > start_send_east, start_send_south, start_send_north,
+ > start_send_bottom, start_send_top, start_recv_west,
+ > start_recv_east, start_recv_south, start_recv_north,
+ > start_recv_bottom, start_recv_top
+ common /box/ west_size, east_size, bottom_size,
+ > top_size, north_size, south_size,
+ > start_send_west, start_send_east, start_send_south,
+ > start_send_north, start_send_bottom, start_send_top,
+ > start_recv_west, start_recv_east, start_recv_south,
+ > start_recv_north, start_recv_bottom, start_recv_top
+
+ double precision tmp_block(5,5), b_inverse(5,5), tmp_vec(5)
+ common /work_solve/ tmp_block, b_inverse, tmp_vec
+
+c
+c These are used by btio
+c
+ integer collbuf_nodes, collbuf_size, iosize, eltext,
+ $ combined_btype, fp, idump, record_length, element,
+ $ combined_ftype, idump_sub, rd_interval
+ common /btio/ collbuf_nodes, collbuf_size, iosize, eltext,
+ $ combined_btype, fp, idump, record_length,
+ $ idump_sub, rd_interval
+ double precision sum(niter_default), xce_sub(5)
+ common /btio/ sum, xce_sub
+ integer*8 iseek
+ common /btio/ iseek, element, combined_ftype
+
+
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine initialize
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This subroutine initializes the field variable u using
+c tri-linear transfinite interpolation of the boundary values
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer c, i, j, k, m, ii, jj, kk, ix, iy, iz
+ double precision xi, eta, zeta, Pface(5,3,2), Pxi, Peta,
+ > Pzeta, temp(5)
+
+c---------------------------------------------------------------------
+c Later (in compute_rhs) we compute 1/u for every element. A few of
+c the corner elements are not used, but it convenient (and faster)
+c to compute the whole thing with a simple loop. Make sure those
+c values are nonzero by initializing the whole thing here.
+c---------------------------------------------------------------------
+ do c = 1, ncells
+ do kk = -1, KMAX
+ do jj = -1, JMAX
+ do ii = -1, IMAX
+ do m = 1, 5
+ u(m, ii, jj, kk, c) = 1.0
+ end do
+ end do
+ end do
+ end do
+ end do
+c---------------------------------------------------------------------
+
+
+
+c---------------------------------------------------------------------
+c first store the "interpolated" values everywhere on the grid
+c---------------------------------------------------------------------
+ do c=1, ncells
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ ii = 0
+ do i = cell_low(1,c), cell_high(1,c)
+ xi = dble(i) * dnxm1
+
+ do ix = 1, 2
+ call exact_solution(dble(ix-1), eta, zeta,
+ > Pface(1,1,ix))
+ enddo
+
+ do iy = 1, 2
+ call exact_solution(xi, dble(iy-1) , zeta,
+ > Pface(1,2,iy))
+ enddo
+
+ do iz = 1, 2
+ call exact_solution(xi, eta, dble(iz-1),
+ > Pface(1,3,iz))
+ enddo
+
+ do m = 1, 5
+ Pxi = xi * Pface(m,1,2) +
+ > (1.0d0-xi) * Pface(m,1,1)
+ Peta = eta * Pface(m,2,2) +
+ > (1.0d0-eta) * Pface(m,2,1)
+ Pzeta = zeta * Pface(m,3,2) +
+ > (1.0d0-zeta) * Pface(m,3,1)
+
+ u(m,ii,jj,kk,c) = Pxi + Peta + Pzeta -
+ > Pxi*Peta - Pxi*Pzeta - Peta*Pzeta +
+ > Pxi*Peta*Pzeta
+
+ enddo
+ ii = ii + 1
+ enddo
+ jj = jj + 1
+ enddo
+ kk = kk+1
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c now store the exact values on the boundaries
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c west face
+c---------------------------------------------------------------------
+ c = slice(1,1)
+ ii = 0
+ xi = 0.0d0
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(m,ii,jj,kk,c) = temp(m)
+ enddo
+ jj = jj + 1
+ enddo
+ kk = kk + 1
+ enddo
+
+c---------------------------------------------------------------------
+c east face
+c---------------------------------------------------------------------
+ c = slice(1,ncells)
+ ii = cell_size(1,c)-1
+ xi = 1.0d0
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(m,ii,jj,kk,c) = temp(m)
+ enddo
+ jj = jj + 1
+ enddo
+ kk = kk + 1
+ enddo
+
+c---------------------------------------------------------------------
+c south face
+c---------------------------------------------------------------------
+ c = slice(2,1)
+ jj = 0
+ eta = 0.0d0
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ ii = 0
+ do i = cell_low(1,c), cell_high(1,c)
+ xi = dble(i) * dnxm1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(m,ii,jj,kk,c) = temp(m)
+ enddo
+ ii = ii + 1
+ enddo
+ kk = kk + 1
+ enddo
+
+
+c---------------------------------------------------------------------
+c north face
+c---------------------------------------------------------------------
+ c = slice(2,ncells)
+ jj = cell_size(2,c)-1
+ eta = 1.0d0
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ ii = 0
+ do i = cell_low(1,c), cell_high(1,c)
+ xi = dble(i) * dnxm1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(m,ii,jj,kk,c) = temp(m)
+ enddo
+ ii = ii + 1
+ enddo
+ kk = kk + 1
+ enddo
+
+c---------------------------------------------------------------------
+c bottom face
+c---------------------------------------------------------------------
+ c = slice(3,1)
+ kk = 0
+ zeta = 0.0d0
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ ii = 0
+ do i =cell_low(1,c), cell_high(1,c)
+ xi = dble(i) *dnxm1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(m,ii,jj,kk,c) = temp(m)
+ enddo
+ ii = ii + 1
+ enddo
+ jj = jj + 1
+ enddo
+
+c---------------------------------------------------------------------
+c top face
+c---------------------------------------------------------------------
+ c = slice(3,ncells)
+ kk = cell_size(3,c)-1
+ zeta = 1.0d0
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ ii = 0
+ do i =cell_low(1,c), cell_high(1,c)
+ xi = dble(i) * dnxm1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(m,ii,jj,kk,c) = temp(m)
+ enddo
+ ii = ii + 1
+ enddo
+ jj = jj + 1
+ enddo
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine lhsinit
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer i, j, k, d, c, m, n
+
+c---------------------------------------------------------------------
+c loop over all cells
+c---------------------------------------------------------------------
+ do c = 1, ncells
+
+c---------------------------------------------------------------------
+c first, initialize the start and end arrays
+c---------------------------------------------------------------------
+ do d = 1, 3
+ if (cell_coord(d,c) .eq. 1) then
+ start(d,c) = 1
+ else
+ start(d,c) = 0
+ endif
+ if (cell_coord(d,c) .eq. ncells) then
+ end(d,c) = 1
+ else
+ end(d,c) = 0
+ endif
+ enddo
+
+c---------------------------------------------------------------------
+c zero the whole left hand side for starters
+c---------------------------------------------------------------------
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ do m = 1,5
+ do n = 1, 5
+ lhsc(m,n,i,j,k,c) = 0.0d0
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ enddo
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine lhsabinit(lhsa, lhsb, size)
+ implicit none
+
+ integer size
+ double precision lhsa(5, 5, -1:size), lhsb(5, 5, -1:size)
+
+ integer i, m, n
+
+c---------------------------------------------------------------------
+c next, set all diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+ do i = 0, size
+ do m = 1, 5
+ do n = 1, 5
+ lhsa(m,n,i) = 0.0d0
+ lhsb(m,n,i) = 0.0d0
+ enddo
+ lhsb(m,m,i) = 1.0d0
+ enddo
+ enddo
+
+ return
+ end
+
+
+
--- /dev/null
+200 number of time steps
+0.0008d0 dt for class A = 0.0008d0. class B = 0.0003d0 class C = 0.0001d0
+64 64 64
+5 0 write interval (optional read interval) for BTIO
+0 1000000 number of nodes in collective buffering and buffer size for BTIO
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine make_set
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This function allocates space for a set of cells and fills the set
+c such that communication between cells on different nodes is only
+c nearest neighbor
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+
+ integer p, i, j, c, dir, size, excess, ierr,ierrcode
+
+c---------------------------------------------------------------------
+c compute square root; add small number to allow for roundoff
+c (note: this is computed in setup_mpi.f also, but prefer to do
+c it twice because of some include file problems).
+c---------------------------------------------------------------------
+ ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0))
+
+c---------------------------------------------------------------------
+c this makes coding easier
+c---------------------------------------------------------------------
+ p = ncells
+
+c---------------------------------------------------------------------
+c determine the location of the cell at the bottom of the 3D
+c array of cells
+c---------------------------------------------------------------------
+ cell_coord(1,1) = mod(node,p)
+ cell_coord(2,1) = node/p
+ cell_coord(3,1) = 0
+
+c---------------------------------------------------------------------
+c set the cell_coords for cells in the rest of the z-layers;
+c this comes down to a simple linear numbering in the z-direct-
+c ion, and to the doubly-cyclic numbering in the other dirs
+c---------------------------------------------------------------------
+ do c=2, p
+ cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p)
+ cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p)
+ cell_coord(3,c) = c-1
+ end do
+
+c---------------------------------------------------------------------
+c offset all the coordinates by 1 to adjust for Fortran arrays
+c---------------------------------------------------------------------
+ do dir = 1, 3
+ do c = 1, p
+ cell_coord(dir,c) = cell_coord(dir,c) + 1
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c slice(dir,n) contains the sequence number of the cell that is in
+c coordinate plane n in the dir direction
+c---------------------------------------------------------------------
+ do dir = 1, 3
+ do c = 1, p
+ slice(dir,cell_coord(dir,c)) = c
+ end do
+ end do
+
+
+c---------------------------------------------------------------------
+c fill the predecessor and successor entries, using the indices
+c of the bottom cells (they are the same at each level of k
+c anyway) acting as if full periodicity pertains; note that p is
+c added to those arguments to the mod functions that might
+c otherwise return wrong values when using the modulo function
+c---------------------------------------------------------------------
+ i = cell_coord(1,1)-1
+ j = cell_coord(2,1)-1
+
+ predecessor(1) = mod(i-1+p,p) + p*j
+ predecessor(2) = i + p*mod(j-1+p,p)
+ predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p)
+ successor(1) = mod(i+1,p) + p*j
+ successor(2) = i + p*mod(j+1,p)
+ successor(3) = mod(i-1+p,p) + p*mod(j+1,p)
+
+c---------------------------------------------------------------------
+c now compute the sizes of the cells
+c---------------------------------------------------------------------
+ do dir= 1, 3
+c---------------------------------------------------------------------
+c set cell_coord range for each direction
+c---------------------------------------------------------------------
+ size = grid_points(dir)/p
+ excess = mod(grid_points(dir),p)
+ do c=1, ncells
+ if (cell_coord(dir,c) .le. excess) then
+ cell_size(dir,c) = size+1
+ cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1)
+ cell_high(dir,c) = cell_low(dir,c)+size
+ else
+ cell_size(dir,c) = size
+ cell_low(dir,c) = excess*(size+1)+
+ > (cell_coord(dir,c)-excess-1)*size
+ cell_high(dir,c) = cell_low(dir,c)+size-1
+ endif
+ if (cell_size(dir, c) .le. 2) then
+ write(*,50)
+ 50 format(' Error: Cell size too small. Min size is 3')
+ call MPI_Abort(mpi_comm_world,ierrcode,ierr)
+ stop
+ endif
+ end do
+ end do
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'mpif.h'
+
+ integer node, no_nodes, total_nodes, root, comm_setup,
+ > comm_solve, comm_rhs, dp_type
+ logical active
+ common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup,
+ > comm_solve, comm_rhs, dp_type, active
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine compute_rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer c, i, j, k, m
+ double precision rho_inv, uijk, up1, um1, vijk, vp1, vm1,
+ > wijk, wp1, wm1
+
+
+c---------------------------------------------------------------------
+c loop over all cells owned by this node
+c---------------------------------------------------------------------
+ do c = 1, ncells
+
+c---------------------------------------------------------------------
+c compute the reciprocal of density, and the kinetic energy,
+c and the speed of sound.
+c---------------------------------------------------------------------
+ do k = -1, cell_size(3,c)
+ do j = -1, cell_size(2,c)
+ do i = -1, cell_size(1,c)
+ rho_inv = 1.0d0/u(1,i,j,k,c)
+ rho_i(i,j,k,c) = rho_inv
+ us(i,j,k,c) = u(2,i,j,k,c) * rho_inv
+ vs(i,j,k,c) = u(3,i,j,k,c) * rho_inv
+ ws(i,j,k,c) = u(4,i,j,k,c) * rho_inv
+ square(i,j,k,c) = 0.5d0* (
+ > u(2,i,j,k,c)*u(2,i,j,k,c) +
+ > u(3,i,j,k,c)*u(3,i,j,k,c) +
+ > u(4,i,j,k,c)*u(4,i,j,k,c) ) * rho_inv
+ qs(i,j,k,c) = square(i,j,k,c) * rho_inv
+ enddo
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c copy the exact forcing term to the right hand side; because
+c this forcing term is known, we can store it on the whole of every
+c cell, including the boundary
+c---------------------------------------------------------------------
+
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = forcing(m,i,j,k,c)
+ enddo
+ enddo
+ enddo
+ enddo
+
+
+c---------------------------------------------------------------------
+c compute xi-direction fluxes
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ uijk = us(i,j,k,c)
+ up1 = us(i+1,j,k,c)
+ um1 = us(i-1,j,k,c)
+
+ rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dx1tx1 *
+ > (u(1,i+1,j,k,c) - 2.0d0*u(1,i,j,k,c) +
+ > u(1,i-1,j,k,c)) -
+ > tx2 * (u(2,i+1,j,k,c) - u(2,i-1,j,k,c))
+
+ rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dx2tx1 *
+ > (u(2,i+1,j,k,c) - 2.0d0*u(2,i,j,k,c) +
+ > u(2,i-1,j,k,c)) +
+ > xxcon2*con43 * (up1 - 2.0d0*uijk + um1) -
+ > tx2 * (u(2,i+1,j,k,c)*up1 -
+ > u(2,i-1,j,k,c)*um1 +
+ > (u(5,i+1,j,k,c)- square(i+1,j,k,c)-
+ > u(5,i-1,j,k,c)+ square(i-1,j,k,c))*
+ > c2)
+
+ rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dx3tx1 *
+ > (u(3,i+1,j,k,c) - 2.0d0*u(3,i,j,k,c) +
+ > u(3,i-1,j,k,c)) +
+ > xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) +
+ > vs(i-1,j,k,c)) -
+ > tx2 * (u(3,i+1,j,k,c)*up1 -
+ > u(3,i-1,j,k,c)*um1)
+
+ rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dx4tx1 *
+ > (u(4,i+1,j,k,c) - 2.0d0*u(4,i,j,k,c) +
+ > u(4,i-1,j,k,c)) +
+ > xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) +
+ > ws(i-1,j,k,c)) -
+ > tx2 * (u(4,i+1,j,k,c)*up1 -
+ > u(4,i-1,j,k,c)*um1)
+
+ rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dx5tx1 *
+ > (u(5,i+1,j,k,c) - 2.0d0*u(5,i,j,k,c) +
+ > u(5,i-1,j,k,c)) +
+ > xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) +
+ > qs(i-1,j,k,c)) +
+ > xxcon4 * (up1*up1 - 2.0d0*uijk*uijk +
+ > um1*um1) +
+ > xxcon5 * (u(5,i+1,j,k,c)*rho_i(i+1,j,k,c) -
+ > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+ > u(5,i-1,j,k,c)*rho_i(i-1,j,k,c)) -
+ > tx2 * ( (c1*u(5,i+1,j,k,c) -
+ > c2*square(i+1,j,k,c))*up1 -
+ > (c1*u(5,i-1,j,k,c) -
+ > c2*square(i-1,j,k,c))*um1 )
+ enddo
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c add fourth order xi-direction dissipation
+c---------------------------------------------------------------------
+ if (start(1,c) .gt. 0) then
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ i = 1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp *
+ > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) +
+ > u(m,i+2,j,k,c))
+ enddo
+
+ i = 2
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > (-4.0d0*u(m,i-1,j,k,c) + 6.0d0*u(m,i,j,k,c) -
+ > 4.0d0*u(m,i+1,j,k,c) + u(m,i+2,j,k,c))
+ enddo
+ enddo
+ enddo
+ endif
+
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > ( u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) +
+ > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) +
+ > u(m,i+2,j,k,c) )
+ enddo
+ enddo
+ enddo
+ enddo
+
+
+ if (end(1,c) .gt. 0) then
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ i = cell_size(1,c)-3
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > ( u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) +
+ > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) )
+ enddo
+
+ i = cell_size(1,c)-2
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > ( u(m,i-2,j,k,c) - 4.d0*u(m,i-1,j,k,c) +
+ > 5.d0*u(m,i,j,k,c) )
+ enddo
+ enddo
+ enddo
+ endif
+
+c---------------------------------------------------------------------
+c compute eta-direction fluxes
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ vijk = vs(i,j,k,c)
+ vp1 = vs(i,j+1,k,c)
+ vm1 = vs(i,j-1,k,c)
+ rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dy1ty1 *
+ > (u(1,i,j+1,k,c) - 2.0d0*u(1,i,j,k,c) +
+ > u(1,i,j-1,k,c)) -
+ > ty2 * (u(3,i,j+1,k,c) - u(3,i,j-1,k,c))
+ rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dy2ty1 *
+ > (u(2,i,j+1,k,c) - 2.0d0*u(2,i,j,k,c) +
+ > u(2,i,j-1,k,c)) +
+ > yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) +
+ > us(i,j-1,k,c)) -
+ > ty2 * (u(2,i,j+1,k,c)*vp1 -
+ > u(2,i,j-1,k,c)*vm1)
+ rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dy3ty1 *
+ > (u(3,i,j+1,k,c) - 2.0d0*u(3,i,j,k,c) +
+ > u(3,i,j-1,k,c)) +
+ > yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) -
+ > ty2 * (u(3,i,j+1,k,c)*vp1 -
+ > u(3,i,j-1,k,c)*vm1 +
+ > (u(5,i,j+1,k,c) - square(i,j+1,k,c) -
+ > u(5,i,j-1,k,c) + square(i,j-1,k,c))
+ > *c2)
+ rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dy4ty1 *
+ > (u(4,i,j+1,k,c) - 2.0d0*u(4,i,j,k,c) +
+ > u(4,i,j-1,k,c)) +
+ > yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) +
+ > ws(i,j-1,k,c)) -
+ > ty2 * (u(4,i,j+1,k,c)*vp1 -
+ > u(4,i,j-1,k,c)*vm1)
+ rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dy5ty1 *
+ > (u(5,i,j+1,k,c) - 2.0d0*u(5,i,j,k,c) +
+ > u(5,i,j-1,k,c)) +
+ > yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) +
+ > qs(i,j-1,k,c)) +
+ > yycon4 * (vp1*vp1 - 2.0d0*vijk*vijk +
+ > vm1*vm1) +
+ > yycon5 * (u(5,i,j+1,k,c)*rho_i(i,j+1,k,c) -
+ > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+ > u(5,i,j-1,k,c)*rho_i(i,j-1,k,c)) -
+ > ty2 * ((c1*u(5,i,j+1,k,c) -
+ > c2*square(i,j+1,k,c)) * vp1 -
+ > (c1*u(5,i,j-1,k,c) -
+ > c2*square(i,j-1,k,c)) * vm1)
+ enddo
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c add fourth order eta-direction dissipation
+c---------------------------------------------------------------------
+ if (start(2,c) .gt. 0) then
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ j = 1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp *
+ > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) +
+ > u(m,i,j+2,k,c))
+ enddo
+ enddo
+
+ j = 2
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > (-4.0d0*u(m,i,j-1,k,c) + 6.0d0*u(m,i,j,k,c) -
+ > 4.0d0*u(m,i,j+1,k,c) + u(m,i,j+2,k,c))
+ enddo
+ enddo
+ enddo
+ endif
+
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1
+ do i = start(1,c),cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > ( u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) +
+ > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) +
+ > u(m,i,j+2,k,c) )
+ enddo
+ enddo
+ enddo
+ enddo
+
+ if (end(2,c) .gt. 0) then
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ j = cell_size(2,c)-3
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > ( u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) +
+ > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) )
+ enddo
+ enddo
+
+ j = cell_size(2,c)-2
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > ( u(m,i,j-2,k,c) - 4.d0*u(m,i,j-1,k,c) +
+ > 5.d0*u(m,i,j,k,c) )
+ enddo
+ enddo
+ enddo
+ endif
+
+c---------------------------------------------------------------------
+c compute zeta-direction fluxes
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ wijk = ws(i,j,k,c)
+ wp1 = ws(i,j,k+1,c)
+ wm1 = ws(i,j,k-1,c)
+
+ rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dz1tz1 *
+ > (u(1,i,j,k+1,c) - 2.0d0*u(1,i,j,k,c) +
+ > u(1,i,j,k-1,c)) -
+ > tz2 * (u(4,i,j,k+1,c) - u(4,i,j,k-1,c))
+ rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dz2tz1 *
+ > (u(2,i,j,k+1,c) - 2.0d0*u(2,i,j,k,c) +
+ > u(2,i,j,k-1,c)) +
+ > zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) +
+ > us(i,j,k-1,c)) -
+ > tz2 * (u(2,i,j,k+1,c)*wp1 -
+ > u(2,i,j,k-1,c)*wm1)
+ rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dz3tz1 *
+ > (u(3,i,j,k+1,c) - 2.0d0*u(3,i,j,k,c) +
+ > u(3,i,j,k-1,c)) +
+ > zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) +
+ > vs(i,j,k-1,c)) -
+ > tz2 * (u(3,i,j,k+1,c)*wp1 -
+ > u(3,i,j,k-1,c)*wm1)
+ rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dz4tz1 *
+ > (u(4,i,j,k+1,c) - 2.0d0*u(4,i,j,k,c) +
+ > u(4,i,j,k-1,c)) +
+ > zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) -
+ > tz2 * (u(4,i,j,k+1,c)*wp1 -
+ > u(4,i,j,k-1,c)*wm1 +
+ > (u(5,i,j,k+1,c) - square(i,j,k+1,c) -
+ > u(5,i,j,k-1,c) + square(i,j,k-1,c))
+ > *c2)
+ rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dz5tz1 *
+ > (u(5,i,j,k+1,c) - 2.0d0*u(5,i,j,k,c) +
+ > u(5,i,j,k-1,c)) +
+ > zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) +
+ > qs(i,j,k-1,c)) +
+ > zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk +
+ > wm1*wm1) +
+ > zzcon5 * (u(5,i,j,k+1,c)*rho_i(i,j,k+1,c) -
+ > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+ > u(5,i,j,k-1,c)*rho_i(i,j,k-1,c)) -
+ > tz2 * ( (c1*u(5,i,j,k+1,c) -
+ > c2*square(i,j,k+1,c))*wp1 -
+ > (c1*u(5,i,j,k-1,c) -
+ > c2*square(i,j,k-1,c))*wm1)
+ enddo
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c add fourth order zeta-direction dissipation
+c---------------------------------------------------------------------
+ if (start(3,c) .gt. 0) then
+ k = 1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp *
+ > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) +
+ > u(m,i,j,k+2,c))
+ enddo
+ enddo
+ enddo
+
+ k = 2
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > (-4.0d0*u(m,i,j,k-1,c) + 6.0d0*u(m,i,j,k,c) -
+ > 4.0d0*u(m,i,j,k+1,c) + u(m,i,j,k+2,c))
+ enddo
+ enddo
+ enddo
+ endif
+
+ do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c),cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > ( u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) +
+ > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) +
+ > u(m,i,j,k+2,c) )
+ enddo
+ enddo
+ enddo
+ enddo
+
+ if (end(3,c) .gt. 0) then
+ k = cell_size(3,c)-3
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > ( u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) +
+ > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) )
+ enddo
+ enddo
+ enddo
+
+ k = cell_size(3,c)-2
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+ > ( u(m,i,j,k-2,c) - 4.d0*u(m,i,j,k-1,c) +
+ > 5.d0*u(m,i,j,k,c) )
+ enddo
+ enddo
+ enddo
+ endif
+
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c) * dt
+ enddo
+ enddo
+ enddo
+ enddo
+
+ enddo
+
+ return
+ end
+
+
+
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine set_constants
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ ce(1,1) = 2.0d0
+ ce(1,2) = 0.0d0
+ ce(1,3) = 0.0d0
+ ce(1,4) = 4.0d0
+ ce(1,5) = 5.0d0
+ ce(1,6) = 3.0d0
+ ce(1,7) = 0.5d0
+ ce(1,8) = 0.02d0
+ ce(1,9) = 0.01d0
+ ce(1,10) = 0.03d0
+ ce(1,11) = 0.5d0
+ ce(1,12) = 0.4d0
+ ce(1,13) = 0.3d0
+
+ ce(2,1) = 1.0d0
+ ce(2,2) = 0.0d0
+ ce(2,3) = 0.0d0
+ ce(2,4) = 0.0d0
+ ce(2,5) = 1.0d0
+ ce(2,6) = 2.0d0
+ ce(2,7) = 3.0d0
+ ce(2,8) = 0.01d0
+ ce(2,9) = 0.03d0
+ ce(2,10) = 0.02d0
+ ce(2,11) = 0.4d0
+ ce(2,12) = 0.3d0
+ ce(2,13) = 0.5d0
+
+ ce(3,1) = 2.0d0
+ ce(3,2) = 2.0d0
+ ce(3,3) = 0.0d0
+ ce(3,4) = 0.0d0
+ ce(3,5) = 0.0d0
+ ce(3,6) = 2.0d0
+ ce(3,7) = 3.0d0
+ ce(3,8) = 0.04d0
+ ce(3,9) = 0.03d0
+ ce(3,10) = 0.05d0
+ ce(3,11) = 0.3d0
+ ce(3,12) = 0.5d0
+ ce(3,13) = 0.4d0
+
+ ce(4,1) = 2.0d0
+ ce(4,2) = 2.0d0
+ ce(4,3) = 0.0d0
+ ce(4,4) = 0.0d0
+ ce(4,5) = 0.0d0
+ ce(4,6) = 2.0d0
+ ce(4,7) = 3.0d0
+ ce(4,8) = 0.03d0
+ ce(4,9) = 0.05d0
+ ce(4,10) = 0.04d0
+ ce(4,11) = 0.2d0
+ ce(4,12) = 0.1d0
+ ce(4,13) = 0.3d0
+
+ ce(5,1) = 5.0d0
+ ce(5,2) = 4.0d0
+ ce(5,3) = 3.0d0
+ ce(5,4) = 2.0d0
+ ce(5,5) = 0.1d0
+ ce(5,6) = 0.4d0
+ ce(5,7) = 0.3d0
+ ce(5,8) = 0.05d0
+ ce(5,9) = 0.04d0
+ ce(5,10) = 0.03d0
+ ce(5,11) = 0.1d0
+ ce(5,12) = 0.3d0
+ ce(5,13) = 0.2d0
+
+ c1 = 1.4d0
+ c2 = 0.4d0
+ c3 = 0.1d0
+ c4 = 1.0d0
+ c5 = 1.4d0
+
+ bt = dsqrt(0.5d0)
+
+ dnxm1 = 1.0d0 / dble(grid_points(1)-1)
+ dnym1 = 1.0d0 / dble(grid_points(2)-1)
+ dnzm1 = 1.0d0 / dble(grid_points(3)-1)
+
+ c1c2 = c1 * c2
+ c1c5 = c1 * c5
+ c3c4 = c3 * c4
+ c1345 = c1c5 * c3c4
+
+ conz1 = (1.0d0-c1c5)
+
+ tx1 = 1.0d0 / (dnxm1 * dnxm1)
+ tx2 = 1.0d0 / (2.0d0 * dnxm1)
+ tx3 = 1.0d0 / dnxm1
+
+ ty1 = 1.0d0 / (dnym1 * dnym1)
+ ty2 = 1.0d0 / (2.0d0 * dnym1)
+ ty3 = 1.0d0 / dnym1
+
+ tz1 = 1.0d0 / (dnzm1 * dnzm1)
+ tz2 = 1.0d0 / (2.0d0 * dnzm1)
+ tz3 = 1.0d0 / dnzm1
+
+ dx1 = 0.75d0
+ dx2 = 0.75d0
+ dx3 = 0.75d0
+ dx4 = 0.75d0
+ dx5 = 0.75d0
+
+ dy1 = 0.75d0
+ dy2 = 0.75d0
+ dy3 = 0.75d0
+ dy4 = 0.75d0
+ dy5 = 0.75d0
+
+ dz1 = 1.0d0
+ dz2 = 1.0d0
+ dz3 = 1.0d0
+ dz4 = 1.0d0
+ dz5 = 1.0d0
+
+ dxmax = dmax1(dx3, dx4)
+ dymax = dmax1(dy2, dy4)
+ dzmax = dmax1(dz2, dz3)
+
+ dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) )
+
+ c4dssp = 4.0d0 * dssp
+ c5dssp = 5.0d0 * dssp
+
+ dttx1 = dt*tx1
+ dttx2 = dt*tx2
+ dtty1 = dt*ty1
+ dtty2 = dt*ty2
+ dttz1 = dt*tz1
+ dttz2 = dt*tz2
+
+ c2dttx1 = 2.0d0*dttx1
+ c2dtty1 = 2.0d0*dtty1
+ c2dttz1 = 2.0d0*dttz1
+
+ dtdssp = dt*dssp
+
+ comz1 = dtdssp
+ comz4 = 4.0d0*dtdssp
+ comz5 = 5.0d0*dtdssp
+ comz6 = 6.0d0*dtdssp
+
+ c3c4tx3 = c3c4*tx3
+ c3c4ty3 = c3c4*ty3
+ c3c4tz3 = c3c4*tz3
+
+ dx1tx1 = dx1*tx1
+ dx2tx1 = dx2*tx1
+ dx3tx1 = dx3*tx1
+ dx4tx1 = dx4*tx1
+ dx5tx1 = dx5*tx1
+
+ dy1ty1 = dy1*ty1
+ dy2ty1 = dy2*ty1
+ dy3ty1 = dy3*ty1
+ dy4ty1 = dy4*ty1
+ dy5ty1 = dy5*ty1
+
+ dz1tz1 = dz1*tz1
+ dz2tz1 = dz2*tz1
+ dz3tz1 = dz3*tz1
+ dz4tz1 = dz4*tz1
+ dz5tz1 = dz5*tz1
+
+ c2iv = 2.5d0
+ con43 = 4.0d0/3.0d0
+ con16 = 1.0d0/6.0d0
+
+ xxcon1 = c3c4tx3*con43*tx3
+ xxcon2 = c3c4tx3*tx3
+ xxcon3 = c3c4tx3*conz1*tx3
+ xxcon4 = c3c4tx3*con16*tx3
+ xxcon5 = c3c4tx3*c1c5*tx3
+
+ yycon1 = c3c4ty3*con43*ty3
+ yycon2 = c3c4ty3*ty3
+ yycon3 = c3c4ty3*conz1*ty3
+ yycon4 = c3c4ty3*con16*ty3
+ yycon5 = c3c4ty3*c1c5*ty3
+
+ zzcon1 = c3c4tz3*con43*tz3
+ zzcon2 = c3c4tz3*tz3
+ zzcon3 = c3c4tz3*conz1*tz3
+ zzcon4 = c3c4tz3*con16*tz3
+ zzcon5 = c3c4tz3*c1c5*tz3
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine setup_mpi
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c set up MPI stuff
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'mpinpb.h'
+ include 'npbparams.h'
+ integer error, color, nc
+
+ call mpi_init(error)
+
+ call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error)
+ call mpi_comm_rank(MPI_COMM_WORLD, node, error)
+
+ if (.not. convertdouble) then
+ dp_type = MPI_DOUBLE_PRECISION
+ else
+ dp_type = MPI_REAL
+ endif
+
+c---------------------------------------------------------------------
+c compute square root; add small number to allow for roundoff
+c---------------------------------------------------------------------
+ nc = dint(dsqrt(dble(total_nodes) + 0.00001d0))
+
+c---------------------------------------------------------------------
+c We handle a non-square number of nodes by making the excess nodes
+c inactive. However, we can never handle more cells than were compiled
+c in.
+c---------------------------------------------------------------------
+
+ if (nc .gt. maxcells) nc = maxcells
+ if (node .ge. nc*nc) then
+ active = .false.
+ color = 1
+ else
+ active = .true.
+ color = 0
+ end if
+
+ call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error)
+ if (.not. active) return
+
+ call mpi_comm_size(comm_setup, no_nodes, error)
+ call mpi_comm_dup(comm_setup, comm_solve, error)
+ call mpi_comm_dup(comm_setup, comm_rhs, error)
+
+c---------------------------------------------------------------------
+c let node 0 be the root for the group (there is only one)
+c---------------------------------------------------------------------
+ root = 0
+
+ return
+ end
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer m, ierr
+
+ iseek=0
+
+ if (node .eq. root) then
+ call MPI_File_delete(filenm, MPI_INFO_NULL, ierr)
+ endif
+
+ call MPI_Barrier(comm_solve, ierr)
+
+ call MPI_File_open(comm_solve,
+ $ filenm,
+ $ MPI_MODE_RDWR + MPI_MODE_CREATE,
+ $ MPI_INFO_NULL,
+ $ fp,
+ $ ierr)
+
+ call MPI_File_set_view(fp,
+ $ iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION,
+ $ 'native', MPI_INFO_NULL, ierr)
+
+ if (ierr .ne. MPI_SUCCESS) then
+ print *, 'Error opening file'
+ stop
+ endif
+
+ do m = 1, 5
+ xce_sub(m) = 0.d0
+ end do
+
+ idump_sub = 0
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer count, jio, kio, cio, aio
+ integer ierr
+ integer mstatus(MPI_STATUS_SIZE)
+
+ do cio=1,ncells
+ do kio=0, cell_size(3,cio)-1
+ do jio=0, cell_size(2,cio)-1
+ iseek=5*(cell_low(1,cio) +
+ $ PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+ $ PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+ $ PROBLEM_SIZE*idump_sub)))
+
+ count=5*cell_size(1,cio)
+
+ call MPI_File_write_at(fp, iseek,
+ $ u(1,0,jio,kio,cio),
+ $ count, MPI_DOUBLE_PRECISION,
+ $ mstatus, ierr)
+
+ if (ierr .ne. MPI_SUCCESS) then
+ print *, 'Error writing to file'
+ stop
+ endif
+ enddo
+ enddo
+ enddo
+
+ idump_sub = idump_sub + 1
+ if (rd_interval .gt. 0) then
+ if (idump_sub .ge. rd_interval) then
+
+ call acc_sub_norms(idump+1)
+
+ idump_sub = 0
+ endif
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine acc_sub_norms(idump_cur)
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer idump_cur
+
+ integer count, jio, kio, cio, ii, m, ichunk
+ integer ierr
+ integer mstatus(MPI_STATUS_SIZE)
+ double precision xce_single(5)
+
+ ichunk = idump_cur - idump_sub + 1
+ do ii=0, idump_sub-1
+ do cio=1,ncells
+ do kio=0, cell_size(3,cio)-1
+ do jio=0, cell_size(2,cio)-1
+ iseek=5*(cell_low(1,cio) +
+ $ PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+ $ PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+ $ PROBLEM_SIZE*ii)))
+
+ count=5*cell_size(1,cio)
+
+ call MPI_File_read_at(fp, iseek,
+ $ u(1,0,jio,kio,cio),
+ $ count, MPI_DOUBLE_PRECISION,
+ $ mstatus, ierr)
+
+ if (ierr .ne. MPI_SUCCESS) then
+ print *, 'Error reading back file'
+ call MPI_File_close(fp, ierr)
+ stop
+ endif
+ enddo
+ enddo
+ enddo
+
+ if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+ call error_norm(xce_single)
+ do m = 1, 5
+ xce_sub(m) = xce_sub(m) + xce_single(m)
+ end do
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer ierr
+
+ call MPI_File_close(fp, ierr)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ double precision xce_acc(5)
+ integer m, ierr
+
+ if (rd_interval .gt. 0) goto 20
+
+ call MPI_File_open(comm_solve,
+ $ filenm,
+ $ MPI_MODE_RDONLY,
+ $ MPI_INFO_NULL,
+ $ fp,
+ $ ierr)
+
+ iseek = 0
+ call MPI_File_set_view(fp,
+ $ iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION,
+ $ 'native', MPI_INFO_NULL, ierr)
+
+c clear the last time step
+
+ call clear_timestep
+
+c read back the time steps and accumulate norms
+
+ call acc_sub_norms(idump)
+
+ call MPI_File_close(fp, ierr)
+
+ 20 continue
+ do m = 1, 5
+ xce_acc(m) = xce_sub(m) / dble(idump)
+ end do
+
+ return
+ end
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine matvec_sub(ablock,avec,bvec)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c subtracts bvec=bvec - ablock*avec
+c---------------------------------------------------------------------
+
+ implicit none
+
+ double precision ablock,avec,bvec
+ dimension ablock(5,5),avec(5),bvec(5)
+
+c---------------------------------------------------------------------
+c rhs(i,ic,jc,kc,ccell) = rhs(i,ic,jc,kc,ccell)
+c $ - lhs(i,1,ablock,ia,ja,ka,acell)*
+c---------------------------------------------------------------------
+ bvec(1) = bvec(1) - ablock(1,1)*avec(1)
+ > - ablock(1,2)*avec(2)
+ > - ablock(1,3)*avec(3)
+ > - ablock(1,4)*avec(4)
+ > - ablock(1,5)*avec(5)
+ bvec(2) = bvec(2) - ablock(2,1)*avec(1)
+ > - ablock(2,2)*avec(2)
+ > - ablock(2,3)*avec(3)
+ > - ablock(2,4)*avec(4)
+ > - ablock(2,5)*avec(5)
+ bvec(3) = bvec(3) - ablock(3,1)*avec(1)
+ > - ablock(3,2)*avec(2)
+ > - ablock(3,3)*avec(3)
+ > - ablock(3,4)*avec(4)
+ > - ablock(3,5)*avec(5)
+ bvec(4) = bvec(4) - ablock(4,1)*avec(1)
+ > - ablock(4,2)*avec(2)
+ > - ablock(4,3)*avec(3)
+ > - ablock(4,4)*avec(4)
+ > - ablock(4,5)*avec(5)
+ bvec(5) = bvec(5) - ablock(5,1)*avec(1)
+ > - ablock(5,2)*avec(2)
+ > - ablock(5,3)*avec(3)
+ > - ablock(5,4)*avec(4)
+ > - ablock(5,5)*avec(5)
+
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine matmul_sub(ablock, bblock, cblock)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c subtracts a(i,j,k) X b(i,j,k) from c(i,j,k)
+c---------------------------------------------------------------------
+
+ implicit none
+
+ double precision ablock, bblock, cblock
+ dimension ablock(5,5), bblock(5,5), cblock(5,5)
+
+
+ cblock(1,1) = cblock(1,1) - ablock(1,1)*bblock(1,1)
+ > - ablock(1,2)*bblock(2,1)
+ > - ablock(1,3)*bblock(3,1)
+ > - ablock(1,4)*bblock(4,1)
+ > - ablock(1,5)*bblock(5,1)
+ cblock(2,1) = cblock(2,1) - ablock(2,1)*bblock(1,1)
+ > - ablock(2,2)*bblock(2,1)
+ > - ablock(2,3)*bblock(3,1)
+ > - ablock(2,4)*bblock(4,1)
+ > - ablock(2,5)*bblock(5,1)
+ cblock(3,1) = cblock(3,1) - ablock(3,1)*bblock(1,1)
+ > - ablock(3,2)*bblock(2,1)
+ > - ablock(3,3)*bblock(3,1)
+ > - ablock(3,4)*bblock(4,1)
+ > - ablock(3,5)*bblock(5,1)
+ cblock(4,1) = cblock(4,1) - ablock(4,1)*bblock(1,1)
+ > - ablock(4,2)*bblock(2,1)
+ > - ablock(4,3)*bblock(3,1)
+ > - ablock(4,4)*bblock(4,1)
+ > - ablock(4,5)*bblock(5,1)
+ cblock(5,1) = cblock(5,1) - ablock(5,1)*bblock(1,1)
+ > - ablock(5,2)*bblock(2,1)
+ > - ablock(5,3)*bblock(3,1)
+ > - ablock(5,4)*bblock(4,1)
+ > - ablock(5,5)*bblock(5,1)
+ cblock(1,2) = cblock(1,2) - ablock(1,1)*bblock(1,2)
+ > - ablock(1,2)*bblock(2,2)
+ > - ablock(1,3)*bblock(3,2)
+ > - ablock(1,4)*bblock(4,2)
+ > - ablock(1,5)*bblock(5,2)
+ cblock(2,2) = cblock(2,2) - ablock(2,1)*bblock(1,2)
+ > - ablock(2,2)*bblock(2,2)
+ > - ablock(2,3)*bblock(3,2)
+ > - ablock(2,4)*bblock(4,2)
+ > - ablock(2,5)*bblock(5,2)
+ cblock(3,2) = cblock(3,2) - ablock(3,1)*bblock(1,2)
+ > - ablock(3,2)*bblock(2,2)
+ > - ablock(3,3)*bblock(3,2)
+ > - ablock(3,4)*bblock(4,2)
+ > - ablock(3,5)*bblock(5,2)
+ cblock(4,2) = cblock(4,2) - ablock(4,1)*bblock(1,2)
+ > - ablock(4,2)*bblock(2,2)
+ > - ablock(4,3)*bblock(3,2)
+ > - ablock(4,4)*bblock(4,2)
+ > - ablock(4,5)*bblock(5,2)
+ cblock(5,2) = cblock(5,2) - ablock(5,1)*bblock(1,2)
+ > - ablock(5,2)*bblock(2,2)
+ > - ablock(5,3)*bblock(3,2)
+ > - ablock(5,4)*bblock(4,2)
+ > - ablock(5,5)*bblock(5,2)
+ cblock(1,3) = cblock(1,3) - ablock(1,1)*bblock(1,3)
+ > - ablock(1,2)*bblock(2,3)
+ > - ablock(1,3)*bblock(3,3)
+ > - ablock(1,4)*bblock(4,3)
+ > - ablock(1,5)*bblock(5,3)
+ cblock(2,3) = cblock(2,3) - ablock(2,1)*bblock(1,3)
+ > - ablock(2,2)*bblock(2,3)
+ > - ablock(2,3)*bblock(3,3)
+ > - ablock(2,4)*bblock(4,3)
+ > - ablock(2,5)*bblock(5,3)
+ cblock(3,3) = cblock(3,3) - ablock(3,1)*bblock(1,3)
+ > - ablock(3,2)*bblock(2,3)
+ > - ablock(3,3)*bblock(3,3)
+ > - ablock(3,4)*bblock(4,3)
+ > - ablock(3,5)*bblock(5,3)
+ cblock(4,3) = cblock(4,3) - ablock(4,1)*bblock(1,3)
+ > - ablock(4,2)*bblock(2,3)
+ > - ablock(4,3)*bblock(3,3)
+ > - ablock(4,4)*bblock(4,3)
+ > - ablock(4,5)*bblock(5,3)
+ cblock(5,3) = cblock(5,3) - ablock(5,1)*bblock(1,3)
+ > - ablock(5,2)*bblock(2,3)
+ > - ablock(5,3)*bblock(3,3)
+ > - ablock(5,4)*bblock(4,3)
+ > - ablock(5,5)*bblock(5,3)
+ cblock(1,4) = cblock(1,4) - ablock(1,1)*bblock(1,4)
+ > - ablock(1,2)*bblock(2,4)
+ > - ablock(1,3)*bblock(3,4)
+ > - ablock(1,4)*bblock(4,4)
+ > - ablock(1,5)*bblock(5,4)
+ cblock(2,4) = cblock(2,4) - ablock(2,1)*bblock(1,4)
+ > - ablock(2,2)*bblock(2,4)
+ > - ablock(2,3)*bblock(3,4)
+ > - ablock(2,4)*bblock(4,4)
+ > - ablock(2,5)*bblock(5,4)
+ cblock(3,4) = cblock(3,4) - ablock(3,1)*bblock(1,4)
+ > - ablock(3,2)*bblock(2,4)
+ > - ablock(3,3)*bblock(3,4)
+ > - ablock(3,4)*bblock(4,4)
+ > - ablock(3,5)*bblock(5,4)
+ cblock(4,4) = cblock(4,4) - ablock(4,1)*bblock(1,4)
+ > - ablock(4,2)*bblock(2,4)
+ > - ablock(4,3)*bblock(3,4)
+ > - ablock(4,4)*bblock(4,4)
+ > - ablock(4,5)*bblock(5,4)
+ cblock(5,4) = cblock(5,4) - ablock(5,1)*bblock(1,4)
+ > - ablock(5,2)*bblock(2,4)
+ > - ablock(5,3)*bblock(3,4)
+ > - ablock(5,4)*bblock(4,4)
+ > - ablock(5,5)*bblock(5,4)
+ cblock(1,5) = cblock(1,5) - ablock(1,1)*bblock(1,5)
+ > - ablock(1,2)*bblock(2,5)
+ > - ablock(1,3)*bblock(3,5)
+ > - ablock(1,4)*bblock(4,5)
+ > - ablock(1,5)*bblock(5,5)
+ cblock(2,5) = cblock(2,5) - ablock(2,1)*bblock(1,5)
+ > - ablock(2,2)*bblock(2,5)
+ > - ablock(2,3)*bblock(3,5)
+ > - ablock(2,4)*bblock(4,5)
+ > - ablock(2,5)*bblock(5,5)
+ cblock(3,5) = cblock(3,5) - ablock(3,1)*bblock(1,5)
+ > - ablock(3,2)*bblock(2,5)
+ > - ablock(3,3)*bblock(3,5)
+ > - ablock(3,4)*bblock(4,5)
+ > - ablock(3,5)*bblock(5,5)
+ cblock(4,5) = cblock(4,5) - ablock(4,1)*bblock(1,5)
+ > - ablock(4,2)*bblock(2,5)
+ > - ablock(4,3)*bblock(3,5)
+ > - ablock(4,4)*bblock(4,5)
+ > - ablock(4,5)*bblock(5,5)
+ cblock(5,5) = cblock(5,5) - ablock(5,1)*bblock(1,5)
+ > - ablock(5,2)*bblock(2,5)
+ > - ablock(5,3)*bblock(3,5)
+ > - ablock(5,4)*bblock(4,5)
+ > - ablock(5,5)*bblock(5,5)
+
+
+ return
+ end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine binvcrhs( lhs,c,r )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+ double precision pivot, coeff, lhs
+ dimension lhs(5,5)
+ double precision c(5,5), r(5)
+
+c---------------------------------------------------------------------
+c
+c---------------------------------------------------------------------
+
+ pivot = 1.00d0/lhs(1,1)
+ lhs(1,2) = lhs(1,2)*pivot
+ lhs(1,3) = lhs(1,3)*pivot
+ lhs(1,4) = lhs(1,4)*pivot
+ lhs(1,5) = lhs(1,5)*pivot
+ c(1,1) = c(1,1)*pivot
+ c(1,2) = c(1,2)*pivot
+ c(1,3) = c(1,3)*pivot
+ c(1,4) = c(1,4)*pivot
+ c(1,5) = c(1,5)*pivot
+ r(1) = r(1) *pivot
+
+ coeff = lhs(2,1)
+ lhs(2,2)= lhs(2,2) - coeff*lhs(1,2)
+ lhs(2,3)= lhs(2,3) - coeff*lhs(1,3)
+ lhs(2,4)= lhs(2,4) - coeff*lhs(1,4)
+ lhs(2,5)= lhs(2,5) - coeff*lhs(1,5)
+ c(2,1) = c(2,1) - coeff*c(1,1)
+ c(2,2) = c(2,2) - coeff*c(1,2)
+ c(2,3) = c(2,3) - coeff*c(1,3)
+ c(2,4) = c(2,4) - coeff*c(1,4)
+ c(2,5) = c(2,5) - coeff*c(1,5)
+ r(2) = r(2) - coeff*r(1)
+
+ coeff = lhs(3,1)
+ lhs(3,2)= lhs(3,2) - coeff*lhs(1,2)
+ lhs(3,3)= lhs(3,3) - coeff*lhs(1,3)
+ lhs(3,4)= lhs(3,4) - coeff*lhs(1,4)
+ lhs(3,5)= lhs(3,5) - coeff*lhs(1,5)
+ c(3,1) = c(3,1) - coeff*c(1,1)
+ c(3,2) = c(3,2) - coeff*c(1,2)
+ c(3,3) = c(3,3) - coeff*c(1,3)
+ c(3,4) = c(3,4) - coeff*c(1,4)
+ c(3,5) = c(3,5) - coeff*c(1,5)
+ r(3) = r(3) - coeff*r(1)
+
+ coeff = lhs(4,1)
+ lhs(4,2)= lhs(4,2) - coeff*lhs(1,2)
+ lhs(4,3)= lhs(4,3) - coeff*lhs(1,3)
+ lhs(4,4)= lhs(4,4) - coeff*lhs(1,4)
+ lhs(4,5)= lhs(4,5) - coeff*lhs(1,5)
+ c(4,1) = c(4,1) - coeff*c(1,1)
+ c(4,2) = c(4,2) - coeff*c(1,2)
+ c(4,3) = c(4,3) - coeff*c(1,3)
+ c(4,4) = c(4,4) - coeff*c(1,4)
+ c(4,5) = c(4,5) - coeff*c(1,5)
+ r(4) = r(4) - coeff*r(1)
+
+ coeff = lhs(5,1)
+ lhs(5,2)= lhs(5,2) - coeff*lhs(1,2)
+ lhs(5,3)= lhs(5,3) - coeff*lhs(1,3)
+ lhs(5,4)= lhs(5,4) - coeff*lhs(1,4)
+ lhs(5,5)= lhs(5,5) - coeff*lhs(1,5)
+ c(5,1) = c(5,1) - coeff*c(1,1)
+ c(5,2) = c(5,2) - coeff*c(1,2)
+ c(5,3) = c(5,3) - coeff*c(1,3)
+ c(5,4) = c(5,4) - coeff*c(1,4)
+ c(5,5) = c(5,5) - coeff*c(1,5)
+ r(5) = r(5) - coeff*r(1)
+
+
+ pivot = 1.00d0/lhs(2,2)
+ lhs(2,3) = lhs(2,3)*pivot
+ lhs(2,4) = lhs(2,4)*pivot
+ lhs(2,5) = lhs(2,5)*pivot
+ c(2,1) = c(2,1)*pivot
+ c(2,2) = c(2,2)*pivot
+ c(2,3) = c(2,3)*pivot
+ c(2,4) = c(2,4)*pivot
+ c(2,5) = c(2,5)*pivot
+ r(2) = r(2) *pivot
+
+ coeff = lhs(1,2)
+ lhs(1,3)= lhs(1,3) - coeff*lhs(2,3)
+ lhs(1,4)= lhs(1,4) - coeff*lhs(2,4)
+ lhs(1,5)= lhs(1,5) - coeff*lhs(2,5)
+ c(1,1) = c(1,1) - coeff*c(2,1)
+ c(1,2) = c(1,2) - coeff*c(2,2)
+ c(1,3) = c(1,3) - coeff*c(2,3)
+ c(1,4) = c(1,4) - coeff*c(2,4)
+ c(1,5) = c(1,5) - coeff*c(2,5)
+ r(1) = r(1) - coeff*r(2)
+
+ coeff = lhs(3,2)
+ lhs(3,3)= lhs(3,3) - coeff*lhs(2,3)
+ lhs(3,4)= lhs(3,4) - coeff*lhs(2,4)
+ lhs(3,5)= lhs(3,5) - coeff*lhs(2,5)
+ c(3,1) = c(3,1) - coeff*c(2,1)
+ c(3,2) = c(3,2) - coeff*c(2,2)
+ c(3,3) = c(3,3) - coeff*c(2,3)
+ c(3,4) = c(3,4) - coeff*c(2,4)
+ c(3,5) = c(3,5) - coeff*c(2,5)
+ r(3) = r(3) - coeff*r(2)
+
+ coeff = lhs(4,2)
+ lhs(4,3)= lhs(4,3) - coeff*lhs(2,3)
+ lhs(4,4)= lhs(4,4) - coeff*lhs(2,4)
+ lhs(4,5)= lhs(4,5) - coeff*lhs(2,5)
+ c(4,1) = c(4,1) - coeff*c(2,1)
+ c(4,2) = c(4,2) - coeff*c(2,2)
+ c(4,3) = c(4,3) - coeff*c(2,3)
+ c(4,4) = c(4,4) - coeff*c(2,4)
+ c(4,5) = c(4,5) - coeff*c(2,5)
+ r(4) = r(4) - coeff*r(2)
+
+ coeff = lhs(5,2)
+ lhs(5,3)= lhs(5,3) - coeff*lhs(2,3)
+ lhs(5,4)= lhs(5,4) - coeff*lhs(2,4)
+ lhs(5,5)= lhs(5,5) - coeff*lhs(2,5)
+ c(5,1) = c(5,1) - coeff*c(2,1)
+ c(5,2) = c(5,2) - coeff*c(2,2)
+ c(5,3) = c(5,3) - coeff*c(2,3)
+ c(5,4) = c(5,4) - coeff*c(2,4)
+ c(5,5) = c(5,5) - coeff*c(2,5)
+ r(5) = r(5) - coeff*r(2)
+
+
+ pivot = 1.00d0/lhs(3,3)
+ lhs(3,4) = lhs(3,4)*pivot
+ lhs(3,5) = lhs(3,5)*pivot
+ c(3,1) = c(3,1)*pivot
+ c(3,2) = c(3,2)*pivot
+ c(3,3) = c(3,3)*pivot
+ c(3,4) = c(3,4)*pivot
+ c(3,5) = c(3,5)*pivot
+ r(3) = r(3) *pivot
+
+ coeff = lhs(1,3)
+ lhs(1,4)= lhs(1,4) - coeff*lhs(3,4)
+ lhs(1,5)= lhs(1,5) - coeff*lhs(3,5)
+ c(1,1) = c(1,1) - coeff*c(3,1)
+ c(1,2) = c(1,2) - coeff*c(3,2)
+ c(1,3) = c(1,3) - coeff*c(3,3)
+ c(1,4) = c(1,4) - coeff*c(3,4)
+ c(1,5) = c(1,5) - coeff*c(3,5)
+ r(1) = r(1) - coeff*r(3)
+
+ coeff = lhs(2,3)
+ lhs(2,4)= lhs(2,4) - coeff*lhs(3,4)
+ lhs(2,5)= lhs(2,5) - coeff*lhs(3,5)
+ c(2,1) = c(2,1) - coeff*c(3,1)
+ c(2,2) = c(2,2) - coeff*c(3,2)
+ c(2,3) = c(2,3) - coeff*c(3,3)
+ c(2,4) = c(2,4) - coeff*c(3,4)
+ c(2,5) = c(2,5) - coeff*c(3,5)
+ r(2) = r(2) - coeff*r(3)
+
+ coeff = lhs(4,3)
+ lhs(4,4)= lhs(4,4) - coeff*lhs(3,4)
+ lhs(4,5)= lhs(4,5) - coeff*lhs(3,5)
+ c(4,1) = c(4,1) - coeff*c(3,1)
+ c(4,2) = c(4,2) - coeff*c(3,2)
+ c(4,3) = c(4,3) - coeff*c(3,3)
+ c(4,4) = c(4,4) - coeff*c(3,4)
+ c(4,5) = c(4,5) - coeff*c(3,5)
+ r(4) = r(4) - coeff*r(3)
+
+ coeff = lhs(5,3)
+ lhs(5,4)= lhs(5,4) - coeff*lhs(3,4)
+ lhs(5,5)= lhs(5,5) - coeff*lhs(3,5)
+ c(5,1) = c(5,1) - coeff*c(3,1)
+ c(5,2) = c(5,2) - coeff*c(3,2)
+ c(5,3) = c(5,3) - coeff*c(3,3)
+ c(5,4) = c(5,4) - coeff*c(3,4)
+ c(5,5) = c(5,5) - coeff*c(3,5)
+ r(5) = r(5) - coeff*r(3)
+
+
+ pivot = 1.00d0/lhs(4,4)
+ lhs(4,5) = lhs(4,5)*pivot
+ c(4,1) = c(4,1)*pivot
+ c(4,2) = c(4,2)*pivot
+ c(4,3) = c(4,3)*pivot
+ c(4,4) = c(4,4)*pivot
+ c(4,5) = c(4,5)*pivot
+ r(4) = r(4) *pivot
+
+ coeff = lhs(1,4)
+ lhs(1,5)= lhs(1,5) - coeff*lhs(4,5)
+ c(1,1) = c(1,1) - coeff*c(4,1)
+ c(1,2) = c(1,2) - coeff*c(4,2)
+ c(1,3) = c(1,3) - coeff*c(4,3)
+ c(1,4) = c(1,4) - coeff*c(4,4)
+ c(1,5) = c(1,5) - coeff*c(4,5)
+ r(1) = r(1) - coeff*r(4)
+
+ coeff = lhs(2,4)
+ lhs(2,5)= lhs(2,5) - coeff*lhs(4,5)
+ c(2,1) = c(2,1) - coeff*c(4,1)
+ c(2,2) = c(2,2) - coeff*c(4,2)
+ c(2,3) = c(2,3) - coeff*c(4,3)
+ c(2,4) = c(2,4) - coeff*c(4,4)
+ c(2,5) = c(2,5) - coeff*c(4,5)
+ r(2) = r(2) - coeff*r(4)
+
+ coeff = lhs(3,4)
+ lhs(3,5)= lhs(3,5) - coeff*lhs(4,5)
+ c(3,1) = c(3,1) - coeff*c(4,1)
+ c(3,2) = c(3,2) - coeff*c(4,2)
+ c(3,3) = c(3,3) - coeff*c(4,3)
+ c(3,4) = c(3,4) - coeff*c(4,4)
+ c(3,5) = c(3,5) - coeff*c(4,5)
+ r(3) = r(3) - coeff*r(4)
+
+ coeff = lhs(5,4)
+ lhs(5,5)= lhs(5,5) - coeff*lhs(4,5)
+ c(5,1) = c(5,1) - coeff*c(4,1)
+ c(5,2) = c(5,2) - coeff*c(4,2)
+ c(5,3) = c(5,3) - coeff*c(4,3)
+ c(5,4) = c(5,4) - coeff*c(4,4)
+ c(5,5) = c(5,5) - coeff*c(4,5)
+ r(5) = r(5) - coeff*r(4)
+
+
+ pivot = 1.00d0/lhs(5,5)
+ c(5,1) = c(5,1)*pivot
+ c(5,2) = c(5,2)*pivot
+ c(5,3) = c(5,3)*pivot
+ c(5,4) = c(5,4)*pivot
+ c(5,5) = c(5,5)*pivot
+ r(5) = r(5) *pivot
+
+ coeff = lhs(1,5)
+ c(1,1) = c(1,1) - coeff*c(5,1)
+ c(1,2) = c(1,2) - coeff*c(5,2)
+ c(1,3) = c(1,3) - coeff*c(5,3)
+ c(1,4) = c(1,4) - coeff*c(5,4)
+ c(1,5) = c(1,5) - coeff*c(5,5)
+ r(1) = r(1) - coeff*r(5)
+
+ coeff = lhs(2,5)
+ c(2,1) = c(2,1) - coeff*c(5,1)
+ c(2,2) = c(2,2) - coeff*c(5,2)
+ c(2,3) = c(2,3) - coeff*c(5,3)
+ c(2,4) = c(2,4) - coeff*c(5,4)
+ c(2,5) = c(2,5) - coeff*c(5,5)
+ r(2) = r(2) - coeff*r(5)
+
+ coeff = lhs(3,5)
+ c(3,1) = c(3,1) - coeff*c(5,1)
+ c(3,2) = c(3,2) - coeff*c(5,2)
+ c(3,3) = c(3,3) - coeff*c(5,3)
+ c(3,4) = c(3,4) - coeff*c(5,4)
+ c(3,5) = c(3,5) - coeff*c(5,5)
+ r(3) = r(3) - coeff*r(5)
+
+ coeff = lhs(4,5)
+ c(4,1) = c(4,1) - coeff*c(5,1)
+ c(4,2) = c(4,2) - coeff*c(5,2)
+ c(4,3) = c(4,3) - coeff*c(5,3)
+ c(4,4) = c(4,4) - coeff*c(5,4)
+ c(4,5) = c(4,5) - coeff*c(5,5)
+ r(4) = r(4) - coeff*r(5)
+
+
+ return
+ end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine binvrhs( lhs,r )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+ double precision pivot, coeff, lhs
+ dimension lhs(5,5)
+ double precision r(5)
+
+c---------------------------------------------------------------------
+c
+c---------------------------------------------------------------------
+
+
+ pivot = 1.00d0/lhs(1,1)
+ lhs(1,2) = lhs(1,2)*pivot
+ lhs(1,3) = lhs(1,3)*pivot
+ lhs(1,4) = lhs(1,4)*pivot
+ lhs(1,5) = lhs(1,5)*pivot
+ r(1) = r(1) *pivot
+
+ coeff = lhs(2,1)
+ lhs(2,2)= lhs(2,2) - coeff*lhs(1,2)
+ lhs(2,3)= lhs(2,3) - coeff*lhs(1,3)
+ lhs(2,4)= lhs(2,4) - coeff*lhs(1,4)
+ lhs(2,5)= lhs(2,5) - coeff*lhs(1,5)
+ r(2) = r(2) - coeff*r(1)
+
+ coeff = lhs(3,1)
+ lhs(3,2)= lhs(3,2) - coeff*lhs(1,2)
+ lhs(3,3)= lhs(3,3) - coeff*lhs(1,3)
+ lhs(3,4)= lhs(3,4) - coeff*lhs(1,4)
+ lhs(3,5)= lhs(3,5) - coeff*lhs(1,5)
+ r(3) = r(3) - coeff*r(1)
+
+ coeff = lhs(4,1)
+ lhs(4,2)= lhs(4,2) - coeff*lhs(1,2)
+ lhs(4,3)= lhs(4,3) - coeff*lhs(1,3)
+ lhs(4,4)= lhs(4,4) - coeff*lhs(1,4)
+ lhs(4,5)= lhs(4,5) - coeff*lhs(1,5)
+ r(4) = r(4) - coeff*r(1)
+
+ coeff = lhs(5,1)
+ lhs(5,2)= lhs(5,2) - coeff*lhs(1,2)
+ lhs(5,3)= lhs(5,3) - coeff*lhs(1,3)
+ lhs(5,4)= lhs(5,4) - coeff*lhs(1,4)
+ lhs(5,5)= lhs(5,5) - coeff*lhs(1,5)
+ r(5) = r(5) - coeff*r(1)
+
+
+ pivot = 1.00d0/lhs(2,2)
+ lhs(2,3) = lhs(2,3)*pivot
+ lhs(2,4) = lhs(2,4)*pivot
+ lhs(2,5) = lhs(2,5)*pivot
+ r(2) = r(2) *pivot
+
+ coeff = lhs(1,2)
+ lhs(1,3)= lhs(1,3) - coeff*lhs(2,3)
+ lhs(1,4)= lhs(1,4) - coeff*lhs(2,4)
+ lhs(1,5)= lhs(1,5) - coeff*lhs(2,5)
+ r(1) = r(1) - coeff*r(2)
+
+ coeff = lhs(3,2)
+ lhs(3,3)= lhs(3,3) - coeff*lhs(2,3)
+ lhs(3,4)= lhs(3,4) - coeff*lhs(2,4)
+ lhs(3,5)= lhs(3,5) - coeff*lhs(2,5)
+ r(3) = r(3) - coeff*r(2)
+
+ coeff = lhs(4,2)
+ lhs(4,3)= lhs(4,3) - coeff*lhs(2,3)
+ lhs(4,4)= lhs(4,4) - coeff*lhs(2,4)
+ lhs(4,5)= lhs(4,5) - coeff*lhs(2,5)
+ r(4) = r(4) - coeff*r(2)
+
+ coeff = lhs(5,2)
+ lhs(5,3)= lhs(5,3) - coeff*lhs(2,3)
+ lhs(5,4)= lhs(5,4) - coeff*lhs(2,4)
+ lhs(5,5)= lhs(5,5) - coeff*lhs(2,5)
+ r(5) = r(5) - coeff*r(2)
+
+
+ pivot = 1.00d0/lhs(3,3)
+ lhs(3,4) = lhs(3,4)*pivot
+ lhs(3,5) = lhs(3,5)*pivot
+ r(3) = r(3) *pivot
+
+ coeff = lhs(1,3)
+ lhs(1,4)= lhs(1,4) - coeff*lhs(3,4)
+ lhs(1,5)= lhs(1,5) - coeff*lhs(3,5)
+ r(1) = r(1) - coeff*r(3)
+
+ coeff = lhs(2,3)
+ lhs(2,4)= lhs(2,4) - coeff*lhs(3,4)
+ lhs(2,5)= lhs(2,5) - coeff*lhs(3,5)
+ r(2) = r(2) - coeff*r(3)
+
+ coeff = lhs(4,3)
+ lhs(4,4)= lhs(4,4) - coeff*lhs(3,4)
+ lhs(4,5)= lhs(4,5) - coeff*lhs(3,5)
+ r(4) = r(4) - coeff*r(3)
+
+ coeff = lhs(5,3)
+ lhs(5,4)= lhs(5,4) - coeff*lhs(3,4)
+ lhs(5,5)= lhs(5,5) - coeff*lhs(3,5)
+ r(5) = r(5) - coeff*r(3)
+
+
+ pivot = 1.00d0/lhs(4,4)
+ lhs(4,5) = lhs(4,5)*pivot
+ r(4) = r(4) *pivot
+
+ coeff = lhs(1,4)
+ lhs(1,5)= lhs(1,5) - coeff*lhs(4,5)
+ r(1) = r(1) - coeff*r(4)
+
+ coeff = lhs(2,4)
+ lhs(2,5)= lhs(2,5) - coeff*lhs(4,5)
+ r(2) = r(2) - coeff*r(4)
+
+ coeff = lhs(3,4)
+ lhs(3,5)= lhs(3,5) - coeff*lhs(4,5)
+ r(3) = r(3) - coeff*r(4)
+
+ coeff = lhs(5,4)
+ lhs(5,5)= lhs(5,5) - coeff*lhs(4,5)
+ r(5) = r(5) - coeff*r(4)
+
+
+ pivot = 1.00d0/lhs(5,5)
+ r(5) = r(5) *pivot
+
+ coeff = lhs(1,5)
+ r(1) = r(1) - coeff*r(5)
+
+ coeff = lhs(2,5)
+ r(2) = r(2) - coeff*r(5)
+
+ coeff = lhs(3,5)
+ r(3) = r(3) - coeff*r(5)
+
+ coeff = lhs(4,5)
+ r(4) = r(4) - coeff*r(5)
+
+
+ return
+ end
+
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine verify(no_time_steps, class, verified)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c verification routine
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5),
+ > epsilon, xce(5), xcr(5), dtref
+ integer m, no_time_steps
+ character class
+ logical verified
+
+c---------------------------------------------------------------------
+c tolerance level
+c---------------------------------------------------------------------
+ epsilon = 1.0d-08
+ verified = .true.
+
+c---------------------------------------------------------------------
+c compute the error norm and the residual norm, and exit if not printing
+c---------------------------------------------------------------------
+
+ if (iotype .ne. 0) then
+ call accumulate_norms(xce)
+ else
+ call error_norm(xce)
+ endif
+
+ call copy_faces
+
+ call rhs_norm(xcr)
+
+ do m = 1, 5
+ xcr(m) = xcr(m) / dt
+ enddo
+
+ if (node .ne. 0) return
+
+ class = 'U'
+
+ do m = 1,5
+ xcrref(m) = 1.0
+ xceref(m) = 1.0
+ end do
+
+c---------------------------------------------------------------------
+c reference data for 12X12X12 grids after 60 time steps, with DT = 1.0d-02
+c---------------------------------------------------------------------
+ if ( (grid_points(1) .eq. 12 ) .and.
+ > (grid_points(2) .eq. 12 ) .and.
+ > (grid_points(3) .eq. 12 ) .and.
+ > (no_time_steps .eq. 60 )) then
+
+ class = 'S'
+ dtref = 1.0d-2
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 1.7034283709541311d-01
+ xcrref(2) = 1.2975252070034097d-02
+ xcrref(3) = 3.2527926989486055d-02
+ xcrref(4) = 2.6436421275166801d-02
+ xcrref(5) = 1.9211784131744430d-01
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+ if (iotype .eq. 0) then
+ xceref(1) = 4.9976913345811579d-04
+ xceref(2) = 4.5195666782961927d-05
+ xceref(3) = 7.3973765172921357d-05
+ xceref(4) = 7.3821238632439731d-05
+ xceref(5) = 8.9269630987491446d-04
+ else
+ xceref(1) = 0.1149036328945d+02
+ xceref(2) = 0.9156788904727d+00
+ xceref(3) = 0.2857899428614d+01
+ xceref(4) = 0.2598273346734d+01
+ xceref(5) = 0.2652795397547d+02
+ endif
+
+c---------------------------------------------------------------------
+c reference data for 24X24X24 grids after 200 time steps, with DT = 0.8d-3
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 24) .and.
+ > (grid_points(2) .eq. 24) .and.
+ > (grid_points(3) .eq. 24) .and.
+ > (no_time_steps . eq. 200) ) then
+
+ class = 'W'
+ dtref = 0.8d-3
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 0.1125590409344d+03
+ xcrref(2) = 0.1180007595731d+02
+ xcrref(3) = 0.2710329767846d+02
+ xcrref(4) = 0.2469174937669d+02
+ xcrref(5) = 0.2638427874317d+03
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+ if (iotype .eq. 0) then
+ xceref(1) = 0.4419655736008d+01
+ xceref(2) = 0.4638531260002d+00
+ xceref(3) = 0.1011551749967d+01
+ xceref(4) = 0.9235878729944d+00
+ xceref(5) = 0.1018045837718d+02
+ else
+ xceref(1) = 0.6729594398612d+02
+ xceref(2) = 0.5264523081690d+01
+ xceref(3) = 0.1677107142637d+02
+ xceref(4) = 0.1508721463436d+02
+ xceref(5) = 0.1477018363393d+03
+ endif
+
+
+c---------------------------------------------------------------------
+c reference data for 64X64X64 grids after 200 time steps, with DT = 0.8d-3
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 64) .and.
+ > (grid_points(2) .eq. 64) .and.
+ > (grid_points(3) .eq. 64) .and.
+ > (no_time_steps . eq. 200) ) then
+
+ class = 'A'
+ dtref = 0.8d-3
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 1.0806346714637264d+02
+ xcrref(2) = 1.1319730901220813d+01
+ xcrref(3) = 2.5974354511582465d+01
+ xcrref(4) = 2.3665622544678910d+01
+ xcrref(5) = 2.5278963211748344d+02
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+ if (iotype .eq. 0) then
+ xceref(1) = 4.2348416040525025d+00
+ xceref(2) = 4.4390282496995698d-01
+ xceref(3) = 9.6692480136345650d-01
+ xceref(4) = 8.8302063039765474d-01
+ xceref(5) = 9.7379901770829278d+00
+ else
+ xceref(1) = 0.6482218724961d+02
+ xceref(2) = 0.5066461714527d+01
+ xceref(3) = 0.1613931961359d+02
+ xceref(4) = 0.1452010201481d+02
+ xceref(5) = 0.1420099377681d+03
+ endif
+
+c---------------------------------------------------------------------
+c reference data for 102X102X102 grids after 200 time steps,
+c with DT = 3.0d-04
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 102) .and.
+ > (grid_points(2) .eq. 102) .and.
+ > (grid_points(3) .eq. 102) .and.
+ > (no_time_steps . eq. 200) ) then
+
+ class = 'B'
+ dtref = 3.0d-4
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 1.4233597229287254d+03
+ xcrref(2) = 9.9330522590150238d+01
+ xcrref(3) = 3.5646025644535285d+02
+ xcrref(4) = 3.2485447959084092d+02
+ xcrref(5) = 3.2707541254659363d+03
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+ if (iotype .eq. 0) then
+ xceref(1) = 5.2969847140936856d+01
+ xceref(2) = 4.4632896115670668d+00
+ xceref(3) = 1.3122573342210174d+01
+ xceref(4) = 1.2006925323559144d+01
+ xceref(5) = 1.2459576151035986d+02
+ else
+ xceref(1) = 0.1477545106464d+03
+ xceref(2) = 0.1108895555053d+02
+ xceref(3) = 0.3698065590331d+02
+ xceref(4) = 0.3310505581440d+02
+ xceref(5) = 0.3157928282563d+03
+ endif
+
+c---------------------------------------------------------------------
+c reference data for 162X162X162 grids after 200 time steps,
+c with DT = 1.0d-04
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 162) .and.
+ > (grid_points(2) .eq. 162) .and.
+ > (grid_points(3) .eq. 162) .and.
+ > (no_time_steps . eq. 200) ) then
+
+ class = 'C'
+ dtref = 1.0d-4
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 0.62398116551764615d+04
+ xcrref(2) = 0.50793239190423964d+03
+ xcrref(3) = 0.15423530093013596d+04
+ xcrref(4) = 0.13302387929291190d+04
+ xcrref(5) = 0.11604087428436455d+05
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+ if (iotype .eq. 0) then
+ xceref(1) = 0.16462008369091265d+03
+ xceref(2) = 0.11497107903824313d+02
+ xceref(3) = 0.41207446207461508d+02
+ xceref(4) = 0.37087651059694167d+02
+ xceref(5) = 0.36211053051841265d+03
+ else
+ xceref(1) = 0.2597156483475d+03
+ xceref(2) = 0.1985384289495d+02
+ xceref(3) = 0.6517950485788d+02
+ xceref(4) = 0.5757235541520d+02
+ xceref(5) = 0.5215668188726d+03
+ endif
+
+
+c---------------------------------------------------------------------
+c reference data for 408x408x408 grids after 250 time steps,
+c with DT = 0.2d-04
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 408) .and.
+ > (grid_points(2) .eq. 408) .and.
+ > (grid_points(3) .eq. 408) .and.
+ > (no_time_steps . eq. 250) ) then
+
+ class = 'D'
+ dtref = 0.2d-4
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 0.2533188551738d+05
+ xcrref(2) = 0.2346393716980d+04
+ xcrref(3) = 0.6294554366904d+04
+ xcrref(4) = 0.5352565376030d+04
+ xcrref(5) = 0.3905864038618d+05
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+ if (iotype .eq. 0) then
+ xceref(1) = 0.3100009377557d+03
+ xceref(2) = 0.2424086324913d+02
+ xceref(3) = 0.7782212022645d+02
+ xceref(4) = 0.6835623860116d+02
+ xceref(5) = 0.6065737200368d+03
+ else
+ xceref(1) = 0.3813781566713d+03
+ xceref(2) = 0.3160872966198d+02
+ xceref(3) = 0.9593576357290d+02
+ xceref(4) = 0.8363391989815d+02
+ xceref(5) = 0.7063466087423d+03
+ endif
+
+
+c---------------------------------------------------------------------
+c reference data for 1020x1020x1020 grids after 250 time steps,
+c with DT = 0.4d-05
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 1020) .and.
+ > (grid_points(2) .eq. 1020) .and.
+ > (grid_points(3) .eq. 1020) .and.
+ > (no_time_steps . eq. 250) ) then
+
+ class = 'E'
+ dtref = 0.4d-5
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 0.9795372484517d+05
+ xcrref(2) = 0.9739814511521d+04
+ xcrref(3) = 0.2467606342965d+05
+ xcrref(4) = 0.2092419572860d+05
+ xcrref(5) = 0.1392138856939d+06
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+ if (iotype .eq. 0) then
+ xceref(1) = 0.4327562208414d+03
+ xceref(2) = 0.3699051964887d+02
+ xceref(3) = 0.1089845040954d+03
+ xceref(4) = 0.9462517622043d+02
+ xceref(5) = 0.7765512765309d+03
+ else
+c wr_interval = 5
+ xceref(1) = 0.4729898413058d+03
+ xceref(2) = 0.4145899331704d+02
+ xceref(3) = 0.1192850917138d+03
+ xceref(4) = 0.1032746026932d+03
+ xceref(5) = 0.8270322177634d+03
+c wr_interval = 10
+c xceref(1) = 0.4718135916251d+03
+c xceref(2) = 0.4132620259096d+02
+c xceref(3) = 0.1189831133503d+03
+c xceref(4) = 0.1030212798803d+03
+c xceref(5) = 0.8255924078458d+03
+ endif
+
+ else
+ verified = .false.
+ endif
+
+c---------------------------------------------------------------------
+c verification test for residuals if gridsize is one of
+c the defined grid sizes above (class .ne. 'U')
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Compute the difference of solution values and the known reference
+c values.
+c---------------------------------------------------------------------
+ do m = 1, 5
+
+ xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m))
+ xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
+
+ enddo
+
+c---------------------------------------------------------------------
+c Output the comparison of computed results to known cases.
+c---------------------------------------------------------------------
+
+ if (class .ne. 'U') then
+ write(*, 1990) class
+ 1990 format(' Verification being performed for class ', a)
+ write (*,2000) epsilon
+ 2000 format(' accuracy setting for epsilon = ', E20.13)
+ verified = (dabs(dt-dtref) .le. epsilon)
+ if (.not.verified) then
+ verified = .false.
+ class = 'U'
+ write (*,1000) dtref
+ 1000 format(' DT does not match the reference value of ',
+ > E15.8)
+ endif
+ else
+ write(*, 1995)
+ 1995 format(' Unknown class')
+ endif
+
+
+ if (class .ne. 'U') then
+ write (*,2001)
+ else
+ write (*, 2005)
+ endif
+
+ 2001 format(' Comparison of RMS-norms of residual')
+ 2005 format(' RMS-norms of residual')
+ do m = 1, 5
+ if (class .eq. 'U') then
+ write(*, 2015) m, xcr(m)
+ else if (xcrdif(m) .le. epsilon) then
+ write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
+ else
+ verified = .false.
+ write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
+ endif
+ enddo
+
+ if (class .ne. 'U') then
+ write (*,2002)
+ else
+ write (*,2006)
+ endif
+ 2002 format(' Comparison of RMS-norms of solution error')
+ 2006 format(' RMS-norms of solution error')
+
+ do m = 1, 5
+ if (class .eq. 'U') then
+ write(*, 2015) m, xce(m)
+ else if (xcedif(m) .le. epsilon) then
+ write (*,2011) m,xce(m),xceref(m),xcedif(m)
+ else
+ verified = .false.
+ write (*,2010) m,xce(m),xceref(m),xcedif(m)
+ endif
+ enddo
+
+ 2010 format(' FAILURE: ', i2, E20.13, E20.13, E20.13)
+ 2011 format(' ', i2, E20.13, E20.13, E20.13)
+ 2015 format(' ', i2, E20.13)
+
+ if (class .eq. 'U') then
+ write(*, 2022)
+ write(*, 2023)
+ 2022 format(' No reference values provided')
+ 2023 format(' No verification performed')
+ else if (verified) then
+ write(*, 2020)
+ 2020 format(' Verification Successful')
+ else
+ write(*, 2021)
+ 2021 format(' Verification failed')
+ endif
+
+ return
+
+
+ end
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+c
+c work_lhs.h
+c
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ double precision fjac(5, 5, -2:MAX_CELL_DIM+1),
+ > njac(5, 5, -2:MAX_CELL_DIM+1),
+ > lhsa(5, 5, -1:MAX_CELL_DIM),
+ > lhsb(5, 5, -1:MAX_CELL_DIM),
+ > tmp1, tmp2, tmp3
+ common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+c
+c work_lhs_vec.h
+c
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ double precision fjac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1),
+ > njac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1),
+ > lhsa(5, 5, -1:MAX_CELL_DIM, -1:MAX_CELL_DIM),
+ > lhsb(5, 5, -1:MAX_CELL_DIM, -1:MAX_CELL_DIM),
+ > tmp1, tmp2, tmp3
+ common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c Performs line solves in X direction by first factoring
+c the block-tridiagonal matrix into an upper triangular matrix,
+c and then performing back substitution to solve for the unknow
+c vectors of each line.
+c
+c Make sure we treat elements zero to cell_size in the direction
+c of the sweep.
+c
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+ integer c, istart, stage,
+ > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+ > isize,jsize,ksize,send_id
+
+ istart = 0
+
+c---------------------------------------------------------------------
+c in our terminology stage is the number of the cell in the x-direction
+c i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+ do stage = 1,ncells
+ c = slice(1,stage)
+ isize = cell_size(1,c) - 1
+ jsize = cell_size(2,c) - 1
+ ksize = cell_size(3,c) - 1
+
+c---------------------------------------------------------------------
+c set last-cell flag
+c---------------------------------------------------------------------
+ if (stage .eq. ncells) then
+ last = 1
+ else
+ last = 0
+ endif
+
+ if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+ first = 1
+c call lhsx(c)
+ call x_solve_cell(first,last,c)
+ else
+c---------------------------------------------------------------------
+c Not the first cell of this line, so receive info from
+c processor working on preceeding cell
+c---------------------------------------------------------------------
+ first = 0
+ call x_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c overlap computations and communications
+c---------------------------------------------------------------------
+c call lhsx(c)
+c---------------------------------------------------------------------
+c wait for completion
+c---------------------------------------------------------------------
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c install C'(istart) and rhs'(istart) to be used in this cell
+c---------------------------------------------------------------------
+ call x_unpack_solve_info(c)
+ call x_solve_cell(first,last,c)
+ endif
+
+ if (last .eq. 0) call x_send_solve_info(send_id,c)
+ enddo
+
+c---------------------------------------------------------------------
+c now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+ do stage = ncells, 1, -1
+ c = slice(1,stage)
+ first = 0
+ last = 0
+ if (stage .eq. 1) first = 1
+ if (stage .eq. ncells) then
+ last = 1
+c---------------------------------------------------------------------
+c last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+ call x_backsubstitute(first, last,c)
+ else
+ call x_receive_backsub_info(recv_id,c)
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+ call x_unpack_backsub_info(c)
+ call x_backsubstitute(first,last,c)
+ endif
+ if (first .eq. 0) call x_send_backsub_info(send_id,c)
+ enddo
+
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack C'(-1) and rhs'(-1) for
+c all j and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ integer j,k,m,n,ptr,c,istart
+
+ istart = 0
+ ptr = 0
+ do k=0,KMAX-1
+ do j=0,JMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ rhs(n,istart-1,j,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send C'(iend) and rhs'(iend) for
+c all j and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer j,k,m,n,isize,ptr,c,jp,kp
+ integer error,send_id,buffer_size
+
+ isize = cell_size(1,c)-1
+ jp = cell_coord(2,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c pack up buffer
+c---------------------------------------------------------------------
+ ptr = 0
+ do k=0,KMAX-1
+ do j=0,JMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,isize,j,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c send buffer
+c---------------------------------------------------------------------
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, successor(1),
+ > WEST+jp+kp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send U(istart) for all j and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer j,k,n,ptr,c,istart,jp,kp
+ integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c Send element 0 to previous processor
+c---------------------------------------------------------------------
+ istart = 0
+ jp = cell_coord(2,c)-1
+ kp = cell_coord(3,c)-1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ ptr = 0
+ do k=0,KMAX-1
+ do j=0,JMAX-1
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,istart,j,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, predecessor(1),
+ > EAST+jp+kp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack U(isize) for all j and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ integer j,k,n,ptr,c
+
+ ptr = 0
+ do k=0,KMAX-1
+ do j=0,JMAX-1
+ do n=1,BLOCK_SIZE
+ backsub_info(n,j,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer error,recv_id,jp,kp,c,buffer_size
+ jp = cell_coord(2,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, successor(1),
+ > EAST+jp+kp*NCELLS, comm_solve,
+ > recv_id, error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer jp,kp,recv_id,error,c,buffer_size
+ jp = cell_coord(2,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, predecessor(1),
+ > WEST+jp+kp*NCELLS, comm_solve,
+ > recv_id, error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c back solve: if last cell, then generate U(isize)=rhs(isize)
+c else assume U(isize) is loaded in un pack backsub_info
+c so just use it
+c after call u(istart) will be sent to next cell
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer first, last, c, i, j, k
+ integer m,n,isize,jsize,ksize,istart
+
+ istart = 0
+ isize = cell_size(1,c)-1
+ jsize = cell_size(2,c)-end(2,c)-1
+ ksize = cell_size(3,c)-end(3,c)-1
+ if (last .eq. 0) then
+ do k=start(3,c),ksize
+ do j=start(2,c),jsize
+c---------------------------------------------------------------------
+c U(isize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c)
+ > - lhsc(m,n,isize,j,k,c)*
+ > backsub_info(n,j,k,c)
+c---------------------------------------------------------------------
+c rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c)
+c $ - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c)
+c---------------------------------------------------------------------
+ enddo
+ enddo
+ enddo
+ enddo
+ endif
+ do k=start(3,c),ksize
+ do j=start(2,c),jsize
+ do i=isize-1,istart,-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
+ > - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c)
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c performs guaussian elimination on this cell.
+c
+c assumes that unpacking routines for non-first cells
+c preload C' and rhs' from previous cell.
+c
+c assumed send happens outside this routine, but that
+c c'(IMAX) and rhs'(IMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'work_lhs.h'
+
+ integer first,last,c
+ integer i,j,k,isize,ksize,jsize,istart
+
+ istart = 0
+ isize = cell_size(1,c)-1
+ jsize = cell_size(2,c)-end(2,c)-1
+ ksize = cell_size(3,c)-end(3,c)-1
+
+ call lhsabinit(lhsa, lhsb, isize)
+
+ do k=start(3,c),ksize
+ do j=start(2,c),jsize
+
+c---------------------------------------------------------------------
+c This function computes the left hand side in the xi-direction
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+ do i = start(1,c)-1, cell_size(1,c) - end(1,c)
+
+ tmp1 = rho_i(i,j,k,c)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+c---------------------------------------------------------------------
+c
+c---------------------------------------------------------------------
+ fjac(1,1,i) = 0.0d+00
+ fjac(1,2,i) = 1.0d+00
+ fjac(1,3,i) = 0.0d+00
+ fjac(1,4,i) = 0.0d+00
+ fjac(1,5,i) = 0.0d+00
+
+ fjac(2,1,i) = -(u(2,i,j,k,c) * tmp2 *
+ > u(2,i,j,k,c))
+ > + c2 * qs(i,j,k,c)
+ fjac(2,2,i) = ( 2.0d+00 - c2 )
+ > * ( u(2,i,j,k,c) * tmp1 )
+ fjac(2,3,i) = - c2 * ( u(3,i,j,k,c) * tmp1 )
+ fjac(2,4,i) = - c2 * ( u(4,i,j,k,c) * tmp1 )
+ fjac(2,5,i) = c2
+
+ fjac(3,1,i) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2
+ fjac(3,2,i) = u(3,i,j,k,c) * tmp1
+ fjac(3,3,i) = u(2,i,j,k,c) * tmp1
+ fjac(3,4,i) = 0.0d+00
+ fjac(3,5,i) = 0.0d+00
+
+ fjac(4,1,i) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2
+ fjac(4,2,i) = u(4,i,j,k,c) * tmp1
+ fjac(4,3,i) = 0.0d+00
+ fjac(4,4,i) = u(2,i,j,k,c) * tmp1
+ fjac(4,5,i) = 0.0d+00
+
+ fjac(5,1,i) = ( c2 * 2.0d0 * qs(i,j,k,c)
+ > - c1 * ( u(5,i,j,k,c) * tmp1 ) )
+ > * ( u(2,i,j,k,c) * tmp1 )
+ fjac(5,2,i) = c1 * u(5,i,j,k,c) * tmp1
+ > - c2
+ > * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2
+ > + qs(i,j,k,c) )
+ fjac(5,3,i) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) )
+ > * tmp2
+ fjac(5,4,i) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) )
+ > * tmp2
+ fjac(5,5,i) = c1 * ( u(2,i,j,k,c) * tmp1 )
+
+ njac(1,1,i) = 0.0d+00
+ njac(1,2,i) = 0.0d+00
+ njac(1,3,i) = 0.0d+00
+ njac(1,4,i) = 0.0d+00
+ njac(1,5,i) = 0.0d+00
+
+ njac(2,1,i) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c)
+ njac(2,2,i) = con43 * c3c4 * tmp1
+ njac(2,3,i) = 0.0d+00
+ njac(2,4,i) = 0.0d+00
+ njac(2,5,i) = 0.0d+00
+
+ njac(3,1,i) = - c3c4 * tmp2 * u(3,i,j,k,c)
+ njac(3,2,i) = 0.0d+00
+ njac(3,3,i) = c3c4 * tmp1
+ njac(3,4,i) = 0.0d+00
+ njac(3,5,i) = 0.0d+00
+
+ njac(4,1,i) = - c3c4 * tmp2 * u(4,i,j,k,c)
+ njac(4,2,i) = 0.0d+00
+ njac(4,3,i) = 0.0d+00
+ njac(4,4,i) = c3c4 * tmp1
+ njac(4,5,i) = 0.0d+00
+
+ njac(5,1,i) = - ( con43 * c3c4
+ > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
+ > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
+ > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
+ > - c1345 * tmp2 * u(5,i,j,k,c)
+
+ njac(5,2,i) = ( con43 * c3c4
+ > - c1345 ) * tmp2 * u(2,i,j,k,c)
+ njac(5,3,i) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
+ njac(5,4,i) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
+ njac(5,5,i) = ( c1345 ) * tmp1
+
+ enddo
+c---------------------------------------------------------------------
+c now jacobians set, so form left hand side in x direction
+c---------------------------------------------------------------------
+ do i = start(1,c), isize - end(1,c)
+
+ tmp1 = dt * tx1
+ tmp2 = dt * tx2
+
+ lhsa(1,1,i) = - tmp2 * fjac(1,1,i-1)
+ > - tmp1 * njac(1,1,i-1)
+ > - tmp1 * dx1
+ lhsa(1,2,i) = - tmp2 * fjac(1,2,i-1)
+ > - tmp1 * njac(1,2,i-1)
+ lhsa(1,3,i) = - tmp2 * fjac(1,3,i-1)
+ > - tmp1 * njac(1,3,i-1)
+ lhsa(1,4,i) = - tmp2 * fjac(1,4,i-1)
+ > - tmp1 * njac(1,4,i-1)
+ lhsa(1,5,i) = - tmp2 * fjac(1,5,i-1)
+ > - tmp1 * njac(1,5,i-1)
+
+ lhsa(2,1,i) = - tmp2 * fjac(2,1,i-1)
+ > - tmp1 * njac(2,1,i-1)
+ lhsa(2,2,i) = - tmp2 * fjac(2,2,i-1)
+ > - tmp1 * njac(2,2,i-1)
+ > - tmp1 * dx2
+ lhsa(2,3,i) = - tmp2 * fjac(2,3,i-1)
+ > - tmp1 * njac(2,3,i-1)
+ lhsa(2,4,i) = - tmp2 * fjac(2,4,i-1)
+ > - tmp1 * njac(2,4,i-1)
+ lhsa(2,5,i) = - tmp2 * fjac(2,5,i-1)
+ > - tmp1 * njac(2,5,i-1)
+
+ lhsa(3,1,i) = - tmp2 * fjac(3,1,i-1)
+ > - tmp1 * njac(3,1,i-1)
+ lhsa(3,2,i) = - tmp2 * fjac(3,2,i-1)
+ > - tmp1 * njac(3,2,i-1)
+ lhsa(3,3,i) = - tmp2 * fjac(3,3,i-1)
+ > - tmp1 * njac(3,3,i-1)
+ > - tmp1 * dx3
+ lhsa(3,4,i) = - tmp2 * fjac(3,4,i-1)
+ > - tmp1 * njac(3,4,i-1)
+ lhsa(3,5,i) = - tmp2 * fjac(3,5,i-1)
+ > - tmp1 * njac(3,5,i-1)
+
+ lhsa(4,1,i) = - tmp2 * fjac(4,1,i-1)
+ > - tmp1 * njac(4,1,i-1)
+ lhsa(4,2,i) = - tmp2 * fjac(4,2,i-1)
+ > - tmp1 * njac(4,2,i-1)
+ lhsa(4,3,i) = - tmp2 * fjac(4,3,i-1)
+ > - tmp1 * njac(4,3,i-1)
+ lhsa(4,4,i) = - tmp2 * fjac(4,4,i-1)
+ > - tmp1 * njac(4,4,i-1)
+ > - tmp1 * dx4
+ lhsa(4,5,i) = - tmp2 * fjac(4,5,i-1)
+ > - tmp1 * njac(4,5,i-1)
+
+ lhsa(5,1,i) = - tmp2 * fjac(5,1,i-1)
+ > - tmp1 * njac(5,1,i-1)
+ lhsa(5,2,i) = - tmp2 * fjac(5,2,i-1)
+ > - tmp1 * njac(5,2,i-1)
+ lhsa(5,3,i) = - tmp2 * fjac(5,3,i-1)
+ > - tmp1 * njac(5,3,i-1)
+ lhsa(5,4,i) = - tmp2 * fjac(5,4,i-1)
+ > - tmp1 * njac(5,4,i-1)
+ lhsa(5,5,i) = - tmp2 * fjac(5,5,i-1)
+ > - tmp1 * njac(5,5,i-1)
+ > - tmp1 * dx5
+
+ lhsb(1,1,i) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(1,1,i)
+ > + tmp1 * 2.0d+00 * dx1
+ lhsb(1,2,i) = tmp1 * 2.0d+00 * njac(1,2,i)
+ lhsb(1,3,i) = tmp1 * 2.0d+00 * njac(1,3,i)
+ lhsb(1,4,i) = tmp1 * 2.0d+00 * njac(1,4,i)
+ lhsb(1,5,i) = tmp1 * 2.0d+00 * njac(1,5,i)
+
+ lhsb(2,1,i) = tmp1 * 2.0d+00 * njac(2,1,i)
+ lhsb(2,2,i) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(2,2,i)
+ > + tmp1 * 2.0d+00 * dx2
+ lhsb(2,3,i) = tmp1 * 2.0d+00 * njac(2,3,i)
+ lhsb(2,4,i) = tmp1 * 2.0d+00 * njac(2,4,i)
+ lhsb(2,5,i) = tmp1 * 2.0d+00 * njac(2,5,i)
+
+ lhsb(3,1,i) = tmp1 * 2.0d+00 * njac(3,1,i)
+ lhsb(3,2,i) = tmp1 * 2.0d+00 * njac(3,2,i)
+ lhsb(3,3,i) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(3,3,i)
+ > + tmp1 * 2.0d+00 * dx3
+ lhsb(3,4,i) = tmp1 * 2.0d+00 * njac(3,4,i)
+ lhsb(3,5,i) = tmp1 * 2.0d+00 * njac(3,5,i)
+
+ lhsb(4,1,i) = tmp1 * 2.0d+00 * njac(4,1,i)
+ lhsb(4,2,i) = tmp1 * 2.0d+00 * njac(4,2,i)
+ lhsb(4,3,i) = tmp1 * 2.0d+00 * njac(4,3,i)
+ lhsb(4,4,i) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(4,4,i)
+ > + tmp1 * 2.0d+00 * dx4
+ lhsb(4,5,i) = tmp1 * 2.0d+00 * njac(4,5,i)
+
+ lhsb(5,1,i) = tmp1 * 2.0d+00 * njac(5,1,i)
+ lhsb(5,2,i) = tmp1 * 2.0d+00 * njac(5,2,i)
+ lhsb(5,3,i) = tmp1 * 2.0d+00 * njac(5,3,i)
+ lhsb(5,4,i) = tmp1 * 2.0d+00 * njac(5,4,i)
+ lhsb(5,5,i) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(5,5,i)
+ > + tmp1 * 2.0d+00 * dx5
+
+ lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i+1)
+ > - tmp1 * njac(1,1,i+1)
+ > - tmp1 * dx1
+ lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i+1)
+ > - tmp1 * njac(1,2,i+1)
+ lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i+1)
+ > - tmp1 * njac(1,3,i+1)
+ lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i+1)
+ > - tmp1 * njac(1,4,i+1)
+ lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i+1)
+ > - tmp1 * njac(1,5,i+1)
+
+ lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i+1)
+ > - tmp1 * njac(2,1,i+1)
+ lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i+1)
+ > - tmp1 * njac(2,2,i+1)
+ > - tmp1 * dx2
+ lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i+1)
+ > - tmp1 * njac(2,3,i+1)
+ lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i+1)
+ > - tmp1 * njac(2,4,i+1)
+ lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i+1)
+ > - tmp1 * njac(2,5,i+1)
+
+ lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i+1)
+ > - tmp1 * njac(3,1,i+1)
+ lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i+1)
+ > - tmp1 * njac(3,2,i+1)
+ lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i+1)
+ > - tmp1 * njac(3,3,i+1)
+ > - tmp1 * dx3
+ lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i+1)
+ > - tmp1 * njac(3,4,i+1)
+ lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i+1)
+ > - tmp1 * njac(3,5,i+1)
+
+ lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i+1)
+ > - tmp1 * njac(4,1,i+1)
+ lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i+1)
+ > - tmp1 * njac(4,2,i+1)
+ lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i+1)
+ > - tmp1 * njac(4,3,i+1)
+ lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i+1)
+ > - tmp1 * njac(4,4,i+1)
+ > - tmp1 * dx4
+ lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i+1)
+ > - tmp1 * njac(4,5,i+1)
+
+ lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i+1)
+ > - tmp1 * njac(5,1,i+1)
+ lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i+1)
+ > - tmp1 * njac(5,2,i+1)
+ lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i+1)
+ > - tmp1 * njac(5,3,i+1)
+ lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i+1)
+ > - tmp1 * njac(5,4,i+1)
+ lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i+1)
+ > - tmp1 * njac(5,5,i+1)
+ > - tmp1 * dx5
+
+ enddo
+
+
+c---------------------------------------------------------------------
+c outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+ if (first .eq. 1) then
+
+c---------------------------------------------------------------------
+c multiply c(istart,j,k) by b_inverse and copy back to c
+c multiply rhs(istart) by b_inverse(istart) and copy to rhs
+c---------------------------------------------------------------------
+ call binvcrhs( lhsb(1,1,istart),
+ > lhsc(1,1,istart,j,k,c),
+ > rhs(1,istart,j,k,c) )
+
+ endif
+
+c---------------------------------------------------------------------
+c begin inner most do loop
+c do all the elements of the cell unless last
+c---------------------------------------------------------------------
+ do i=istart+first,isize-last
+
+c---------------------------------------------------------------------
+c rhs(i) = rhs(i) - A*rhs(i-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,i),
+ > rhs(1,i-1,j,k,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c B(i) = B(i) - C(i-1)*A(i)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,i),
+ > lhsc(1,1,i-1,j,k,c),
+ > lhsb(1,1,i))
+
+
+c---------------------------------------------------------------------
+c multiply c(i,j,k) by b_inverse and copy back to c
+c multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs
+c---------------------------------------------------------------------
+ call binvcrhs( lhsb(1,1,i),
+ > lhsc(1,1,i,j,k,c),
+ > rhs(1,i,j,k,c) )
+
+ enddo
+
+c---------------------------------------------------------------------
+c Now finish up special cases for last cell
+c---------------------------------------------------------------------
+ if (last .eq. 1) then
+
+c---------------------------------------------------------------------
+c rhs(isize) = rhs(isize) - A*rhs(isize-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,isize),
+ > rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c))
+
+c---------------------------------------------------------------------
+c B(isize) = B(isize) - C(isize-1)*A(isize)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,isize),
+ > lhsc(1,1,isize-1,j,k,c),
+ > lhsb(1,1,isize))
+
+c---------------------------------------------------------------------
+c multiply rhs() by b_inverse() and copy to rhs
+c---------------------------------------------------------------------
+ call binvrhs( lhsb(1,1,isize),
+ > rhs(1,isize,j,k,c) )
+
+ endif
+ enddo
+ enddo
+
+
+ return
+ end
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c Performs line solves in X direction by first factoring
+c the block-tridiagonal matrix into an upper triangular matrix,
+c and then performing back substitution to solve for the unknow
+c vectors of each line.
+c
+c Make sure we treat elements zero to cell_size in the direction
+c of the sweep.
+c
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+ integer c, istart, stage,
+ > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+ > isize,jsize,ksize,send_id
+
+ istart = 0
+
+c---------------------------------------------------------------------
+c in our terminology stage is the number of the cell in the x-direct
+c i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+ do stage = 1,ncells
+ c = slice(1,stage)
+ isize = cell_size(1,c) - 1
+ jsize = cell_size(2,c) - 1
+ ksize = cell_size(3,c) - 1
+
+c---------------------------------------------------------------------
+c set last-cell flag
+c---------------------------------------------------------------------
+ if (stage .eq. ncells) then
+ last = 1
+ else
+ last = 0
+ endif
+
+ if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+ first = 1
+c call lhsx(c)
+ call x_solve_cell(first,last,c)
+ else
+c---------------------------------------------------------------------
+c Not the first cell of this line, so receive info from
+c processor working on preceeding cell
+c---------------------------------------------------------------------
+ first = 0
+ call x_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c overlap computations and communications
+c---------------------------------------------------------------------
+c call lhsx(c)
+c---------------------------------------------------------------------
+c wait for completion
+c---------------------------------------------------------------------
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c install C'(istart) and rhs'(istart) to be used in this cell
+c---------------------------------------------------------------------
+ call x_unpack_solve_info(c)
+ call x_solve_cell(first,last,c)
+ endif
+
+ if (last .eq. 0) call x_send_solve_info(send_id,c)
+ enddo
+
+c---------------------------------------------------------------------
+c now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+ do stage = ncells, 1, -1
+ c = slice(1,stage)
+ first = 0
+ last = 0
+ if (stage .eq. 1) first = 1
+ if (stage .eq. ncells) then
+ last = 1
+c---------------------------------------------------------------------
+c last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+ call x_backsubstitute(first, last,c)
+ else
+ call x_receive_backsub_info(recv_id,c)
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+ call x_unpack_backsub_info(c)
+ call x_backsubstitute(first,last,c)
+ endif
+ if (first .eq. 0) call x_send_backsub_info(send_id,c)
+ enddo
+
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack C'(-1) and rhs'(-1) for
+c all j and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ integer j,k,m,n,ptr,c,istart
+
+ istart = 0
+ ptr = 0
+ do k=0,KMAX-1
+ do j=0,JMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ rhs(n,istart-1,j,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send C'(iend) and rhs'(iend) for
+c all j and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer j,k,m,n,isize,ptr,c,jp,kp
+ integer error,send_id,buffer_size
+
+ isize = cell_size(1,c)-1
+ jp = cell_coord(2,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c pack up buffer
+c---------------------------------------------------------------------
+ ptr = 0
+ do k=0,KMAX-1
+ do j=0,JMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,isize,j,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c send buffer
+c---------------------------------------------------------------------
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, successor(1),
+ > WEST+jp+kp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send U(istart) for all j and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer j,k,n,ptr,c,istart,jp,kp
+ integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c Send element 0 to previous processor
+c---------------------------------------------------------------------
+ istart = 0
+ jp = cell_coord(2,c)-1
+ kp = cell_coord(3,c)-1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ ptr = 0
+ do k=0,KMAX-1
+ do j=0,JMAX-1
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,istart,j,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, predecessor(1),
+ > EAST+jp+kp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack U(isize) for all j and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ integer j,k,n,ptr,c
+
+ ptr = 0
+ do k=0,KMAX-1
+ do j=0,JMAX-1
+ do n=1,BLOCK_SIZE
+ backsub_info(n,j,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer error,recv_id,jp,kp,c,buffer_size
+ jp = cell_coord(2,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, successor(1),
+ > EAST+jp+kp*NCELLS, comm_solve,
+ > recv_id, error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer jp,kp,recv_id,error,c,buffer_size
+ jp = cell_coord(2,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, predecessor(1),
+ > WEST+jp+kp*NCELLS, comm_solve,
+ > recv_id, error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c back solve: if last cell, then generate U(isize)=rhs(isize)
+c else assume U(isize) is loaded in un pack backsub_info
+c so just use it
+c after call u(istart) will be sent to next cell
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer first, last, c, i, j, k
+ integer m,n,isize,jsize,ksize,istart
+
+ istart = 0
+ isize = cell_size(1,c)-1
+ jsize = cell_size(2,c)-end(2,c)-1
+ ksize = cell_size(3,c)-end(3,c)-1
+ if (last .eq. 0) then
+ do k=start(3,c),ksize
+ do j=start(2,c),jsize
+c---------------------------------------------------------------------
+c U(isize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c)
+ > - lhsc(m,n,isize,j,k,c)*
+ > backsub_info(n,j,k,c)
+c---------------------------------------------------------------------
+c rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c)
+c $ - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c)
+c---------------------------------------------------------------------
+ enddo
+ enddo
+ enddo
+ enddo
+ endif
+ do k=start(3,c),ksize
+ do j=start(2,c),jsize
+ do i=isize-1,istart,-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
+ > - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c)
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c performs guaussian elimination on this cell.
+c
+c assumes that unpacking routines for non-first cells
+c preload C' and rhs' from previous cell.
+c
+c assumed send happens outside this routine, but that
+c c'(IMAX) and rhs'(IMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'work_lhs_vec.h'
+
+ integer first,last,c
+ integer i,j,k,m,n,isize,ksize,jsize,istart
+
+ istart = 0
+ isize = cell_size(1,c)-1
+ jsize = cell_size(2,c)-end(2,c)-1
+ ksize = cell_size(3,c)-end(3,c)-1
+
+c---------------------------------------------------------------------
+c zero the left hand side for starters
+c set diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+ do j = 0, jsize
+ do m = 1, 5
+ do n = 1, 5
+ lhsa(m,n,0,j) = 0.0d0
+ lhsb(m,n,0,j) = 0.0d0
+ lhsa(m,n,isize,j) = 0.0d0
+ lhsb(m,n,isize,j) = 0.0d0
+ enddo
+ lhsb(m,m,0,j) = 1.0d0
+ lhsb(m,m,isize,j) = 1.0d0
+ enddo
+ enddo
+
+ do k=start(3,c),ksize
+
+c---------------------------------------------------------------------
+c This function computes the left hand side in the xi-direction
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+ do j=start(2,c),jsize
+ do i = start(1,c)-1, cell_size(1,c) - end(1,c)
+
+ tmp1 = rho_i(i,j,k,c)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+c---------------------------------------------------------------------
+c
+c---------------------------------------------------------------------
+ fjac(1,1,i,j) = 0.0d+00
+ fjac(1,2,i,j) = 1.0d+00
+ fjac(1,3,i,j) = 0.0d+00
+ fjac(1,4,i,j) = 0.0d+00
+ fjac(1,5,i,j) = 0.0d+00
+
+ fjac(2,1,i,j) = -(u(2,i,j,k,c) * tmp2 *
+ > u(2,i,j,k,c))
+ > + c2 * qs(i,j,k,c)
+ fjac(2,2,i,j) = ( 2.0d+00 - c2 )
+ > * ( u(2,i,j,k,c) * tmp1 )
+ fjac(2,3,i,j) = - c2 * ( u(3,i,j,k,c) * tmp1 )
+ fjac(2,4,i,j) = - c2 * ( u(4,i,j,k,c) * tmp1 )
+ fjac(2,5,i,j) = c2
+
+ fjac(3,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2
+ fjac(3,2,i,j) = u(3,i,j,k,c) * tmp1
+ fjac(3,3,i,j) = u(2,i,j,k,c) * tmp1
+ fjac(3,4,i,j) = 0.0d+00
+ fjac(3,5,i,j) = 0.0d+00
+
+ fjac(4,1,i,j) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2
+ fjac(4,2,i,j) = u(4,i,j,k,c) * tmp1
+ fjac(4,3,i,j) = 0.0d+00
+ fjac(4,4,i,j) = u(2,i,j,k,c) * tmp1
+ fjac(4,5,i,j) = 0.0d+00
+
+ fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c)
+ > - c1 * ( u(5,i,j,k,c) * tmp1 ) )
+ > * ( u(2,i,j,k,c) * tmp1 )
+ fjac(5,2,i,j) = c1 * u(5,i,j,k,c) * tmp1
+ > - c2
+ > * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2
+ > + qs(i,j,k,c) )
+ fjac(5,3,i,j) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) )
+ > * tmp2
+ fjac(5,4,i,j) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) )
+ > * tmp2
+ fjac(5,5,i,j) = c1 * ( u(2,i,j,k,c) * tmp1 )
+
+ njac(1,1,i,j) = 0.0d+00
+ njac(1,2,i,j) = 0.0d+00
+ njac(1,3,i,j) = 0.0d+00
+ njac(1,4,i,j) = 0.0d+00
+ njac(1,5,i,j) = 0.0d+00
+
+ njac(2,1,i,j) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c)
+ njac(2,2,i,j) = con43 * c3c4 * tmp1
+ njac(2,3,i,j) = 0.0d+00
+ njac(2,4,i,j) = 0.0d+00
+ njac(2,5,i,j) = 0.0d+00
+
+ njac(3,1,i,j) = - c3c4 * tmp2 * u(3,i,j,k,c)
+ njac(3,2,i,j) = 0.0d+00
+ njac(3,3,i,j) = c3c4 * tmp1
+ njac(3,4,i,j) = 0.0d+00
+ njac(3,5,i,j) = 0.0d+00
+
+ njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c)
+ njac(4,2,i,j) = 0.0d+00
+ njac(4,3,i,j) = 0.0d+00
+ njac(4,4,i,j) = c3c4 * tmp1
+ njac(4,5,i,j) = 0.0d+00
+
+ njac(5,1,i,j) = - ( con43 * c3c4
+ > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
+ > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
+ > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
+ > - c1345 * tmp2 * u(5,i,j,k,c)
+
+ njac(5,2,i,j) = ( con43 * c3c4
+ > - c1345 ) * tmp2 * u(2,i,j,k,c)
+ njac(5,3,i,j) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
+ njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
+ njac(5,5,i,j) = ( c1345 ) * tmp1
+
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c now jacobians set, so form left hand side in x direction
+c---------------------------------------------------------------------
+ do j=start(2,c),jsize
+ do i = start(1,c), isize - end(1,c)
+
+ tmp1 = dt * tx1
+ tmp2 = dt * tx2
+
+ lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i-1,j)
+ > - tmp1 * njac(1,1,i-1,j)
+ > - tmp1 * dx1
+ lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i-1,j)
+ > - tmp1 * njac(1,2,i-1,j)
+ lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i-1,j)
+ > - tmp1 * njac(1,3,i-1,j)
+ lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i-1,j)
+ > - tmp1 * njac(1,4,i-1,j)
+ lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i-1,j)
+ > - tmp1 * njac(1,5,i-1,j)
+
+ lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i-1,j)
+ > - tmp1 * njac(2,1,i-1,j)
+ lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i-1,j)
+ > - tmp1 * njac(2,2,i-1,j)
+ > - tmp1 * dx2
+ lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i-1,j)
+ > - tmp1 * njac(2,3,i-1,j)
+ lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i-1,j)
+ > - tmp1 * njac(2,4,i-1,j)
+ lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i-1,j)
+ > - tmp1 * njac(2,5,i-1,j)
+
+ lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i-1,j)
+ > - tmp1 * njac(3,1,i-1,j)
+ lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i-1,j)
+ > - tmp1 * njac(3,2,i-1,j)
+ lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i-1,j)
+ > - tmp1 * njac(3,3,i-1,j)
+ > - tmp1 * dx3
+ lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i-1,j)
+ > - tmp1 * njac(3,4,i-1,j)
+ lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i-1,j)
+ > - tmp1 * njac(3,5,i-1,j)
+
+ lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i-1,j)
+ > - tmp1 * njac(4,1,i-1,j)
+ lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i-1,j)
+ > - tmp1 * njac(4,2,i-1,j)
+ lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i-1,j)
+ > - tmp1 * njac(4,3,i-1,j)
+ lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i-1,j)
+ > - tmp1 * njac(4,4,i-1,j)
+ > - tmp1 * dx4
+ lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i-1,j)
+ > - tmp1 * njac(4,5,i-1,j)
+
+ lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i-1,j)
+ > - tmp1 * njac(5,1,i-1,j)
+ lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i-1,j)
+ > - tmp1 * njac(5,2,i-1,j)
+ lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i-1,j)
+ > - tmp1 * njac(5,3,i-1,j)
+ lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i-1,j)
+ > - tmp1 * njac(5,4,i-1,j)
+ lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i-1,j)
+ > - tmp1 * njac(5,5,i-1,j)
+ > - tmp1 * dx5
+
+ lhsb(1,1,i,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(1,1,i,j)
+ > + tmp1 * 2.0d+00 * dx1
+ lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j)
+ lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j)
+ lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j)
+ lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j)
+
+ lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j)
+ lhsb(2,2,i,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(2,2,i,j)
+ > + tmp1 * 2.0d+00 * dx2
+ lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j)
+ lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j)
+ lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j)
+
+ lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j)
+ lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j)
+ lhsb(3,3,i,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(3,3,i,j)
+ > + tmp1 * 2.0d+00 * dx3
+ lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j)
+ lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j)
+
+ lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j)
+ lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j)
+ lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j)
+ lhsb(4,4,i,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(4,4,i,j)
+ > + tmp1 * 2.0d+00 * dx4
+ lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j)
+
+ lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j)
+ lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j)
+ lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j)
+ lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j)
+ lhsb(5,5,i,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(5,5,i,j)
+ > + tmp1 * 2.0d+00 * dx5
+
+ lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i+1,j)
+ > - tmp1 * njac(1,1,i+1,j)
+ > - tmp1 * dx1
+ lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i+1,j)
+ > - tmp1 * njac(1,2,i+1,j)
+ lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i+1,j)
+ > - tmp1 * njac(1,3,i+1,j)
+ lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i+1,j)
+ > - tmp1 * njac(1,4,i+1,j)
+ lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i+1,j)
+ > - tmp1 * njac(1,5,i+1,j)
+
+ lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i+1,j)
+ > - tmp1 * njac(2,1,i+1,j)
+ lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i+1,j)
+ > - tmp1 * njac(2,2,i+1,j)
+ > - tmp1 * dx2
+ lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i+1,j)
+ > - tmp1 * njac(2,3,i+1,j)
+ lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i+1,j)
+ > - tmp1 * njac(2,4,i+1,j)
+ lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i+1,j)
+ > - tmp1 * njac(2,5,i+1,j)
+
+ lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i+1,j)
+ > - tmp1 * njac(3,1,i+1,j)
+ lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i+1,j)
+ > - tmp1 * njac(3,2,i+1,j)
+ lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i+1,j)
+ > - tmp1 * njac(3,3,i+1,j)
+ > - tmp1 * dx3
+ lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i+1,j)
+ > - tmp1 * njac(3,4,i+1,j)
+ lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i+1,j)
+ > - tmp1 * njac(3,5,i+1,j)
+
+ lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i+1,j)
+ > - tmp1 * njac(4,1,i+1,j)
+ lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i+1,j)
+ > - tmp1 * njac(4,2,i+1,j)
+ lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i+1,j)
+ > - tmp1 * njac(4,3,i+1,j)
+ lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i+1,j)
+ > - tmp1 * njac(4,4,i+1,j)
+ > - tmp1 * dx4
+ lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i+1,j)
+ > - tmp1 * njac(4,5,i+1,j)
+
+ lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i+1,j)
+ > - tmp1 * njac(5,1,i+1,j)
+ lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i+1,j)
+ > - tmp1 * njac(5,2,i+1,j)
+ lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i+1,j)
+ > - tmp1 * njac(5,3,i+1,j)
+ lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i+1,j)
+ > - tmp1 * njac(5,4,i+1,j)
+ lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i+1,j)
+ > - tmp1 * njac(5,5,i+1,j)
+ > - tmp1 * dx5
+
+ enddo
+ enddo
+
+
+c---------------------------------------------------------------------
+c outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+ if (first .eq. 1) then
+
+c---------------------------------------------------------------------
+c multiply c(istart,j,k) by b_inverse and copy back to c
+c multiply rhs(istart) by b_inverse(istart) and copy to rhs
+c---------------------------------------------------------------------
+!dir$ ivdep
+ do j=start(2,c),jsize
+ call binvcrhs( lhsb(1,1,istart,j),
+ > lhsc(1,1,istart,j,k,c),
+ > rhs(1,istart,j,k,c) )
+ enddo
+
+ endif
+
+c---------------------------------------------------------------------
+c begin inner most do loop
+c do all the elements of the cell unless last
+c---------------------------------------------------------------------
+!dir$ ivdep
+!dir$ interchange(i,j)
+ do j=start(2,c),jsize
+ do i=istart+first,isize-last
+
+c---------------------------------------------------------------------
+c rhs(i) = rhs(i) - A*rhs(i-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,i,j),
+ > rhs(1,i-1,j,k,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c B(i) = B(i) - C(i-1)*A(i)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,i,j),
+ > lhsc(1,1,i-1,j,k,c),
+ > lhsb(1,1,i,j))
+
+
+c---------------------------------------------------------------------
+c multiply c(i,j,k) by b_inverse and copy back to c
+c multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs
+c---------------------------------------------------------------------
+ call binvcrhs( lhsb(1,1,i,j),
+ > lhsc(1,1,i,j,k,c),
+ > rhs(1,i,j,k,c) )
+
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c Now finish up special cases for last cell
+c---------------------------------------------------------------------
+ if (last .eq. 1) then
+
+!dir$ ivdep
+ do j=start(2,c),jsize
+c---------------------------------------------------------------------
+c rhs(isize) = rhs(isize) - A*rhs(isize-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,isize,j),
+ > rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c))
+
+c---------------------------------------------------------------------
+c B(isize) = B(isize) - C(isize-1)*A(isize)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,isize,j),
+ > lhsc(1,1,isize-1,j,k,c),
+ > lhsb(1,1,isize,j))
+
+c---------------------------------------------------------------------
+c multiply rhs() by b_inverse() and copy to rhs
+c---------------------------------------------------------------------
+ call binvrhs( lhsb(1,1,isize,j),
+ > rhs(1,isize,j,k,c) )
+ enddo
+
+ endif
+ enddo
+
+
+ return
+ end
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Performs line solves in Y direction by first factoring
+c the block-tridiagonal matrix into an upper triangular matrix,
+c and then performing back substitution to solve for the unknow
+c vectors of each line.
+c
+c Make sure we treat elements zero to cell_size in the direction
+c of the sweep.
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer
+ > c, jstart, stage,
+ > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+ > isize,jsize,ksize,send_id
+
+ jstart = 0
+
+c---------------------------------------------------------------------
+c in our terminology stage is the number of the cell in the y-direction
+c i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+ do stage = 1,ncells
+ c = slice(2,stage)
+ isize = cell_size(1,c) - 1
+ jsize = cell_size(2,c) - 1
+ ksize = cell_size(3,c) - 1
+
+c---------------------------------------------------------------------
+c set last-cell flag
+c---------------------------------------------------------------------
+ if (stage .eq. ncells) then
+ last = 1
+ else
+ last = 0
+ endif
+
+ if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+ first = 1
+c call lhsy(c)
+ call y_solve_cell(first,last,c)
+ else
+c---------------------------------------------------------------------
+c Not the first cell of this line, so receive info from
+c processor working on preceeding cell
+c---------------------------------------------------------------------
+ first = 0
+ call y_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c overlap computations and communications
+c---------------------------------------------------------------------
+c call lhsy(c)
+c---------------------------------------------------------------------
+c wait for completion
+c---------------------------------------------------------------------
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c install C'(jstart+1) and rhs'(jstart+1) to be used in this cell
+c---------------------------------------------------------------------
+ call y_unpack_solve_info(c)
+ call y_solve_cell(first,last,c)
+ endif
+
+ if (last .eq. 0) call y_send_solve_info(send_id,c)
+ enddo
+
+c---------------------------------------------------------------------
+c now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+ do stage = ncells, 1, -1
+ c = slice(2,stage)
+ first = 0
+ last = 0
+ if (stage .eq. 1) first = 1
+ if (stage .eq. ncells) then
+ last = 1
+c---------------------------------------------------------------------
+c last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+ call y_backsubstitute(first, last,c)
+ else
+ call y_receive_backsub_info(recv_id,c)
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+ call y_unpack_backsub_info(c)
+ call y_backsubstitute(first,last,c)
+ endif
+ if (first .eq. 0) call y_send_backsub_info(send_id,c)
+ enddo
+
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack C'(-1) and rhs'(-1) for
+c all i and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer i,k,m,n,ptr,c,jstart
+
+ jstart = 0
+ ptr = 0
+ do k=0,KMAX-1
+ do i=0,IMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send C'(jend) and rhs'(jend) for
+c all i and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i,k,m,n,jsize,ptr,c,ip,kp
+ integer error,send_id,buffer_size
+
+ jsize = cell_size(2,c)-1
+ ip = cell_coord(1,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c pack up buffer
+c---------------------------------------------------------------------
+ ptr = 0
+ do k=0,KMAX-1
+ do i=0,IMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,i,jsize,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c send buffer
+c---------------------------------------------------------------------
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, successor(2),
+ > SOUTH+ip+kp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send U(jstart) for all i and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i,k,n,ptr,c,jstart,ip,kp
+ integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c Send element 0 to previous processor
+c---------------------------------------------------------------------
+ jstart = 0
+ ip = cell_coord(1,c)-1
+ kp = cell_coord(3,c)-1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ ptr = 0
+ do k=0,KMAX-1
+ do i=0,IMAX-1
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,i,jstart,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, predecessor(2),
+ > NORTH+ip+kp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack U(jsize) for all i and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer i,k,n,ptr,c
+
+ ptr = 0
+ do k=0,KMAX-1
+ do i=0,IMAX-1
+ do n=1,BLOCK_SIZE
+ backsub_info(n,i,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer error,recv_id,ip,kp,c,buffer_size
+ ip = cell_coord(1,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, successor(2),
+ > NORTH+ip+kp*NCELLS, comm_solve,
+ > recv_id, error)
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer ip,kp,recv_id,error,c,buffer_size
+ ip = cell_coord(1,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, predecessor(2),
+ > SOUTH+ip+kp*NCELLS, comm_solve,
+ > recv_id, error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c back solve: if last cell, then generate U(jsize)=rhs(jsize)
+c else assume U(jsize) is loaded in un pack backsub_info
+c so just use it
+c after call u(jstart) will be sent to next cell
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer first, last, c, i, k
+ integer m,n,j,jsize,isize,ksize,jstart
+
+ jstart = 0
+ isize = cell_size(1,c)-end(1,c)-1
+ jsize = cell_size(2,c)-1
+ ksize = cell_size(3,c)-end(3,c)-1
+ if (last .eq. 0) then
+ do k=start(3,c),ksize
+ do i=start(1,c),isize
+c---------------------------------------------------------------------
+c U(jsize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c)
+ > - lhsc(m,n,i,jsize,k,c)*
+ > backsub_info(n,i,k,c)
+ enddo
+ enddo
+ enddo
+ enddo
+ endif
+ do k=start(3,c),ksize
+ do j=jsize-1,jstart,-1
+ do i=start(1,c),isize
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
+ > - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c)
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c performs guaussian elimination on this cell.
+c
+c assumes that unpacking routines for non-first cells
+c preload C' and rhs' from previous cell.
+c
+c assumed send happens outside this routine, but that
+c c'(JMAX) and rhs'(JMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'work_lhs.h'
+
+ integer first,last,c
+ integer i,j,k,isize,ksize,jsize,jstart
+ double precision utmp(6,-2:JMAX+1)
+
+ jstart = 0
+ isize = cell_size(1,c)-end(1,c)-1
+ jsize = cell_size(2,c)-1
+ ksize = cell_size(3,c)-end(3,c)-1
+
+ call lhsabinit(lhsa, lhsb, jsize)
+
+ do k=start(3,c),ksize
+ do i=start(1,c),isize
+
+c---------------------------------------------------------------------
+c This function computes the left hand side for the three y-factors
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Compute the indices for storing the tri-diagonal matrix;
+c determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+ do j = start(2,c)-1, cell_size(2,c)-end(2,c)
+ utmp(1,j) = 1.0d0 / u(1,i,j,k,c)
+ utmp(2,j) = u(2,i,j,k,c)
+ utmp(3,j) = u(3,i,j,k,c)
+ utmp(4,j) = u(4,i,j,k,c)
+ utmp(5,j) = u(5,i,j,k,c)
+ utmp(6,j) = qs(i,j,k,c)
+ end do
+
+ do j = start(2,c)-1, cell_size(2,c)-end(2,c)
+
+ tmp1 = utmp(1,j)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ fjac(1,1,j) = 0.0d+00
+ fjac(1,2,j) = 0.0d+00
+ fjac(1,3,j) = 1.0d+00
+ fjac(1,4,j) = 0.0d+00
+ fjac(1,5,j) = 0.0d+00
+
+ fjac(2,1,j) = - ( utmp(2,j)*utmp(3,j) )
+ > * tmp2
+ fjac(2,2,j) = utmp(3,j) * tmp1
+ fjac(2,3,j) = utmp(2,j) * tmp1
+ fjac(2,4,j) = 0.0d+00
+ fjac(2,5,j) = 0.0d+00
+
+ fjac(3,1,j) = - ( utmp(3,j)*utmp(3,j)*tmp2)
+ > + c2 * utmp(6,j)
+ fjac(3,2,j) = - c2 * utmp(2,j) * tmp1
+ fjac(3,3,j) = ( 2.0d+00 - c2 )
+ > * utmp(3,j) * tmp1
+ fjac(3,4,j) = - c2 * utmp(4,j) * tmp1
+ fjac(3,5,j) = c2
+
+ fjac(4,1,j) = - ( utmp(3,j)*utmp(4,j) )
+ > * tmp2
+ fjac(4,2,j) = 0.0d+00
+ fjac(4,3,j) = utmp(4,j) * tmp1
+ fjac(4,4,j) = utmp(3,j) * tmp1
+ fjac(4,5,j) = 0.0d+00
+
+ fjac(5,1,j) = ( c2 * 2.0d0 * utmp(6,j)
+ > - c1 * utmp(5,j) * tmp1 )
+ > * utmp(3,j) * tmp1
+ fjac(5,2,j) = - c2 * utmp(2,j)*utmp(3,j)
+ > * tmp2
+ fjac(5,3,j) = c1 * utmp(5,j) * tmp1
+ > - c2 * ( utmp(6,j)
+ > + utmp(3,j)*utmp(3,j) * tmp2 )
+ fjac(5,4,j) = - c2 * ( utmp(3,j)*utmp(4,j) )
+ > * tmp2
+ fjac(5,5,j) = c1 * utmp(3,j) * tmp1
+
+ njac(1,1,j) = 0.0d+00
+ njac(1,2,j) = 0.0d+00
+ njac(1,3,j) = 0.0d+00
+ njac(1,4,j) = 0.0d+00
+ njac(1,5,j) = 0.0d+00
+
+ njac(2,1,j) = - c3c4 * tmp2 * utmp(2,j)
+ njac(2,2,j) = c3c4 * tmp1
+ njac(2,3,j) = 0.0d+00
+ njac(2,4,j) = 0.0d+00
+ njac(2,5,j) = 0.0d+00
+
+ njac(3,1,j) = - con43 * c3c4 * tmp2 * utmp(3,j)
+ njac(3,2,j) = 0.0d+00
+ njac(3,3,j) = con43 * c3c4 * tmp1
+ njac(3,4,j) = 0.0d+00
+ njac(3,5,j) = 0.0d+00
+
+ njac(4,1,j) = - c3c4 * tmp2 * utmp(4,j)
+ njac(4,2,j) = 0.0d+00
+ njac(4,3,j) = 0.0d+00
+ njac(4,4,j) = c3c4 * tmp1
+ njac(4,5,j) = 0.0d+00
+
+ njac(5,1,j) = - ( c3c4
+ > - c1345 ) * tmp3 * (utmp(2,j)**2)
+ > - ( con43 * c3c4
+ > - c1345 ) * tmp3 * (utmp(3,j)**2)
+ > - ( c3c4 - c1345 ) * tmp3 * (utmp(4,j)**2)
+ > - c1345 * tmp2 * utmp(5,j)
+
+ njac(5,2,j) = ( c3c4 - c1345 ) * tmp2 * utmp(2,j)
+ njac(5,3,j) = ( con43 * c3c4
+ > - c1345 ) * tmp2 * utmp(3,j)
+ njac(5,4,j) = ( c3c4 - c1345 ) * tmp2 * utmp(4,j)
+ njac(5,5,j) = ( c1345 ) * tmp1
+
+ enddo
+
+c---------------------------------------------------------------------
+c now joacobians set, so form left hand side in y direction
+c---------------------------------------------------------------------
+ do j = start(2,c), jsize-end(2,c)
+
+ tmp1 = dt * ty1
+ tmp2 = dt * ty2
+
+ lhsa(1,1,j) = - tmp2 * fjac(1,1,j-1)
+ > - tmp1 * njac(1,1,j-1)
+ > - tmp1 * dy1
+ lhsa(1,2,j) = - tmp2 * fjac(1,2,j-1)
+ > - tmp1 * njac(1,2,j-1)
+ lhsa(1,3,j) = - tmp2 * fjac(1,3,j-1)
+ > - tmp1 * njac(1,3,j-1)
+ lhsa(1,4,j) = - tmp2 * fjac(1,4,j-1)
+ > - tmp1 * njac(1,4,j-1)
+ lhsa(1,5,j) = - tmp2 * fjac(1,5,j-1)
+ > - tmp1 * njac(1,5,j-1)
+
+ lhsa(2,1,j) = - tmp2 * fjac(2,1,j-1)
+ > - tmp1 * njac(2,1,j-1)
+ lhsa(2,2,j) = - tmp2 * fjac(2,2,j-1)
+ > - tmp1 * njac(2,2,j-1)
+ > - tmp1 * dy2
+ lhsa(2,3,j) = - tmp2 * fjac(2,3,j-1)
+ > - tmp1 * njac(2,3,j-1)
+ lhsa(2,4,j) = - tmp2 * fjac(2,4,j-1)
+ > - tmp1 * njac(2,4,j-1)
+ lhsa(2,5,j) = - tmp2 * fjac(2,5,j-1)
+ > - tmp1 * njac(2,5,j-1)
+
+ lhsa(3,1,j) = - tmp2 * fjac(3,1,j-1)
+ > - tmp1 * njac(3,1,j-1)
+ lhsa(3,2,j) = - tmp2 * fjac(3,2,j-1)
+ > - tmp1 * njac(3,2,j-1)
+ lhsa(3,3,j) = - tmp2 * fjac(3,3,j-1)
+ > - tmp1 * njac(3,3,j-1)
+ > - tmp1 * dy3
+ lhsa(3,4,j) = - tmp2 * fjac(3,4,j-1)
+ > - tmp1 * njac(3,4,j-1)
+ lhsa(3,5,j) = - tmp2 * fjac(3,5,j-1)
+ > - tmp1 * njac(3,5,j-1)
+
+ lhsa(4,1,j) = - tmp2 * fjac(4,1,j-1)
+ > - tmp1 * njac(4,1,j-1)
+ lhsa(4,2,j) = - tmp2 * fjac(4,2,j-1)
+ > - tmp1 * njac(4,2,j-1)
+ lhsa(4,3,j) = - tmp2 * fjac(4,3,j-1)
+ > - tmp1 * njac(4,3,j-1)
+ lhsa(4,4,j) = - tmp2 * fjac(4,4,j-1)
+ > - tmp1 * njac(4,4,j-1)
+ > - tmp1 * dy4
+ lhsa(4,5,j) = - tmp2 * fjac(4,5,j-1)
+ > - tmp1 * njac(4,5,j-1)
+
+ lhsa(5,1,j) = - tmp2 * fjac(5,1,j-1)
+ > - tmp1 * njac(5,1,j-1)
+ lhsa(5,2,j) = - tmp2 * fjac(5,2,j-1)
+ > - tmp1 * njac(5,2,j-1)
+ lhsa(5,3,j) = - tmp2 * fjac(5,3,j-1)
+ > - tmp1 * njac(5,3,j-1)
+ lhsa(5,4,j) = - tmp2 * fjac(5,4,j-1)
+ > - tmp1 * njac(5,4,j-1)
+ lhsa(5,5,j) = - tmp2 * fjac(5,5,j-1)
+ > - tmp1 * njac(5,5,j-1)
+ > - tmp1 * dy5
+
+ lhsb(1,1,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(1,1,j)
+ > + tmp1 * 2.0d+00 * dy1
+ lhsb(1,2,j) = tmp1 * 2.0d+00 * njac(1,2,j)
+ lhsb(1,3,j) = tmp1 * 2.0d+00 * njac(1,3,j)
+ lhsb(1,4,j) = tmp1 * 2.0d+00 * njac(1,4,j)
+ lhsb(1,5,j) = tmp1 * 2.0d+00 * njac(1,5,j)
+
+ lhsb(2,1,j) = tmp1 * 2.0d+00 * njac(2,1,j)
+ lhsb(2,2,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(2,2,j)
+ > + tmp1 * 2.0d+00 * dy2
+ lhsb(2,3,j) = tmp1 * 2.0d+00 * njac(2,3,j)
+ lhsb(2,4,j) = tmp1 * 2.0d+00 * njac(2,4,j)
+ lhsb(2,5,j) = tmp1 * 2.0d+00 * njac(2,5,j)
+
+ lhsb(3,1,j) = tmp1 * 2.0d+00 * njac(3,1,j)
+ lhsb(3,2,j) = tmp1 * 2.0d+00 * njac(3,2,j)
+ lhsb(3,3,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(3,3,j)
+ > + tmp1 * 2.0d+00 * dy3
+ lhsb(3,4,j) = tmp1 * 2.0d+00 * njac(3,4,j)
+ lhsb(3,5,j) = tmp1 * 2.0d+00 * njac(3,5,j)
+
+ lhsb(4,1,j) = tmp1 * 2.0d+00 * njac(4,1,j)
+ lhsb(4,2,j) = tmp1 * 2.0d+00 * njac(4,2,j)
+ lhsb(4,3,j) = tmp1 * 2.0d+00 * njac(4,3,j)
+ lhsb(4,4,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(4,4,j)
+ > + tmp1 * 2.0d+00 * dy4
+ lhsb(4,5,j) = tmp1 * 2.0d+00 * njac(4,5,j)
+
+ lhsb(5,1,j) = tmp1 * 2.0d+00 * njac(5,1,j)
+ lhsb(5,2,j) = tmp1 * 2.0d+00 * njac(5,2,j)
+ lhsb(5,3,j) = tmp1 * 2.0d+00 * njac(5,3,j)
+ lhsb(5,4,j) = tmp1 * 2.0d+00 * njac(5,4,j)
+ lhsb(5,5,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(5,5,j)
+ > + tmp1 * 2.0d+00 * dy5
+
+ lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,j+1)
+ > - tmp1 * njac(1,1,j+1)
+ > - tmp1 * dy1
+ lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,j+1)
+ > - tmp1 * njac(1,2,j+1)
+ lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,j+1)
+ > - tmp1 * njac(1,3,j+1)
+ lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,j+1)
+ > - tmp1 * njac(1,4,j+1)
+ lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,j+1)
+ > - tmp1 * njac(1,5,j+1)
+
+ lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,j+1)
+ > - tmp1 * njac(2,1,j+1)
+ lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,j+1)
+ > - tmp1 * njac(2,2,j+1)
+ > - tmp1 * dy2
+ lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,j+1)
+ > - tmp1 * njac(2,3,j+1)
+ lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,j+1)
+ > - tmp1 * njac(2,4,j+1)
+ lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,j+1)
+ > - tmp1 * njac(2,5,j+1)
+
+ lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,j+1)
+ > - tmp1 * njac(3,1,j+1)
+ lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,j+1)
+ > - tmp1 * njac(3,2,j+1)
+ lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,j+1)
+ > - tmp1 * njac(3,3,j+1)
+ > - tmp1 * dy3
+ lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,j+1)
+ > - tmp1 * njac(3,4,j+1)
+ lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,j+1)
+ > - tmp1 * njac(3,5,j+1)
+
+ lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,j+1)
+ > - tmp1 * njac(4,1,j+1)
+ lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,j+1)
+ > - tmp1 * njac(4,2,j+1)
+ lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,j+1)
+ > - tmp1 * njac(4,3,j+1)
+ lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,j+1)
+ > - tmp1 * njac(4,4,j+1)
+ > - tmp1 * dy4
+ lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,j+1)
+ > - tmp1 * njac(4,5,j+1)
+
+ lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,j+1)
+ > - tmp1 * njac(5,1,j+1)
+ lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,j+1)
+ > - tmp1 * njac(5,2,j+1)
+ lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,j+1)
+ > - tmp1 * njac(5,3,j+1)
+ lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,j+1)
+ > - tmp1 * njac(5,4,j+1)
+ lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,j+1)
+ > - tmp1 * njac(5,5,j+1)
+ > - tmp1 * dy5
+
+ enddo
+
+
+c---------------------------------------------------------------------
+c outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+ if (first .eq. 1) then
+
+c---------------------------------------------------------------------
+c multiply c(i,jstart,k) by b_inverse and copy back to c
+c multiply rhs(jstart) by b_inverse(jstart) and copy to rhs
+c---------------------------------------------------------------------
+ call binvcrhs( lhsb(1,1,jstart),
+ > lhsc(1,1,i,jstart,k,c),
+ > rhs(1,i,jstart,k,c) )
+
+ endif
+
+c---------------------------------------------------------------------
+c begin inner most do loop
+c do all the elements of the cell unless last
+c---------------------------------------------------------------------
+ do j=jstart+first,jsize-last
+
+c---------------------------------------------------------------------
+c subtract A*lhs_vector(j-1) from lhs_vector(j)
+c
+c rhs(j) = rhs(j) - A*rhs(j-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,j),
+ > rhs(1,i,j-1,k,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c B(j) = B(j) - C(j-1)*A(j)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,j),
+ > lhsc(1,1,i,j-1,k,c),
+ > lhsb(1,1,j))
+
+c---------------------------------------------------------------------
+c multiply c(i,j,k) by b_inverse and copy back to c
+c multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs
+c---------------------------------------------------------------------
+ call binvcrhs( lhsb(1,1,j),
+ > lhsc(1,1,i,j,k,c),
+ > rhs(1,i,j,k,c) )
+
+ enddo
+
+c---------------------------------------------------------------------
+c Now finish up special cases for last cell
+c---------------------------------------------------------------------
+ if (last .eq. 1) then
+
+c---------------------------------------------------------------------
+c rhs(jsize) = rhs(jsize) - A*rhs(jsize-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,jsize),
+ > rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c))
+
+c---------------------------------------------------------------------
+c B(jsize) = B(jsize) - C(jsize-1)*A(jsize)
+c call matmul_sub(aa,i,jsize,k,c,
+c $ cc,i,jsize-1,k,c,bb,i,jsize,k,c)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,jsize),
+ > lhsc(1,1,i,jsize-1,k,c),
+ > lhsb(1,1,jsize))
+
+c---------------------------------------------------------------------
+c multiply rhs(jsize) by b_inverse(jsize) and copy to rhs
+c---------------------------------------------------------------------
+ call binvrhs( lhsb(1,1,jsize),
+ > rhs(1,i,jsize,k,c) )
+
+ endif
+ enddo
+ enddo
+
+
+ return
+ end
+
+
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Performs line solves in Y direction by first factoring
+c the block-tridiagonal matrix into an upper triangular matrix,
+c and then performing back substitution to solve for the unknow
+c vectors of each line.
+c
+c Make sure we treat elements zero to cell_size in the direction
+c of the sweep.
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer
+ > c, jstart, stage,
+ > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+ > isize,jsize,ksize,send_id
+
+ jstart = 0
+
+c---------------------------------------------------------------------
+c in our terminology stage is the number of the cell in the y-direct
+c i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+ do stage = 1,ncells
+ c = slice(2,stage)
+ isize = cell_size(1,c) - 1
+ jsize = cell_size(2,c) - 1
+ ksize = cell_size(3,c) - 1
+
+c---------------------------------------------------------------------
+c set last-cell flag
+c---------------------------------------------------------------------
+ if (stage .eq. ncells) then
+ last = 1
+ else
+ last = 0
+ endif
+
+ if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+ first = 1
+c call lhsy(c)
+ call y_solve_cell(first,last,c)
+ else
+c---------------------------------------------------------------------
+c Not the first cell of this line, so receive info from
+c processor working on preceeding cell
+c---------------------------------------------------------------------
+ first = 0
+ call y_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c overlap computations and communications
+c---------------------------------------------------------------------
+c call lhsy(c)
+c---------------------------------------------------------------------
+c wait for completion
+c---------------------------------------------------------------------
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c install C'(jstart+1) and rhs'(jstart+1) to be used in this cell
+c---------------------------------------------------------------------
+ call y_unpack_solve_info(c)
+ call y_solve_cell(first,last,c)
+ endif
+
+ if (last .eq. 0) call y_send_solve_info(send_id,c)
+ enddo
+
+c---------------------------------------------------------------------
+c now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+ do stage = ncells, 1, -1
+ c = slice(2,stage)
+ first = 0
+ last = 0
+ if (stage .eq. 1) first = 1
+ if (stage .eq. ncells) then
+ last = 1
+c---------------------------------------------------------------------
+c last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+ call y_backsubstitute(first, last,c)
+ else
+ call y_receive_backsub_info(recv_id,c)
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+ call y_unpack_backsub_info(c)
+ call y_backsubstitute(first,last,c)
+ endif
+ if (first .eq. 0) call y_send_backsub_info(send_id,c)
+ enddo
+
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack C'(-1) and rhs'(-1) for
+c all i and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer i,k,m,n,ptr,c,jstart
+
+ jstart = 0
+ ptr = 0
+ do k=0,KMAX-1
+ do i=0,IMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send C'(jend) and rhs'(jend) for
+c all i and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i,k,m,n,jsize,ptr,c,ip,kp
+ integer error,send_id,buffer_size
+
+ jsize = cell_size(2,c)-1
+ ip = cell_coord(1,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c pack up buffer
+c---------------------------------------------------------------------
+ ptr = 0
+ do k=0,KMAX-1
+ do i=0,IMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,i,jsize,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c send buffer
+c---------------------------------------------------------------------
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, successor(2),
+ > SOUTH+ip+kp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send U(jstart) for all i and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i,k,n,ptr,c,jstart,ip,kp
+ integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c Send element 0 to previous processor
+c---------------------------------------------------------------------
+ jstart = 0
+ ip = cell_coord(1,c)-1
+ kp = cell_coord(3,c)-1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ ptr = 0
+ do k=0,KMAX-1
+ do i=0,IMAX-1
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,i,jstart,k,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, predecessor(2),
+ > NORTH+ip+kp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack U(jsize) for all i and k
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer i,k,n,ptr,c
+
+ ptr = 0
+ do k=0,KMAX-1
+ do i=0,IMAX-1
+ do n=1,BLOCK_SIZE
+ backsub_info(n,i,k,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer error,recv_id,ip,kp,c,buffer_size
+ ip = cell_coord(1,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, successor(2),
+ > NORTH+ip+kp*NCELLS, comm_solve,
+ > recv_id, error)
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer ip,kp,recv_id,error,c,buffer_size
+ ip = cell_coord(1,c) - 1
+ kp = cell_coord(3,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, predecessor(2),
+ > SOUTH+ip+kp*NCELLS, comm_solve,
+ > recv_id, error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c back solve: if last cell, then generate U(jsize)=rhs(jsize)
+c else assume U(jsize) is loaded in un pack backsub_info
+c so just use it
+c after call u(jstart) will be sent to next cell
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer first, last, c, i, k
+ integer m,n,j,jsize,isize,ksize,jstart
+
+ jstart = 0
+ isize = cell_size(1,c)-end(1,c)-1
+ jsize = cell_size(2,c)-1
+ ksize = cell_size(3,c)-end(3,c)-1
+ if (last .eq. 0) then
+ do k=start(3,c),ksize
+ do i=start(1,c),isize
+c---------------------------------------------------------------------
+c U(jsize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c)
+ > - lhsc(m,n,i,jsize,k,c)*
+ > backsub_info(n,i,k,c)
+ enddo
+ enddo
+ enddo
+ enddo
+ endif
+ do k=start(3,c),ksize
+ do j=jsize-1,jstart,-1
+ do i=start(1,c),isize
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
+ > - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c)
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c performs guaussian elimination on this cell.
+c
+c assumes that unpacking routines for non-first cells
+c preload C' and rhs' from previous cell.
+c
+c assumed send happens outside this routine, but that
+c c'(JMAX) and rhs'(JMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'work_lhs_vec.h'
+
+ integer first,last,c
+ integer i,j,k,m,n,isize,ksize,jsize,jstart
+
+ jstart = 0
+ isize = cell_size(1,c)-end(1,c)-1
+ jsize = cell_size(2,c)-1
+ ksize = cell_size(3,c)-end(3,c)-1
+
+c---------------------------------------------------------------------
+c zero the left hand side for starters
+c set diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+ do i = 0, isize
+ do m = 1, 5
+ do n = 1, 5
+ lhsa(m,n,i,0) = 0.0d0
+ lhsb(m,n,i,0) = 0.0d0
+ lhsa(m,n,i,jsize) = 0.0d0
+ lhsb(m,n,i,jsize) = 0.0d0
+ enddo
+ lhsb(m,m,i,0) = 1.0d0
+ lhsb(m,m,i,jsize) = 1.0d0
+ enddo
+ enddo
+
+ do k=start(3,c),ksize
+
+c---------------------------------------------------------------------
+c This function computes the left hand side for the three y-factors
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Compute the indices for storing the tri-diagonal matrix;
+c determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+
+ do j = start(2,c)-1, cell_size(2,c)-end(2,c)
+ do i=start(1,c),isize
+
+ tmp1 = 1.0d0 / u(1,i,j,k,c)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ fjac(1,1,i,j) = 0.0d+00
+ fjac(1,2,i,j) = 0.0d+00
+ fjac(1,3,i,j) = 1.0d+00
+ fjac(1,4,i,j) = 0.0d+00
+ fjac(1,5,i,j) = 0.0d+00
+
+ fjac(2,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) )
+ > * tmp2
+ fjac(2,2,i,j) = u(3,i,j,k,c) * tmp1
+ fjac(2,3,i,j) = u(2,i,j,k,c) * tmp1
+ fjac(2,4,i,j) = 0.0d+00
+ fjac(2,5,i,j) = 0.0d+00
+
+ fjac(3,1,i,j) = - ( u(3,i,j,k,c)*u(3,i,j,k,c)*tmp2)
+ > + c2 * qs(i,j,k,c)
+ fjac(3,2,i,j) = - c2 * u(2,i,j,k,c) * tmp1
+ fjac(3,3,i,j) = ( 2.0d+00 - c2 )
+ > * u(3,i,j,k,c) * tmp1
+ fjac(3,4,i,j) = - c2 * u(4,i,j,k,c) * tmp1
+ fjac(3,5,i,j) = c2
+
+ fjac(4,1,i,j) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) )
+ > * tmp2
+ fjac(4,2,i,j) = 0.0d+00
+ fjac(4,3,i,j) = u(4,i,j,k,c) * tmp1
+ fjac(4,4,i,j) = u(3,i,j,k,c) * tmp1
+ fjac(4,5,i,j) = 0.0d+00
+
+ fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c)
+ > - c1 * u(5,i,j,k,c) * tmp1 )
+ > * u(3,i,j,k,c) * tmp1
+ fjac(5,2,i,j) = - c2 * u(2,i,j,k,c)*u(3,i,j,k,c)
+ > * tmp2
+ fjac(5,3,i,j) = c1 * u(5,i,j,k,c) * tmp1
+ > - c2 * ( qs(i,j,k,c)
+ > + u(3,i,j,k,c)*u(3,i,j,k,c) * tmp2 )
+ fjac(5,4,i,j) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) )
+ > * tmp2
+ fjac(5,5,i,j) = c1 * u(3,i,j,k,c) * tmp1
+
+ njac(1,1,i,j) = 0.0d+00
+ njac(1,2,i,j) = 0.0d+00
+ njac(1,3,i,j) = 0.0d+00
+ njac(1,4,i,j) = 0.0d+00
+ njac(1,5,i,j) = 0.0d+00
+
+ njac(2,1,i,j) = - c3c4 * tmp2 * u(2,i,j,k,c)
+ njac(2,2,i,j) = c3c4 * tmp1
+ njac(2,3,i,j) = 0.0d+00
+ njac(2,4,i,j) = 0.0d+00
+ njac(2,5,i,j) = 0.0d+00
+
+ njac(3,1,i,j) = - con43 * c3c4 * tmp2 * u(3,i,j,k,c)
+ njac(3,2,i,j) = 0.0d+00
+ njac(3,3,i,j) = con43 * c3c4 * tmp1
+ njac(3,4,i,j) = 0.0d+00
+ njac(3,5,i,j) = 0.0d+00
+
+ njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c)
+ njac(4,2,i,j) = 0.0d+00
+ njac(4,3,i,j) = 0.0d+00
+ njac(4,4,i,j) = c3c4 * tmp1
+ njac(4,5,i,j) = 0.0d+00
+
+ njac(5,1,i,j) = - ( c3c4
+ > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
+ > - ( con43 * c3c4
+ > - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
+ > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
+ > - c1345 * tmp2 * u(5,i,j,k,c)
+
+ njac(5,2,i,j) = ( c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c)
+ njac(5,3,i,j) = ( con43 * c3c4
+ > - c1345 ) * tmp2 * u(3,i,j,k,c)
+ njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
+ njac(5,5,i,j) = ( c1345 ) * tmp1
+
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c now joacobians set, so form left hand side in y direction
+c---------------------------------------------------------------------
+ do j = start(2,c), jsize-end(2,c)
+ do i=start(1,c),isize
+
+ tmp1 = dt * ty1
+ tmp2 = dt * ty2
+
+ lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i,j-1)
+ > - tmp1 * njac(1,1,i,j-1)
+ > - tmp1 * dy1
+ lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i,j-1)
+ > - tmp1 * njac(1,2,i,j-1)
+ lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i,j-1)
+ > - tmp1 * njac(1,3,i,j-1)
+ lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i,j-1)
+ > - tmp1 * njac(1,4,i,j-1)
+ lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i,j-1)
+ > - tmp1 * njac(1,5,i,j-1)
+
+ lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i,j-1)
+ > - tmp1 * njac(2,1,i,j-1)
+ lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i,j-1)
+ > - tmp1 * njac(2,2,i,j-1)
+ > - tmp1 * dy2
+ lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i,j-1)
+ > - tmp1 * njac(2,3,i,j-1)
+ lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i,j-1)
+ > - tmp1 * njac(2,4,i,j-1)
+ lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i,j-1)
+ > - tmp1 * njac(2,5,i,j-1)
+
+ lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i,j-1)
+ > - tmp1 * njac(3,1,i,j-1)
+ lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i,j-1)
+ > - tmp1 * njac(3,2,i,j-1)
+ lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i,j-1)
+ > - tmp1 * njac(3,3,i,j-1)
+ > - tmp1 * dy3
+ lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i,j-1)
+ > - tmp1 * njac(3,4,i,j-1)
+ lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i,j-1)
+ > - tmp1 * njac(3,5,i,j-1)
+
+ lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i,j-1)
+ > - tmp1 * njac(4,1,i,j-1)
+ lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i,j-1)
+ > - tmp1 * njac(4,2,i,j-1)
+ lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i,j-1)
+ > - tmp1 * njac(4,3,i,j-1)
+ lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i,j-1)
+ > - tmp1 * njac(4,4,i,j-1)
+ > - tmp1 * dy4
+ lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i,j-1)
+ > - tmp1 * njac(4,5,i,j-1)
+
+ lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i,j-1)
+ > - tmp1 * njac(5,1,i,j-1)
+ lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i,j-1)
+ > - tmp1 * njac(5,2,i,j-1)
+ lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i,j-1)
+ > - tmp1 * njac(5,3,i,j-1)
+ lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i,j-1)
+ > - tmp1 * njac(5,4,i,j-1)
+ lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i,j-1)
+ > - tmp1 * njac(5,5,i,j-1)
+ > - tmp1 * dy5
+
+ lhsb(1,1,i,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(1,1,i,j)
+ > + tmp1 * 2.0d+00 * dy1
+ lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j)
+ lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j)
+ lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j)
+ lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j)
+
+ lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j)
+ lhsb(2,2,i,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(2,2,i,j)
+ > + tmp1 * 2.0d+00 * dy2
+ lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j)
+ lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j)
+ lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j)
+
+ lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j)
+ lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j)
+ lhsb(3,3,i,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(3,3,i,j)
+ > + tmp1 * 2.0d+00 * dy3
+ lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j)
+ lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j)
+
+ lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j)
+ lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j)
+ lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j)
+ lhsb(4,4,i,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(4,4,i,j)
+ > + tmp1 * 2.0d+00 * dy4
+ lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j)
+
+ lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j)
+ lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j)
+ lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j)
+ lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j)
+ lhsb(5,5,i,j) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(5,5,i,j)
+ > + tmp1 * 2.0d+00 * dy5
+
+ lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i,j+1)
+ > - tmp1 * njac(1,1,i,j+1)
+ > - tmp1 * dy1
+ lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i,j+1)
+ > - tmp1 * njac(1,2,i,j+1)
+ lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i,j+1)
+ > - tmp1 * njac(1,3,i,j+1)
+ lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i,j+1)
+ > - tmp1 * njac(1,4,i,j+1)
+ lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i,j+1)
+ > - tmp1 * njac(1,5,i,j+1)
+
+ lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i,j+1)
+ > - tmp1 * njac(2,1,i,j+1)
+ lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i,j+1)
+ > - tmp1 * njac(2,2,i,j+1)
+ > - tmp1 * dy2
+ lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i,j+1)
+ > - tmp1 * njac(2,3,i,j+1)
+ lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i,j+1)
+ > - tmp1 * njac(2,4,i,j+1)
+ lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i,j+1)
+ > - tmp1 * njac(2,5,i,j+1)
+
+ lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i,j+1)
+ > - tmp1 * njac(3,1,i,j+1)
+ lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i,j+1)
+ > - tmp1 * njac(3,2,i,j+1)
+ lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i,j+1)
+ > - tmp1 * njac(3,3,i,j+1)
+ > - tmp1 * dy3
+ lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i,j+1)
+ > - tmp1 * njac(3,4,i,j+1)
+ lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i,j+1)
+ > - tmp1 * njac(3,5,i,j+1)
+
+ lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i,j+1)
+ > - tmp1 * njac(4,1,i,j+1)
+ lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i,j+1)
+ > - tmp1 * njac(4,2,i,j+1)
+ lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i,j+1)
+ > - tmp1 * njac(4,3,i,j+1)
+ lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i,j+1)
+ > - tmp1 * njac(4,4,i,j+1)
+ > - tmp1 * dy4
+ lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i,j+1)
+ > - tmp1 * njac(4,5,i,j+1)
+
+ lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i,j+1)
+ > - tmp1 * njac(5,1,i,j+1)
+ lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i,j+1)
+ > - tmp1 * njac(5,2,i,j+1)
+ lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i,j+1)
+ > - tmp1 * njac(5,3,i,j+1)
+ lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i,j+1)
+ > - tmp1 * njac(5,4,i,j+1)
+ lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i,j+1)
+ > - tmp1 * njac(5,5,i,j+1)
+ > - tmp1 * dy5
+
+ enddo
+ enddo
+
+
+c---------------------------------------------------------------------
+c outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+ if (first .eq. 1) then
+
+c---------------------------------------------------------------------
+c multiply c(i,jstart,k) by b_inverse and copy back to c
+c multiply rhs(jstart) by b_inverse(jstart) and copy to rhs
+c---------------------------------------------------------------------
+!dir$ ivdep
+ do i=start(1,c),isize
+ call binvcrhs( lhsb(1,1,i,jstart),
+ > lhsc(1,1,i,jstart,k,c),
+ > rhs(1,i,jstart,k,c) )
+ enddo
+
+ endif
+
+c---------------------------------------------------------------------
+c begin inner most do loop
+c do all the elements of the cell unless last
+c---------------------------------------------------------------------
+ do j=jstart+first,jsize-last
+!dir$ ivdep
+ do i=start(1,c),isize
+
+c---------------------------------------------------------------------
+c subtract A*lhs_vector(j-1) from lhs_vector(j)
+c
+c rhs(j) = rhs(j) - A*rhs(j-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,i,j),
+ > rhs(1,i,j-1,k,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c B(j) = B(j) - C(j-1)*A(j)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,i,j),
+ > lhsc(1,1,i,j-1,k,c),
+ > lhsb(1,1,i,j))
+
+c---------------------------------------------------------------------
+c multiply c(i,j,k) by b_inverse and copy back to c
+c multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs
+c---------------------------------------------------------------------
+ call binvcrhs( lhsb(1,1,i,j),
+ > lhsc(1,1,i,j,k,c),
+ > rhs(1,i,j,k,c) )
+
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c Now finish up special cases for last cell
+c---------------------------------------------------------------------
+ if (last .eq. 1) then
+
+!dir$ ivdep
+ do i=start(1,c),isize
+c---------------------------------------------------------------------
+c rhs(jsize) = rhs(jsize) - A*rhs(jsize-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,i,jsize),
+ > rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c))
+
+c---------------------------------------------------------------------
+c B(jsize) = B(jsize) - C(jsize-1)*A(jsize)
+c call matmul_sub(aa,i,jsize,k,c,
+c $ cc,i,jsize-1,k,c,bb,i,jsize,k,c)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,i,jsize),
+ > lhsc(1,1,i,jsize-1,k,c),
+ > lhsb(1,1,i,jsize))
+
+c---------------------------------------------------------------------
+c multiply rhs(jsize) by b_inverse(jsize) and copy to rhs
+c---------------------------------------------------------------------
+ call binvrhs( lhsb(1,1,i,jsize),
+ > rhs(1,i,jsize,k,c) )
+ enddo
+
+ endif
+ enddo
+
+
+ return
+ end
+
+
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Performs line solves in Z direction by first factoring
+c the block-tridiagonal matrix into an upper triangular matrix,
+c and then performing back substitution to solve for the unknow
+c vectors of each line.
+c
+c Make sure we treat elements zero to cell_size in the direction
+c of the sweep.
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer c, kstart, stage,
+ > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+ > isize,jsize,ksize,send_id
+
+ kstart = 0
+
+c---------------------------------------------------------------------
+c in our terminology stage is the number of the cell in the y-direction
+c i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+ do stage = 1,ncells
+ c = slice(3,stage)
+ isize = cell_size(1,c) - 1
+ jsize = cell_size(2,c) - 1
+ ksize = cell_size(3,c) - 1
+c---------------------------------------------------------------------
+c set last-cell flag
+c---------------------------------------------------------------------
+ if (stage .eq. ncells) then
+ last = 1
+ else
+ last = 0
+ endif
+
+ if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+ first = 1
+c call lhsz(c)
+ call z_solve_cell(first,last,c)
+ else
+c---------------------------------------------------------------------
+c Not the first cell of this line, so receive info from
+c processor working on preceeding cell
+c---------------------------------------------------------------------
+ first = 0
+ call z_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c overlap computations and communications
+c---------------------------------------------------------------------
+c call lhsz(c)
+c---------------------------------------------------------------------
+c wait for completion
+c---------------------------------------------------------------------
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c install C'(kstart+1) and rhs'(kstart+1) to be used in this cell
+c---------------------------------------------------------------------
+ call z_unpack_solve_info(c)
+ call z_solve_cell(first,last,c)
+ endif
+
+ if (last .eq. 0) call z_send_solve_info(send_id,c)
+ enddo
+
+c---------------------------------------------------------------------
+c now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+ do stage = ncells, 1, -1
+ c = slice(3,stage)
+ first = 0
+ last = 0
+ if (stage .eq. 1) first = 1
+ if (stage .eq. ncells) then
+ last = 1
+c---------------------------------------------------------------------
+c last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+ call z_backsubstitute(first, last,c)
+ else
+ call z_receive_backsub_info(recv_id,c)
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+ call z_unpack_backsub_info(c)
+ call z_backsubstitute(first,last,c)
+ endif
+ if (first .eq. 0) call z_send_backsub_info(send_id,c)
+ enddo
+
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_unpack_solve_info(c)
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack C'(-1) and rhs'(-1) for
+c all i and j
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer i,j,m,n,ptr,c,kstart
+
+ kstart = 0
+ ptr = 0
+ do j=0,JMAX-1
+ do i=0,IMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send C'(kend) and rhs'(kend) for
+c all i and j
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i,j,m,n,ksize,ptr,c,ip,jp
+ integer error,send_id,buffer_size
+
+ ksize = cell_size(3,c)-1
+ ip = cell_coord(1,c) - 1
+ jp = cell_coord(2,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c pack up buffer
+c---------------------------------------------------------------------
+ ptr = 0
+ do j=0,JMAX-1
+ do i=0,IMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,i,j,ksize,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c send buffer
+c---------------------------------------------------------------------
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, successor(3),
+ > BOTTOM+ip+jp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send U(jstart) for all i and j
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i,j,n,ptr,c,kstart,ip,jp
+ integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c Send element 0 to previous processor
+c---------------------------------------------------------------------
+ kstart = 0
+ ip = cell_coord(1,c)-1
+ jp = cell_coord(2,c)-1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ ptr = 0
+ do j=0,JMAX-1
+ do i=0,IMAX-1
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,i,j,kstart,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, predecessor(3),
+ > TOP+ip+jp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack U(ksize) for all i and j
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer i,j,n,ptr,c
+
+ ptr = 0
+ do j=0,JMAX-1
+ do i=0,IMAX-1
+ do n=1,BLOCK_SIZE
+ backsub_info(n,i,j,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer error,recv_id,ip,jp,c,buffer_size
+ ip = cell_coord(1,c) - 1
+ jp = cell_coord(2,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, successor(3),
+ > TOP+ip+jp*NCELLS, comm_solve,
+ > recv_id, error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer ip,jp,recv_id,error,c,buffer_size
+ ip = cell_coord(1,c) - 1
+ jp = cell_coord(2,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, predecessor(3),
+ > BOTTOM+ip+jp*NCELLS, comm_solve,
+ > recv_id, error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c back solve: if last cell, then generate U(ksize)=rhs(ksize)
+c else assume U(ksize) is loaded in un pack backsub_info
+c so just use it
+c after call u(kstart) will be sent to next cell
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer first, last, c, i, k
+ integer m,n,j,jsize,isize,ksize,kstart
+
+ kstart = 0
+ isize = cell_size(1,c)-end(1,c)-1
+ jsize = cell_size(2,c)-end(2,c)-1
+ ksize = cell_size(3,c)-1
+ if (last .eq. 0) then
+ do j=start(2,c),jsize
+ do i=start(1,c),isize
+c---------------------------------------------------------------------
+c U(jsize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c)
+ > - lhsc(m,n,i,j,ksize,c)*
+ > backsub_info(n,i,j,c)
+ enddo
+ enddo
+ enddo
+ enddo
+ endif
+ do k=ksize-1,kstart,-1
+ do j=start(2,c),jsize
+ do i=start(1,c),isize
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
+ > - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c)
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c performs guaussian elimination on this cell.
+c
+c assumes that unpacking routines for non-first cells
+c preload C' and rhs' from previous cell.
+c
+c assumed send happens outside this routine, but that
+c c'(KMAX) and rhs'(KMAX) will be sent to next cell.
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'work_lhs.h'
+
+ integer first,last,c
+ integer i,j,k,isize,ksize,jsize,kstart
+ double precision utmp(6,-2:KMAX+1)
+
+ kstart = 0
+ isize = cell_size(1,c)-end(1,c)-1
+ jsize = cell_size(2,c)-end(2,c)-1
+ ksize = cell_size(3,c)-1
+
+ call lhsabinit(lhsa, lhsb, ksize)
+
+ do j=start(2,c),jsize
+ do i=start(1,c),isize
+
+c---------------------------------------------------------------------
+c This function computes the left hand side for the three z-factors
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Compute the indices for storing the block-diagonal matrix;
+c determine c (labeled f) and s jacobians for cell c
+c---------------------------------------------------------------------
+ do k = start(3,c)-1, cell_size(3,c)-end(3,c)
+ utmp(1,k) = 1.0d0 / u(1,i,j,k,c)
+ utmp(2,k) = u(2,i,j,k,c)
+ utmp(3,k) = u(3,i,j,k,c)
+ utmp(4,k) = u(4,i,j,k,c)
+ utmp(5,k) = u(5,i,j,k,c)
+ utmp(6,k) = qs(i,j,k,c)
+ end do
+
+ do k = start(3,c)-1, cell_size(3,c)-end(3,c)
+
+ tmp1 = utmp(1,k)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ fjac(1,1,k) = 0.0d+00
+ fjac(1,2,k) = 0.0d+00
+ fjac(1,3,k) = 0.0d+00
+ fjac(1,4,k) = 1.0d+00
+ fjac(1,5,k) = 0.0d+00
+
+ fjac(2,1,k) = - ( utmp(2,k)*utmp(4,k) )
+ > * tmp2
+ fjac(2,2,k) = utmp(4,k) * tmp1
+ fjac(2,3,k) = 0.0d+00
+ fjac(2,4,k) = utmp(2,k) * tmp1
+ fjac(2,5,k) = 0.0d+00
+
+ fjac(3,1,k) = - ( utmp(3,k)*utmp(4,k) )
+ > * tmp2
+ fjac(3,2,k) = 0.0d+00
+ fjac(3,3,k) = utmp(4,k) * tmp1
+ fjac(3,4,k) = utmp(3,k) * tmp1
+ fjac(3,5,k) = 0.0d+00
+
+ fjac(4,1,k) = - (utmp(4,k)*utmp(4,k) * tmp2 )
+ > + c2 * utmp(6,k)
+ fjac(4,2,k) = - c2 * utmp(2,k) * tmp1
+ fjac(4,3,k) = - c2 * utmp(3,k) * tmp1
+ fjac(4,4,k) = ( 2.0d+00 - c2 )
+ > * utmp(4,k) * tmp1
+ fjac(4,5,k) = c2
+
+ fjac(5,1,k) = ( c2 * 2.0d0 * utmp(6,k)
+ > - c1 * ( utmp(5,k) * tmp1 ) )
+ > * ( utmp(4,k) * tmp1 )
+ fjac(5,2,k) = - c2 * ( utmp(2,k)*utmp(4,k) )
+ > * tmp2
+ fjac(5,3,k) = - c2 * ( utmp(3,k)*utmp(4,k) )
+ > * tmp2
+ fjac(5,4,k) = c1 * ( utmp(5,k) * tmp1 )
+ > - c2 * ( utmp(6,k)
+ > + utmp(4,k)*utmp(4,k) * tmp2 )
+ fjac(5,5,k) = c1 * utmp(4,k) * tmp1
+
+ njac(1,1,k) = 0.0d+00
+ njac(1,2,k) = 0.0d+00
+ njac(1,3,k) = 0.0d+00
+ njac(1,4,k) = 0.0d+00
+ njac(1,5,k) = 0.0d+00
+
+ njac(2,1,k) = - c3c4 * tmp2 * utmp(2,k)
+ njac(2,2,k) = c3c4 * tmp1
+ njac(2,3,k) = 0.0d+00
+ njac(2,4,k) = 0.0d+00
+ njac(2,5,k) = 0.0d+00
+
+ njac(3,1,k) = - c3c4 * tmp2 * utmp(3,k)
+ njac(3,2,k) = 0.0d+00
+ njac(3,3,k) = c3c4 * tmp1
+ njac(3,4,k) = 0.0d+00
+ njac(3,5,k) = 0.0d+00
+
+ njac(4,1,k) = - con43 * c3c4 * tmp2 * utmp(4,k)
+ njac(4,2,k) = 0.0d+00
+ njac(4,3,k) = 0.0d+00
+ njac(4,4,k) = con43 * c3 * c4 * tmp1
+ njac(4,5,k) = 0.0d+00
+
+ njac(5,1,k) = - ( c3c4
+ > - c1345 ) * tmp3 * (utmp(2,k)**2)
+ > - ( c3c4 - c1345 ) * tmp3 * (utmp(3,k)**2)
+ > - ( con43 * c3c4
+ > - c1345 ) * tmp3 * (utmp(4,k)**2)
+ > - c1345 * tmp2 * utmp(5,k)
+
+ njac(5,2,k) = ( c3c4 - c1345 ) * tmp2 * utmp(2,k)
+ njac(5,3,k) = ( c3c4 - c1345 ) * tmp2 * utmp(3,k)
+ njac(5,4,k) = ( con43 * c3c4
+ > - c1345 ) * tmp2 * utmp(4,k)
+ njac(5,5,k) = ( c1345 )* tmp1
+
+
+ enddo
+
+c---------------------------------------------------------------------
+c now joacobians set, so form left hand side in z direction
+c---------------------------------------------------------------------
+ do k = start(3,c), ksize-end(3,c)
+
+ tmp1 = dt * tz1
+ tmp2 = dt * tz2
+
+ lhsa(1,1,k) = - tmp2 * fjac(1,1,k-1)
+ > - tmp1 * njac(1,1,k-1)
+ > - tmp1 * dz1
+ lhsa(1,2,k) = - tmp2 * fjac(1,2,k-1)
+ > - tmp1 * njac(1,2,k-1)
+ lhsa(1,3,k) = - tmp2 * fjac(1,3,k-1)
+ > - tmp1 * njac(1,3,k-1)
+ lhsa(1,4,k) = - tmp2 * fjac(1,4,k-1)
+ > - tmp1 * njac(1,4,k-1)
+ lhsa(1,5,k) = - tmp2 * fjac(1,5,k-1)
+ > - tmp1 * njac(1,5,k-1)
+
+ lhsa(2,1,k) = - tmp2 * fjac(2,1,k-1)
+ > - tmp1 * njac(2,1,k-1)
+ lhsa(2,2,k) = - tmp2 * fjac(2,2,k-1)
+ > - tmp1 * njac(2,2,k-1)
+ > - tmp1 * dz2
+ lhsa(2,3,k) = - tmp2 * fjac(2,3,k-1)
+ > - tmp1 * njac(2,3,k-1)
+ lhsa(2,4,k) = - tmp2 * fjac(2,4,k-1)
+ > - tmp1 * njac(2,4,k-1)
+ lhsa(2,5,k) = - tmp2 * fjac(2,5,k-1)
+ > - tmp1 * njac(2,5,k-1)
+
+ lhsa(3,1,k) = - tmp2 * fjac(3,1,k-1)
+ > - tmp1 * njac(3,1,k-1)
+ lhsa(3,2,k) = - tmp2 * fjac(3,2,k-1)
+ > - tmp1 * njac(3,2,k-1)
+ lhsa(3,3,k) = - tmp2 * fjac(3,3,k-1)
+ > - tmp1 * njac(3,3,k-1)
+ > - tmp1 * dz3
+ lhsa(3,4,k) = - tmp2 * fjac(3,4,k-1)
+ > - tmp1 * njac(3,4,k-1)
+ lhsa(3,5,k) = - tmp2 * fjac(3,5,k-1)
+ > - tmp1 * njac(3,5,k-1)
+
+ lhsa(4,1,k) = - tmp2 * fjac(4,1,k-1)
+ > - tmp1 * njac(4,1,k-1)
+ lhsa(4,2,k) = - tmp2 * fjac(4,2,k-1)
+ > - tmp1 * njac(4,2,k-1)
+ lhsa(4,3,k) = - tmp2 * fjac(4,3,k-1)
+ > - tmp1 * njac(4,3,k-1)
+ lhsa(4,4,k) = - tmp2 * fjac(4,4,k-1)
+ > - tmp1 * njac(4,4,k-1)
+ > - tmp1 * dz4
+ lhsa(4,5,k) = - tmp2 * fjac(4,5,k-1)
+ > - tmp1 * njac(4,5,k-1)
+
+ lhsa(5,1,k) = - tmp2 * fjac(5,1,k-1)
+ > - tmp1 * njac(5,1,k-1)
+ lhsa(5,2,k) = - tmp2 * fjac(5,2,k-1)
+ > - tmp1 * njac(5,2,k-1)
+ lhsa(5,3,k) = - tmp2 * fjac(5,3,k-1)
+ > - tmp1 * njac(5,3,k-1)
+ lhsa(5,4,k) = - tmp2 * fjac(5,4,k-1)
+ > - tmp1 * njac(5,4,k-1)
+ lhsa(5,5,k) = - tmp2 * fjac(5,5,k-1)
+ > - tmp1 * njac(5,5,k-1)
+ > - tmp1 * dz5
+
+ lhsb(1,1,k) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(1,1,k)
+ > + tmp1 * 2.0d+00 * dz1
+ lhsb(1,2,k) = tmp1 * 2.0d+00 * njac(1,2,k)
+ lhsb(1,3,k) = tmp1 * 2.0d+00 * njac(1,3,k)
+ lhsb(1,4,k) = tmp1 * 2.0d+00 * njac(1,4,k)
+ lhsb(1,5,k) = tmp1 * 2.0d+00 * njac(1,5,k)
+
+ lhsb(2,1,k) = tmp1 * 2.0d+00 * njac(2,1,k)
+ lhsb(2,2,k) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(2,2,k)
+ > + tmp1 * 2.0d+00 * dz2
+ lhsb(2,3,k) = tmp1 * 2.0d+00 * njac(2,3,k)
+ lhsb(2,4,k) = tmp1 * 2.0d+00 * njac(2,4,k)
+ lhsb(2,5,k) = tmp1 * 2.0d+00 * njac(2,5,k)
+
+ lhsb(3,1,k) = tmp1 * 2.0d+00 * njac(3,1,k)
+ lhsb(3,2,k) = tmp1 * 2.0d+00 * njac(3,2,k)
+ lhsb(3,3,k) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(3,3,k)
+ > + tmp1 * 2.0d+00 * dz3
+ lhsb(3,4,k) = tmp1 * 2.0d+00 * njac(3,4,k)
+ lhsb(3,5,k) = tmp1 * 2.0d+00 * njac(3,5,k)
+
+ lhsb(4,1,k) = tmp1 * 2.0d+00 * njac(4,1,k)
+ lhsb(4,2,k) = tmp1 * 2.0d+00 * njac(4,2,k)
+ lhsb(4,3,k) = tmp1 * 2.0d+00 * njac(4,3,k)
+ lhsb(4,4,k) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(4,4,k)
+ > + tmp1 * 2.0d+00 * dz4
+ lhsb(4,5,k) = tmp1 * 2.0d+00 * njac(4,5,k)
+
+ lhsb(5,1,k) = tmp1 * 2.0d+00 * njac(5,1,k)
+ lhsb(5,2,k) = tmp1 * 2.0d+00 * njac(5,2,k)
+ lhsb(5,3,k) = tmp1 * 2.0d+00 * njac(5,3,k)
+ lhsb(5,4,k) = tmp1 * 2.0d+00 * njac(5,4,k)
+ lhsb(5,5,k) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(5,5,k)
+ > + tmp1 * 2.0d+00 * dz5
+
+ lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,k+1)
+ > - tmp1 * njac(1,1,k+1)
+ > - tmp1 * dz1
+ lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,k+1)
+ > - tmp1 * njac(1,2,k+1)
+ lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,k+1)
+ > - tmp1 * njac(1,3,k+1)
+ lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,k+1)
+ > - tmp1 * njac(1,4,k+1)
+ lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,k+1)
+ > - tmp1 * njac(1,5,k+1)
+
+ lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,k+1)
+ > - tmp1 * njac(2,1,k+1)
+ lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,k+1)
+ > - tmp1 * njac(2,2,k+1)
+ > - tmp1 * dz2
+ lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,k+1)
+ > - tmp1 * njac(2,3,k+1)
+ lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,k+1)
+ > - tmp1 * njac(2,4,k+1)
+ lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,k+1)
+ > - tmp1 * njac(2,5,k+1)
+
+ lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,k+1)
+ > - tmp1 * njac(3,1,k+1)
+ lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,k+1)
+ > - tmp1 * njac(3,2,k+1)
+ lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,k+1)
+ > - tmp1 * njac(3,3,k+1)
+ > - tmp1 * dz3
+ lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,k+1)
+ > - tmp1 * njac(3,4,k+1)
+ lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,k+1)
+ > - tmp1 * njac(3,5,k+1)
+
+ lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,k+1)
+ > - tmp1 * njac(4,1,k+1)
+ lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,k+1)
+ > - tmp1 * njac(4,2,k+1)
+ lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,k+1)
+ > - tmp1 * njac(4,3,k+1)
+ lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,k+1)
+ > - tmp1 * njac(4,4,k+1)
+ > - tmp1 * dz4
+ lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,k+1)
+ > - tmp1 * njac(4,5,k+1)
+
+ lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,k+1)
+ > - tmp1 * njac(5,1,k+1)
+ lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,k+1)
+ > - tmp1 * njac(5,2,k+1)
+ lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,k+1)
+ > - tmp1 * njac(5,3,k+1)
+ lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,k+1)
+ > - tmp1 * njac(5,4,k+1)
+ lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,k+1)
+ > - tmp1 * njac(5,5,k+1)
+ > - tmp1 * dz5
+
+ enddo
+
+
+c---------------------------------------------------------------------
+c outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+ if (first .eq. 1) then
+
+c---------------------------------------------------------------------
+c multiply c(i,j,kstart) by b_inverse and copy back to c
+c multiply rhs(kstart) by b_inverse(kstart) and copy to rhs
+c---------------------------------------------------------------------
+ call binvcrhs( lhsb(1,1,kstart),
+ > lhsc(1,1,i,j,kstart,c),
+ > rhs(1,i,j,kstart,c) )
+
+ endif
+
+c---------------------------------------------------------------------
+c begin inner most do loop
+c do all the elements of the cell unless last
+c---------------------------------------------------------------------
+ do k=kstart+first,ksize-last
+
+c---------------------------------------------------------------------
+c subtract A*lhs_vector(k-1) from lhs_vector(k)
+c
+c rhs(k) = rhs(k) - A*rhs(k-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,k),
+ > rhs(1,i,j,k-1,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c B(k) = B(k) - C(k-1)*A(k)
+c call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,k),
+ > lhsc(1,1,i,j,k-1,c),
+ > lhsb(1,1,k))
+
+c---------------------------------------------------------------------
+c multiply c(i,j,k) by b_inverse and copy back to c
+c multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs
+c---------------------------------------------------------------------
+ call binvcrhs( lhsb(1,1,k),
+ > lhsc(1,1,i,j,k,c),
+ > rhs(1,i,j,k,c) )
+
+ enddo
+
+c---------------------------------------------------------------------
+c Now finish up special cases for last cell
+c---------------------------------------------------------------------
+ if (last .eq. 1) then
+
+c---------------------------------------------------------------------
+c rhs(ksize) = rhs(ksize) - A*rhs(ksize-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,ksize),
+ > rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c))
+
+c---------------------------------------------------------------------
+c B(ksize) = B(ksize) - C(ksize-1)*A(ksize)
+c call matmul_sub(aa,i,j,ksize,c,
+c $ cc,i,j,ksize-1,c,bb,i,j,ksize,c)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,ksize),
+ > lhsc(1,1,i,j,ksize-1,c),
+ > lhsb(1,1,ksize))
+
+c---------------------------------------------------------------------
+c multiply rhs(ksize) by b_inverse(ksize) and copy to rhs
+c---------------------------------------------------------------------
+ call binvrhs( lhsb(1,1,ksize),
+ > rhs(1,i,j,ksize,c) )
+
+ endif
+ enddo
+ enddo
+
+
+ return
+ end
+
+
+
+
+
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Performs line solves in Z direction by first factoring
+c the block-tridiagonal matrix into an upper triangular matrix,
+c and then performing back substitution to solve for the unknow
+c vectors of each line.
+c
+c Make sure we treat elements zero to cell_size in the direction
+c of the sweep.
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer c, kstart, stage,
+ > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+ > isize,jsize,ksize,send_id
+
+ kstart = 0
+
+c---------------------------------------------------------------------
+c in our terminology stage is the number of the cell in the y-direct
+c i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+ do stage = 1,ncells
+ c = slice(3,stage)
+ isize = cell_size(1,c) - 1
+ jsize = cell_size(2,c) - 1
+ ksize = cell_size(3,c) - 1
+c---------------------------------------------------------------------
+c set last-cell flag
+c---------------------------------------------------------------------
+ if (stage .eq. ncells) then
+ last = 1
+ else
+ last = 0
+ endif
+
+ if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+ first = 1
+c call lhsz(c)
+ call z_solve_cell(first,last,c)
+ else
+c---------------------------------------------------------------------
+c Not the first cell of this line, so receive info from
+c processor working on preceeding cell
+c---------------------------------------------------------------------
+ first = 0
+ call z_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c overlap computations and communications
+c---------------------------------------------------------------------
+c call lhsz(c)
+c---------------------------------------------------------------------
+c wait for completion
+c---------------------------------------------------------------------
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c install C'(kstart+1) and rhs'(kstart+1) to be used in this cell
+c---------------------------------------------------------------------
+ call z_unpack_solve_info(c)
+ call z_solve_cell(first,last,c)
+ endif
+
+ if (last .eq. 0) call z_send_solve_info(send_id,c)
+ enddo
+
+c---------------------------------------------------------------------
+c now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+ do stage = ncells, 1, -1
+ c = slice(3,stage)
+ first = 0
+ last = 0
+ if (stage .eq. 1) first = 1
+ if (stage .eq. ncells) then
+ last = 1
+c---------------------------------------------------------------------
+c last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+ call z_backsubstitute(first, last,c)
+ else
+ call z_receive_backsub_info(recv_id,c)
+ call mpi_wait(send_id,r_status,error)
+ call mpi_wait(recv_id,r_status,error)
+ call z_unpack_backsub_info(c)
+ call z_backsubstitute(first,last,c)
+ endif
+ if (first .eq. 0) call z_send_backsub_info(send_id,c)
+ enddo
+
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_unpack_solve_info(c)
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack C'(-1) and rhs'(-1) for
+c all i and j
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer i,j,m,n,ptr,c,kstart
+
+ kstart = 0
+ ptr = 0
+ do j=0,JMAX-1
+ do i=0,IMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send C'(kend) and rhs'(kend) for
+c all i and j
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i,j,m,n,ksize,ptr,c,ip,jp
+ integer error,send_id,buffer_size
+
+ ksize = cell_size(3,c)-1
+ ip = cell_coord(1,c) - 1
+ jp = cell_coord(2,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c pack up buffer
+c---------------------------------------------------------------------
+ ptr = 0
+ do j=0,JMAX-1
+ do i=0,IMAX-1
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,i,j,ksize,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c send buffer
+c---------------------------------------------------------------------
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, successor(3),
+ > BOTTOM+ip+jp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c pack up and send U(jstart) for all i and j
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i,j,n,ptr,c,kstart,ip,jp
+ integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c Send element 0 to previous processor
+c---------------------------------------------------------------------
+ kstart = 0
+ ip = cell_coord(1,c)-1
+ jp = cell_coord(2,c)-1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ ptr = 0
+ do j=0,JMAX-1
+ do i=0,IMAX-1
+ do n=1,BLOCK_SIZE
+ in_buffer(ptr+n) = rhs(n,i,j,kstart,c)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ call mpi_isend(in_buffer, buffer_size,
+ > dp_type, predecessor(3),
+ > TOP+ip+jp*NCELLS, comm_solve,
+ > send_id,error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c unpack U(ksize) for all i and j
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer i,j,n,ptr,c
+
+ ptr = 0
+ do j=0,JMAX-1
+ do i=0,IMAX-1
+ do n=1,BLOCK_SIZE
+ backsub_info(n,i,j,c) = out_buffer(ptr+n)
+ enddo
+ ptr = ptr+BLOCK_SIZE
+ enddo
+ enddo
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer error,recv_id,ip,jp,c,buffer_size
+ ip = cell_coord(1,c) - 1
+ jp = cell_coord(2,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, successor(3),
+ > TOP+ip+jp*NCELLS, comm_solve,
+ > recv_id, error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c post mpi receives
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer ip,jp,recv_id,error,c,buffer_size
+ ip = cell_coord(1,c) - 1
+ jp = cell_coord(2,c) - 1
+ buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+ > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+ call mpi_irecv(out_buffer, buffer_size,
+ > dp_type, predecessor(3),
+ > BOTTOM+ip+jp*NCELLS, comm_solve,
+ > recv_id, error)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c back solve: if last cell, then generate U(ksize)=rhs(ksize)
+c else assume U(ksize) is loaded in un pack backsub_info
+c so just use it
+c after call u(kstart) will be sent to next cell
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer first, last, c, i, k
+ integer m,n,j,jsize,isize,ksize,kstart
+
+ kstart = 0
+ isize = cell_size(1,c)-end(1,c)-1
+ jsize = cell_size(2,c)-end(2,c)-1
+ ksize = cell_size(3,c)-1
+ if (last .eq. 0) then
+ do j=start(2,c),jsize
+ do i=start(1,c),isize
+c---------------------------------------------------------------------
+c U(jsize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c)
+ > - lhsc(m,n,i,j,ksize,c)*
+ > backsub_info(n,i,j,c)
+ enddo
+ enddo
+ enddo
+ enddo
+ endif
+ do k=ksize-1,kstart,-1
+ do j=start(2,c),jsize
+ do i=start(1,c),isize
+ do m=1,BLOCK_SIZE
+ do n=1,BLOCK_SIZE
+ rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
+ > - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c)
+ enddo
+ enddo
+ enddo
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c performs guaussian elimination on this cell.
+c
+c assumes that unpacking routines for non-first cells
+c preload C' and rhs' from previous cell.
+c
+c assumed send happens outside this routine, but that
+c c'(KMAX) and rhs'(KMAX) will be sent to next cell.
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'work_lhs_vec.h'
+
+ integer first,last,c
+ integer i,j,k,m,n,isize,ksize,jsize,kstart
+
+ kstart = 0
+ isize = cell_size(1,c)-end(1,c)-1
+ jsize = cell_size(2,c)-end(2,c)-1
+ ksize = cell_size(3,c)-1
+
+c---------------------------------------------------------------------
+c zero the left hand side for starters
+c set diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+ do i = 0, isize
+ do m = 1, 5
+ do n = 1, 5
+ lhsa(m,n,i,0) = 0.0d0
+ lhsb(m,n,i,0) = 0.0d0
+ lhsa(m,n,i,ksize) = 0.0d0
+ lhsb(m,n,i,ksize) = 0.0d0
+ enddo
+ lhsb(m,m,i,0) = 1.0d0
+ lhsb(m,m,i,ksize) = 1.0d0
+ enddo
+ enddo
+
+ do j=start(2,c),jsize
+
+c---------------------------------------------------------------------
+c This function computes the left hand side for the three z-factors
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Compute the indices for storing the block-diagonal matrix;
+c determine c (labeled f) and s jacobians for cell c
+c---------------------------------------------------------------------
+
+ do k = start(3,c)-1, cell_size(3,c)-end(3,c)
+ do i=start(1,c),isize
+
+ tmp1 = 1.0d0 / u(1,i,j,k,c)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ fjac(1,1,i,k) = 0.0d+00
+ fjac(1,2,i,k) = 0.0d+00
+ fjac(1,3,i,k) = 0.0d+00
+ fjac(1,4,i,k) = 1.0d+00
+ fjac(1,5,i,k) = 0.0d+00
+
+ fjac(2,1,i,k) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) )
+ > * tmp2
+ fjac(2,2,i,k) = u(4,i,j,k,c) * tmp1
+ fjac(2,3,i,k) = 0.0d+00
+ fjac(2,4,i,k) = u(2,i,j,k,c) * tmp1
+ fjac(2,5,i,k) = 0.0d+00
+
+ fjac(3,1,i,k) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) )
+ > * tmp2
+ fjac(3,2,i,k) = 0.0d+00
+ fjac(3,3,i,k) = u(4,i,j,k,c) * tmp1
+ fjac(3,4,i,k) = u(3,i,j,k,c) * tmp1
+ fjac(3,5,i,k) = 0.0d+00
+
+ fjac(4,1,i,k) = - (u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 )
+ > + c2 * qs(i,j,k,c)
+ fjac(4,2,i,k) = - c2 * u(2,i,j,k,c) * tmp1
+ fjac(4,3,i,k) = - c2 * u(3,i,j,k,c) * tmp1
+ fjac(4,4,i,k) = ( 2.0d+00 - c2 )
+ > * u(4,i,j,k,c) * tmp1
+ fjac(4,5,i,k) = c2
+
+ fjac(5,1,i,k) = ( c2 * 2.0d0 * qs(i,j,k,c)
+ > - c1 * ( u(5,i,j,k,c) * tmp1 ) )
+ > * ( u(4,i,j,k,c) * tmp1 )
+ fjac(5,2,i,k) = - c2 * ( u(2,i,j,k,c)*u(4,i,j,k,c) )
+ > * tmp2
+ fjac(5,3,i,k) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) )
+ > * tmp2
+ fjac(5,4,i,k) = c1 * ( u(5,i,j,k,c) * tmp1 )
+ > - c2 * ( qs(i,j,k,c)
+ > + u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 )
+ fjac(5,5,i,k) = c1 * u(4,i,j,k,c) * tmp1
+
+ njac(1,1,i,k) = 0.0d+00
+ njac(1,2,i,k) = 0.0d+00
+ njac(1,3,i,k) = 0.0d+00
+ njac(1,4,i,k) = 0.0d+00
+ njac(1,5,i,k) = 0.0d+00
+
+ njac(2,1,i,k) = - c3c4 * tmp2 * u(2,i,j,k,c)
+ njac(2,2,i,k) = c3c4 * tmp1
+ njac(2,3,i,k) = 0.0d+00
+ njac(2,4,i,k) = 0.0d+00
+ njac(2,5,i,k) = 0.0d+00
+
+ njac(3,1,i,k) = - c3c4 * tmp2 * u(3,i,j,k,c)
+ njac(3,2,i,k) = 0.0d+00
+ njac(3,3,i,k) = c3c4 * tmp1
+ njac(3,4,i,k) = 0.0d+00
+ njac(3,5,i,k) = 0.0d+00
+
+ njac(4,1,i,k) = - con43 * c3c4 * tmp2 * u(4,i,j,k,c)
+ njac(4,2,i,k) = 0.0d+00
+ njac(4,3,i,k) = 0.0d+00
+ njac(4,4,i,k) = con43 * c3 * c4 * tmp1
+ njac(4,5,i,k) = 0.0d+00
+
+ njac(5,1,i,k) = - ( c3c4
+ > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
+ > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
+ > - ( con43 * c3c4
+ > - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
+ > - c1345 * tmp2 * u(5,i,j,k,c)
+
+ njac(5,2,i,k) = ( c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c)
+ njac(5,3,i,k) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
+ njac(5,4,i,k) = ( con43 * c3c4
+ > - c1345 ) * tmp2 * u(4,i,j,k,c)
+ njac(5,5,i,k) = ( c1345 )* tmp1
+
+
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c now joacobians set, so form left hand side in z direction
+c---------------------------------------------------------------------
+ do k = start(3,c), ksize-end(3,c)
+ do i=start(1,c),isize
+
+ tmp1 = dt * tz1
+ tmp2 = dt * tz2
+
+ lhsa(1,1,i,k) = - tmp2 * fjac(1,1,i,k-1)
+ > - tmp1 * njac(1,1,i,k-1)
+ > - tmp1 * dz1
+ lhsa(1,2,i,k) = - tmp2 * fjac(1,2,i,k-1)
+ > - tmp1 * njac(1,2,i,k-1)
+ lhsa(1,3,i,k) = - tmp2 * fjac(1,3,i,k-1)
+ > - tmp1 * njac(1,3,i,k-1)
+ lhsa(1,4,i,k) = - tmp2 * fjac(1,4,i,k-1)
+ > - tmp1 * njac(1,4,i,k-1)
+ lhsa(1,5,i,k) = - tmp2 * fjac(1,5,i,k-1)
+ > - tmp1 * njac(1,5,i,k-1)
+
+ lhsa(2,1,i,k) = - tmp2 * fjac(2,1,i,k-1)
+ > - tmp1 * njac(2,1,i,k-1)
+ lhsa(2,2,i,k) = - tmp2 * fjac(2,2,i,k-1)
+ > - tmp1 * njac(2,2,i,k-1)
+ > - tmp1 * dz2
+ lhsa(2,3,i,k) = - tmp2 * fjac(2,3,i,k-1)
+ > - tmp1 * njac(2,3,i,k-1)
+ lhsa(2,4,i,k) = - tmp2 * fjac(2,4,i,k-1)
+ > - tmp1 * njac(2,4,i,k-1)
+ lhsa(2,5,i,k) = - tmp2 * fjac(2,5,i,k-1)
+ > - tmp1 * njac(2,5,i,k-1)
+
+ lhsa(3,1,i,k) = - tmp2 * fjac(3,1,i,k-1)
+ > - tmp1 * njac(3,1,i,k-1)
+ lhsa(3,2,i,k) = - tmp2 * fjac(3,2,i,k-1)
+ > - tmp1 * njac(3,2,i,k-1)
+ lhsa(3,3,i,k) = - tmp2 * fjac(3,3,i,k-1)
+ > - tmp1 * njac(3,3,i,k-1)
+ > - tmp1 * dz3
+ lhsa(3,4,i,k) = - tmp2 * fjac(3,4,i,k-1)
+ > - tmp1 * njac(3,4,i,k-1)
+ lhsa(3,5,i,k) = - tmp2 * fjac(3,5,i,k-1)
+ > - tmp1 * njac(3,5,i,k-1)
+
+ lhsa(4,1,i,k) = - tmp2 * fjac(4,1,i,k-1)
+ > - tmp1 * njac(4,1,i,k-1)
+ lhsa(4,2,i,k) = - tmp2 * fjac(4,2,i,k-1)
+ > - tmp1 * njac(4,2,i,k-1)
+ lhsa(4,3,i,k) = - tmp2 * fjac(4,3,i,k-1)
+ > - tmp1 * njac(4,3,i,k-1)
+ lhsa(4,4,i,k) = - tmp2 * fjac(4,4,i,k-1)
+ > - tmp1 * njac(4,4,i,k-1)
+ > - tmp1 * dz4
+ lhsa(4,5,i,k) = - tmp2 * fjac(4,5,i,k-1)
+ > - tmp1 * njac(4,5,i,k-1)
+
+ lhsa(5,1,i,k) = - tmp2 * fjac(5,1,i,k-1)
+ > - tmp1 * njac(5,1,i,k-1)
+ lhsa(5,2,i,k) = - tmp2 * fjac(5,2,i,k-1)
+ > - tmp1 * njac(5,2,i,k-1)
+ lhsa(5,3,i,k) = - tmp2 * fjac(5,3,i,k-1)
+ > - tmp1 * njac(5,3,i,k-1)
+ lhsa(5,4,i,k) = - tmp2 * fjac(5,4,i,k-1)
+ > - tmp1 * njac(5,4,i,k-1)
+ lhsa(5,5,i,k) = - tmp2 * fjac(5,5,i,k-1)
+ > - tmp1 * njac(5,5,i,k-1)
+ > - tmp1 * dz5
+
+ lhsb(1,1,i,k) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(1,1,i,k)
+ > + tmp1 * 2.0d+00 * dz1
+ lhsb(1,2,i,k) = tmp1 * 2.0d+00 * njac(1,2,i,k)
+ lhsb(1,3,i,k) = tmp1 * 2.0d+00 * njac(1,3,i,k)
+ lhsb(1,4,i,k) = tmp1 * 2.0d+00 * njac(1,4,i,k)
+ lhsb(1,5,i,k) = tmp1 * 2.0d+00 * njac(1,5,i,k)
+
+ lhsb(2,1,i,k) = tmp1 * 2.0d+00 * njac(2,1,i,k)
+ lhsb(2,2,i,k) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(2,2,i,k)
+ > + tmp1 * 2.0d+00 * dz2
+ lhsb(2,3,i,k) = tmp1 * 2.0d+00 * njac(2,3,i,k)
+ lhsb(2,4,i,k) = tmp1 * 2.0d+00 * njac(2,4,i,k)
+ lhsb(2,5,i,k) = tmp1 * 2.0d+00 * njac(2,5,i,k)
+
+ lhsb(3,1,i,k) = tmp1 * 2.0d+00 * njac(3,1,i,k)
+ lhsb(3,2,i,k) = tmp1 * 2.0d+00 * njac(3,2,i,k)
+ lhsb(3,3,i,k) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(3,3,i,k)
+ > + tmp1 * 2.0d+00 * dz3
+ lhsb(3,4,i,k) = tmp1 * 2.0d+00 * njac(3,4,i,k)
+ lhsb(3,5,i,k) = tmp1 * 2.0d+00 * njac(3,5,i,k)
+
+ lhsb(4,1,i,k) = tmp1 * 2.0d+00 * njac(4,1,i,k)
+ lhsb(4,2,i,k) = tmp1 * 2.0d+00 * njac(4,2,i,k)
+ lhsb(4,3,i,k) = tmp1 * 2.0d+00 * njac(4,3,i,k)
+ lhsb(4,4,i,k) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(4,4,i,k)
+ > + tmp1 * 2.0d+00 * dz4
+ lhsb(4,5,i,k) = tmp1 * 2.0d+00 * njac(4,5,i,k)
+
+ lhsb(5,1,i,k) = tmp1 * 2.0d+00 * njac(5,1,i,k)
+ lhsb(5,2,i,k) = tmp1 * 2.0d+00 * njac(5,2,i,k)
+ lhsb(5,3,i,k) = tmp1 * 2.0d+00 * njac(5,3,i,k)
+ lhsb(5,4,i,k) = tmp1 * 2.0d+00 * njac(5,4,i,k)
+ lhsb(5,5,i,k) = 1.0d+00
+ > + tmp1 * 2.0d+00 * njac(5,5,i,k)
+ > + tmp1 * 2.0d+00 * dz5
+
+ lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i,k+1)
+ > - tmp1 * njac(1,1,i,k+1)
+ > - tmp1 * dz1
+ lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i,k+1)
+ > - tmp1 * njac(1,2,i,k+1)
+ lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i,k+1)
+ > - tmp1 * njac(1,3,i,k+1)
+ lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i,k+1)
+ > - tmp1 * njac(1,4,i,k+1)
+ lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i,k+1)
+ > - tmp1 * njac(1,5,i,k+1)
+
+ lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i,k+1)
+ > - tmp1 * njac(2,1,i,k+1)
+ lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i,k+1)
+ > - tmp1 * njac(2,2,i,k+1)
+ > - tmp1 * dz2
+ lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i,k+1)
+ > - tmp1 * njac(2,3,i,k+1)
+ lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i,k+1)
+ > - tmp1 * njac(2,4,i,k+1)
+ lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i,k+1)
+ > - tmp1 * njac(2,5,i,k+1)
+
+ lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i,k+1)
+ > - tmp1 * njac(3,1,i,k+1)
+ lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i,k+1)
+ > - tmp1 * njac(3,2,i,k+1)
+ lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i,k+1)
+ > - tmp1 * njac(3,3,i,k+1)
+ > - tmp1 * dz3
+ lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i,k+1)
+ > - tmp1 * njac(3,4,i,k+1)
+ lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i,k+1)
+ > - tmp1 * njac(3,5,i,k+1)
+
+ lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i,k+1)
+ > - tmp1 * njac(4,1,i,k+1)
+ lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i,k+1)
+ > - tmp1 * njac(4,2,i,k+1)
+ lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i,k+1)
+ > - tmp1 * njac(4,3,i,k+1)
+ lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i,k+1)
+ > - tmp1 * njac(4,4,i,k+1)
+ > - tmp1 * dz4
+ lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i,k+1)
+ > - tmp1 * njac(4,5,i,k+1)
+
+ lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i,k+1)
+ > - tmp1 * njac(5,1,i,k+1)
+ lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i,k+1)
+ > - tmp1 * njac(5,2,i,k+1)
+ lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i,k+1)
+ > - tmp1 * njac(5,3,i,k+1)
+ lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i,k+1)
+ > - tmp1 * njac(5,4,i,k+1)
+ lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i,k+1)
+ > - tmp1 * njac(5,5,i,k+1)
+ > - tmp1 * dz5
+
+ enddo
+ enddo
+
+
+c---------------------------------------------------------------------
+c outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+ if (first .eq. 1) then
+
+c---------------------------------------------------------------------
+c multiply c(i,j,kstart) by b_inverse and copy back to c
+c multiply rhs(kstart) by b_inverse(kstart) and copy to rhs
+c---------------------------------------------------------------------
+!dir$ ivdep
+ do i=start(1,c),isize
+ call binvcrhs( lhsb(1,1,i,kstart),
+ > lhsc(1,1,i,j,kstart,c),
+ > rhs(1,i,j,kstart,c) )
+ enddo
+
+ endif
+
+c---------------------------------------------------------------------
+c begin inner most do loop
+c do all the elements of the cell unless last
+c---------------------------------------------------------------------
+ do k=kstart+first,ksize-last
+!dir$ ivdep
+ do i=start(1,c),isize
+
+c---------------------------------------------------------------------
+c subtract A*lhs_vector(k-1) from lhs_vector(k)
+c
+c rhs(k) = rhs(k) - A*rhs(k-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,i,k),
+ > rhs(1,i,j,k-1,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c B(k) = B(k) - C(k-1)*A(k)
+c call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,i,k),
+ > lhsc(1,1,i,j,k-1,c),
+ > lhsb(1,1,i,k))
+
+c---------------------------------------------------------------------
+c multiply c(i,j,k) by b_inverse and copy back to c
+c multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs
+c---------------------------------------------------------------------
+ call binvcrhs( lhsb(1,1,i,k),
+ > lhsc(1,1,i,j,k,c),
+ > rhs(1,i,j,k,c) )
+
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c Now finish up special cases for last cell
+c---------------------------------------------------------------------
+ if (last .eq. 1) then
+
+!dir$ ivdep
+ do i=start(1,c),isize
+c---------------------------------------------------------------------
+c rhs(ksize) = rhs(ksize) - A*rhs(ksize-1)
+c---------------------------------------------------------------------
+ call matvec_sub(lhsa(1,1,i,ksize),
+ > rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c))
+
+c---------------------------------------------------------------------
+c B(ksize) = B(ksize) - C(ksize-1)*A(ksize)
+c call matmul_sub(aa,i,j,ksize,c,
+c $ cc,i,j,ksize-1,c,bb,i,j,ksize,c)
+c---------------------------------------------------------------------
+ call matmul_sub(lhsa(1,1,i,ksize),
+ > lhsc(1,1,i,j,ksize-1,c),
+ > lhsb(1,1,i,ksize))
+
+c---------------------------------------------------------------------
+c multiply rhs(ksize) by b_inverse(ksize) and copy to rhs
+c---------------------------------------------------------------------
+ call binvrhs( lhsb(1,1,i,ksize),
+ > rhs(1,i,j,ksize,c) )
+ enddo
+
+ endif
+ enddo
+
+
+ return
+ end
+
+
+
+
+
+
--- /dev/null
+SHELL=/bin/sh
+BENCHMARK=cg
+BENCHMARKU=CG
+
+include ../config/make.def
+
+OBJS = cg.o ${COMMON}/print_results.o \
+ ${COMMON}/${RAND}.o ${COMMON}/timers.o
+
+include ../sys/make.common
+
+${PROGRAM}: config ${OBJS}
+ ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+
+cg.o: cg.f mpinpb.h npbparams.h
+ ${FCOMPILE} cg.f
+
+clean:
+ - rm -f *.o *~
+ - rm -f npbparams.h core
+
+
+
--- /dev/null
+!-------------------------------------------------------------------------!
+! !
+! N A S P A R A L L E L B E N C H M A R K S 3.3 !
+! !
+! C G !
+! !
+!-------------------------------------------------------------------------!
+! !
+! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
+! It is described in NAS Technical Reports 95-020 and 02-007 !
+! !
+! Permission to use, copy, distribute and modify this software !
+! for any purpose with or without fee is hereby granted. We !
+! request, however, that all derived work reference the NAS !
+! Parallel Benchmarks 3.3. This software is provided "as is" !
+! without express or implied warranty. !
+! !
+! Information on NPB 3.3, including the technical report, the !
+! original specifications, source code, results and information !
+! on how to submit new results, is available at: !
+! !
+! http://www.nas.nasa.gov/Software/NPB/ !
+! !
+! Send comments or suggestions to npb@nas.nasa.gov !
+! !
+! NAS Parallel Benchmarks Group !
+! NASA Ames Research Center !
+! Mail Stop: T27A-1 !
+! Moffett Field, CA 94035-1000 !
+! !
+! E-mail: npb@nas.nasa.gov !
+! Fax: (650) 604-3957 !
+! !
+!-------------------------------------------------------------------------!
+
+
+c---------------------------------------------------------------------
+c
+c Authors: M. Yarrow
+c C. Kuszmaul
+c R. F. Van der Wijngaart
+c H. Jin
+c
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ program cg
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+ implicit none
+
+ include 'mpinpb.h'
+ integer status(MPI_STATUS_SIZE), request, ierr
+
+ include 'npbparams.h'
+
+c---------------------------------------------------------------------
+c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows.
+c num_proc_cols and num_proc_cols are to be found in npbparams.h.
+c When num_procs is not square, then num_proc_cols must be = 2*num_proc_rows.
+c---------------------------------------------------------------------
+ integer num_procs
+ parameter( num_procs = num_proc_cols * num_proc_rows )
+
+
+
+c---------------------------------------------------------------------
+c Class specific parameters:
+c It appears here for reference only.
+c These are their values, however, this info is imported in the npbparams.h
+c include file, which is written by the sys/setparams.c program.
+c---------------------------------------------------------------------
+
+C----------
+C Class S:
+C----------
+CC parameter( na=1400,
+CC > nonzer=7,
+CC > shift=10.,
+CC > niter=15,
+CC > rcond=1.0d-1 )
+C----------
+C Class W:
+C----------
+CC parameter( na=7000,
+CC > nonzer=8,
+CC > shift=12.,
+CC > niter=15,
+CC > rcond=1.0d-1 )
+C----------
+C Class A:
+C----------
+CC parameter( na=14000,
+CC > nonzer=11,
+CC > shift=20.,
+CC > niter=15,
+CC > rcond=1.0d-1 )
+C----------
+C Class B:
+C----------
+CC parameter( na=75000,
+CC > nonzer=13,
+CC > shift=60.,
+CC > niter=75,
+CC > rcond=1.0d-1 )
+C----------
+C Class C:
+C----------
+CC parameter( na=150000,
+CC > nonzer=15,
+CC > shift=110.,
+CC > niter=75,
+CC > rcond=1.0d-1 )
+C----------
+C Class D:
+C----------
+CC parameter( na=1500000,
+CC > nonzer=21,
+CC > shift=500.,
+CC > niter=100,
+CC > rcond=1.0d-1 )
+C----------
+C Class E:
+C----------
+CC parameter( na=9000000,
+CC > nonzer=26,
+CC > shift=1500.,
+CC > niter=100,
+CC > rcond=1.0d-1 )
+
+
+
+ integer nz
+ parameter( nz = na*(nonzer+1)/num_procs*(nonzer+1)+nonzer
+ > + na*(nonzer+2+num_procs/256)/num_proc_cols )
+
+
+
+ common / partit_size / naa, nzz,
+ > npcols, nprows,
+ > proc_col, proc_row,
+ > firstrow,
+ > lastrow,
+ > firstcol,
+ > lastcol,
+ > exch_proc,
+ > exch_recv_length,
+ > send_start,
+ > send_len
+ integer naa, nzz,
+ > npcols, nprows,
+ > proc_col, proc_row,
+ > firstrow,
+ > lastrow,
+ > firstcol,
+ > lastcol,
+ > exch_proc,
+ > exch_recv_length,
+ > send_start,
+ > send_len
+
+
+ common / main_int_mem / colidx, rowstr,
+ > iv, arow, acol
+ integer colidx(nz), rowstr(na+1),
+ > iv(2*na+1), arow(nz), acol(nz)
+
+
+ common / main_flt_mem / v, aelt, a,
+ > x,
+ > z,
+ > p,
+ > q,
+ > r,
+ > w
+ double precision v(na+1), aelt(nz), a(nz),
+ > x(na/num_proc_rows+2),
+ > z(na/num_proc_rows+2),
+ > p(na/num_proc_rows+2),
+ > q(na/num_proc_rows+2),
+ > r(na/num_proc_rows+2),
+ > w(na/num_proc_rows+2)
+
+
+ common /urando/ amult, tran
+ double precision amult, tran
+
+
+
+ integer l2npcols
+ integer reduce_exch_proc(num_proc_cols)
+ integer reduce_send_starts(num_proc_cols)
+ integer reduce_send_lengths(num_proc_cols)
+ integer reduce_recv_starts(num_proc_cols)
+ integer reduce_recv_lengths(num_proc_cols)
+
+ integer i, j, k, it
+
+ double precision zeta, randlc
+ external randlc
+ double precision rnorm
+ double precision norm_temp1(2), norm_temp2(2)
+
+ double precision t, tmax, mflops
+ external timer_read
+ double precision timer_read
+ character class
+ logical verified
+ double precision zeta_verify_value, epsilon, err
+
+
+c---------------------------------------------------------------------
+c Set up mpi initialization and number of proc testing
+c---------------------------------------------------------------------
+ call initialize_mpi
+
+
+ if( na .eq. 1400 .and.
+ & nonzer .eq. 7 .and.
+ & niter .eq. 15 .and.
+ & shift .eq. 10.d0 ) then
+ class = 'S'
+ zeta_verify_value = 8.5971775078648d0
+ else if( na .eq. 7000 .and.
+ & nonzer .eq. 8 .and.
+ & niter .eq. 15 .and.
+ & shift .eq. 12.d0 ) then
+ class = 'W'
+ zeta_verify_value = 10.362595087124d0
+ else if( na .eq. 14000 .and.
+ & nonzer .eq. 11 .and.
+ & niter .eq. 15 .and.
+ & shift .eq. 20.d0 ) then
+ class = 'A'
+ zeta_verify_value = 17.130235054029d0
+ else if( na .eq. 75000 .and.
+ & nonzer .eq. 13 .and.
+ & niter .eq. 75 .and.
+ & shift .eq. 60.d0 ) then
+ class = 'B'
+ zeta_verify_value = 22.712745482631d0
+ else if( na .eq. 150000 .and.
+ & nonzer .eq. 15 .and.
+ & niter .eq. 75 .and.
+ & shift .eq. 110.d0 ) then
+ class = 'C'
+ zeta_verify_value = 28.973605592845d0
+ else if( na .eq. 1500000 .and.
+ & nonzer .eq. 21 .and.
+ & niter .eq. 100 .and.
+ & shift .eq. 500.d0 ) then
+ class = 'D'
+ zeta_verify_value = 52.514532105794d0
+ else if( na .eq. 9000000 .and.
+ & nonzer .eq. 26 .and.
+ & niter .eq. 100 .and.
+ & shift .eq. 1.5d3 ) then
+ class = 'E'
+ zeta_verify_value = 77.522164599383d0
+ else
+ class = 'U'
+ endif
+
+ if( me .eq. root )then
+ write( *,1000 )
+ write( *,1001 ) na
+ write( *,1002 ) niter
+ write( *,1003 ) nprocs
+ write( *,1004 ) nonzer
+ write( *,1005 ) shift
+ 1000 format(//,' NAS Parallel Benchmarks 3.3 -- CG Benchmark', /)
+ 1001 format(' Size: ', i10 )
+ 1002 format(' Iterations: ', i5 )
+ 1003 format(' Number of active processes: ', i5 )
+ 1004 format(' Number of nonzeroes per row: ', i8)
+ 1005 format(' Eigenvalue shift: ', e8.3)
+ endif
+
+ if (.not. convertdouble) then
+ dp_type = MPI_DOUBLE_PRECISION
+ else
+ dp_type = MPI_REAL
+ endif
+
+
+ naa = na
+ nzz = nz
+
+
+c---------------------------------------------------------------------
+c Set up processor info, such as whether sq num of procs, etc
+c---------------------------------------------------------------------
+ call setup_proc_info( num_procs,
+ > num_proc_rows,
+ > num_proc_cols )
+
+
+c---------------------------------------------------------------------
+c Set up partition's submatrix info: firstcol, lastcol, firstrow, lastrow
+c---------------------------------------------------------------------
+ call setup_submatrix_info( l2npcols,
+ > reduce_exch_proc,
+ > reduce_send_starts,
+ > reduce_send_lengths,
+ > reduce_recv_starts,
+ > reduce_recv_lengths )
+
+
+
+c---------------------------------------------------------------------
+c Inialize random number generator
+c---------------------------------------------------------------------
+ tran = 314159265.0D0
+ amult = 1220703125.0D0
+ zeta = randlc( tran, amult )
+
+c---------------------------------------------------------------------
+c Set up partition's sparse random matrix for given class size
+c---------------------------------------------------------------------
+ call makea(naa, nzz, a, colidx, rowstr, nonzer,
+ > firstrow, lastrow, firstcol, lastcol,
+ > rcond, arow, acol, aelt, v, iv, shift)
+
+
+
+c---------------------------------------------------------------------
+c Note: as a result of the above call to makea:
+c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
+c values of colidx which are col indexes go from firstcol --> lastcol
+c So:
+c Shift the col index vals from actual (firstcol --> lastcol )
+c to local, i.e., (1 --> lastcol-firstcol+1)
+c---------------------------------------------------------------------
+ do j=1,lastrow-firstrow+1
+ do k=rowstr(j),rowstr(j+1)-1
+ colidx(k) = colidx(k) - firstcol + 1
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c set starting vector to (1, 1, .... 1)
+c---------------------------------------------------------------------
+ do i = 1, na/num_proc_rows+1
+ x(i) = 1.0D0
+ enddo
+
+ zeta = 0.0d0
+
+c---------------------------------------------------------------------
+c---->
+c Do one iteration untimed to init all code and data page tables
+c----> (then reinit, start timing, to niter its)
+c---------------------------------------------------------------------
+ do it = 1, 1
+
+c---------------------------------------------------------------------
+c The call to the conjugate gradient routine:
+c---------------------------------------------------------------------
+ call conj_grad ( colidx,
+ > rowstr,
+ > x,
+ > z,
+ > a,
+ > p,
+ > q,
+ > r,
+ > w,
+ > rnorm,
+ > l2npcols,
+ > reduce_exch_proc,
+ > reduce_send_starts,
+ > reduce_send_lengths,
+ > reduce_recv_starts,
+ > reduce_recv_lengths )
+
+c---------------------------------------------------------------------
+c zeta = shift + 1/(x.z)
+c So, first: (x.z)
+c Also, find norm of z
+c So, first: (z.z)
+c---------------------------------------------------------------------
+ norm_temp1(1) = 0.0d0
+ norm_temp1(2) = 0.0d0
+ do j=1, lastcol-firstcol+1
+ norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
+ norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
+ enddo
+
+ do i = 1, l2npcols
+ call mpi_irecv( norm_temp2,
+ > 2,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > request,
+ > ierr )
+ call mpi_send( norm_temp1,
+ > 2,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > ierr )
+ call mpi_wait( request, status, ierr )
+
+ norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
+ norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
+ enddo
+
+ norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
+
+
+c---------------------------------------------------------------------
+c Normalize z to obtain x
+c---------------------------------------------------------------------
+ do j=1, lastcol-firstcol+1
+ x(j) = norm_temp1(2)*z(j)
+ enddo
+
+
+ enddo ! end of do one iteration untimed
+
+
+c---------------------------------------------------------------------
+c set starting vector to (1, 1, .... 1)
+c---------------------------------------------------------------------
+c
+c NOTE: a questionable limit on size: should this be na/num_proc_cols+1 ?
+c
+ do i = 1, na/num_proc_rows+1
+ x(i) = 1.0D0
+ enddo
+
+ zeta = 0.0d0
+
+c---------------------------------------------------------------------
+c Synchronize and start timing
+c---------------------------------------------------------------------
+ call mpi_barrier( mpi_comm_world,
+ > ierr )
+
+ call timer_clear( 1 )
+ call timer_start( 1 )
+
+c---------------------------------------------------------------------
+c---->
+c Main Iteration for inverse power method
+c---->
+c---------------------------------------------------------------------
+ do it = 1, niter
+
+c---------------------------------------------------------------------
+c The call to the conjugate gradient routine:
+c---------------------------------------------------------------------
+ call conj_grad ( colidx,
+ > rowstr,
+ > x,
+ > z,
+ > a,
+ > p,
+ > q,
+ > r,
+ > w,
+ > rnorm,
+ > l2npcols,
+ > reduce_exch_proc,
+ > reduce_send_starts,
+ > reduce_send_lengths,
+ > reduce_recv_starts,
+ > reduce_recv_lengths )
+
+
+c---------------------------------------------------------------------
+c zeta = shift + 1/(x.z)
+c So, first: (x.z)
+c Also, find norm of z
+c So, first: (z.z)
+c---------------------------------------------------------------------
+ norm_temp1(1) = 0.0d0
+ norm_temp1(2) = 0.0d0
+ do j=1, lastcol-firstcol+1
+ norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
+ norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
+ enddo
+
+ do i = 1, l2npcols
+ call mpi_irecv( norm_temp2,
+ > 2,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > request,
+ > ierr )
+ call mpi_send( norm_temp1,
+ > 2,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > ierr )
+ call mpi_wait( request, status, ierr )
+
+ norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
+ norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
+ enddo
+
+ norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
+
+
+ if( me .eq. root )then
+ zeta = shift + 1.0d0 / norm_temp1(1)
+ if( it .eq. 1 ) write( *,9000 )
+ write( *,9001 ) it, rnorm, zeta
+ endif
+ 9000 format( /,' iteration ||r|| zeta' )
+ 9001 format( 4x, i5, 7x, e20.14, f20.13 )
+
+c---------------------------------------------------------------------
+c Normalize z to obtain x
+c---------------------------------------------------------------------
+ do j=1, lastcol-firstcol+1
+ x(j) = norm_temp1(2)*z(j)
+ enddo
+
+
+ enddo ! end of main iter inv pow meth
+
+ call timer_stop( 1 )
+
+c---------------------------------------------------------------------
+c End of timed section
+c---------------------------------------------------------------------
+
+ t = timer_read( 1 )
+
+ call mpi_reduce( t,
+ > tmax,
+ > 1,
+ > dp_type,
+ > MPI_MAX,
+ > root,
+ > mpi_comm_world,
+ > ierr )
+
+ if( me .eq. root )then
+ write(*,100)
+ 100 format(' Benchmark completed ')
+
+ epsilon = 1.d-10
+ if (class .ne. 'U') then
+
+ err = abs( zeta - zeta_verify_value )/zeta_verify_value
+ if( err .le. epsilon ) then
+ verified = .TRUE.
+ write(*, 200)
+ write(*, 201) zeta
+ write(*, 202) err
+ 200 format(' VERIFICATION SUCCESSFUL ')
+ 201 format(' Zeta is ', E20.13)
+ 202 format(' Error is ', E20.13)
+ else
+ verified = .FALSE.
+ write(*, 300)
+ write(*, 301) zeta
+ write(*, 302) zeta_verify_value
+ 300 format(' VERIFICATION FAILED')
+ 301 format(' Zeta ', E20.13)
+ 302 format(' The correct zeta is ', E20.13)
+ endif
+ else
+ verified = .FALSE.
+ write (*, 400)
+ write (*, 401)
+ write (*, 201) zeta
+ 400 format(' Problem size unknown')
+ 401 format(' NO VERIFICATION PERFORMED')
+ endif
+
+
+ if( tmax .ne. 0. ) then
+ mflops = float( 2*niter*na )
+ & * ( 3.+float( nonzer*(nonzer+1) )
+ & + 25.*(5.+float( nonzer*(nonzer+1) ))
+ & + 3. ) / tmax / 1000000.0
+ else
+ mflops = 0.0
+ endif
+
+ call print_results('CG', class, na, 0, 0,
+ > niter, nnodes_compiled, nprocs, tmax,
+ > mflops, ' floating point',
+ > verified, npbversion, compiletime,
+ > cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+
+
+ endif
+
+
+ call mpi_finalize(ierr)
+
+
+
+ end ! end main
+
+
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ subroutine initialize_mpi
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+
+ integer ierr
+
+
+ call mpi_init( ierr )
+ call mpi_comm_rank( mpi_comm_world, me, ierr )
+ call mpi_comm_size( mpi_comm_world, nprocs, ierr )
+ root = 0
+
+
+ return
+ end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ subroutine setup_proc_info( num_procs,
+ > num_proc_rows,
+ > num_proc_cols )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+
+ common / partit_size / naa, nzz,
+ > npcols, nprows,
+ > proc_col, proc_row,
+ > firstrow,
+ > lastrow,
+ > firstcol,
+ > lastcol,
+ > exch_proc,
+ > exch_recv_length,
+ > send_start,
+ > send_len
+ integer naa, nzz,
+ > npcols, nprows,
+ > proc_col, proc_row,
+ > firstrow,
+ > lastrow,
+ > firstcol,
+ > lastcol,
+ > exch_proc,
+ > exch_recv_length,
+ > send_start,
+ > send_len
+
+ integer num_procs, num_proc_cols, num_proc_rows
+ integer i, ierr
+ integer log2nprocs
+
+c---------------------------------------------------------------------
+c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows
+c When num_procs is not square, then num_proc_cols = 2*num_proc_rows
+c---------------------------------------------------------------------
+c First, number of procs must be power of two.
+c---------------------------------------------------------------------
+ if( nprocs .ne. num_procs )then
+ if( me .eq. root ) write( *,9000 ) nprocs, num_procs
+ 9000 format( /,'Error: ',/,'num of procs allocated (',
+ > i4, ' )',
+ > /,'is not equal to',/,
+ > 'compiled number of procs (',
+ > i4, ' )',/ )
+ call mpi_finalize(ierr)
+ stop
+ endif
+
+
+ i = num_proc_cols
+ 100 continue
+ if( i .ne. 1 .and. i/2*2 .ne. i )then
+ if ( me .eq. root ) then
+ write( *,* ) 'Error: num_proc_cols is ',
+ > num_proc_cols,
+ > ' which is not a power of two'
+ endif
+ call mpi_finalize(ierr)
+ stop
+ endif
+ i = i / 2
+ if( i .ne. 0 )then
+ goto 100
+ endif
+
+ i = num_proc_rows
+ 200 continue
+ if( i .ne. 1 .and. i/2*2 .ne. i )then
+ if ( me .eq. root ) then
+ write( *,* ) 'Error: num_proc_rows is ',
+ > num_proc_rows,
+ > ' which is not a power of two'
+ endif
+ call mpi_finalize(ierr)
+ stop
+ endif
+ i = i / 2
+ if( i .ne. 0 )then
+ goto 200
+ endif
+
+ log2nprocs = 0
+ i = nprocs
+ 300 continue
+ if( i .ne. 1 .and. i/2*2 .ne. i )then
+ write( *,* ) 'Error: nprocs is ',
+ > nprocs,
+ > ' which is not a power of two'
+ call mpi_finalize(ierr)
+ stop
+ endif
+ i = i / 2
+ if( i .ne. 0 )then
+ log2nprocs = log2nprocs + 1
+ goto 300
+ endif
+
+CC write( *,* ) 'nprocs, log2nprocs: ',nprocs,log2nprocs
+
+
+ npcols = num_proc_cols
+ nprows = num_proc_rows
+
+
+ return
+ end
+
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ subroutine setup_submatrix_info( l2npcols,
+ > reduce_exch_proc,
+ > reduce_send_starts,
+ > reduce_send_lengths,
+ > reduce_recv_starts,
+ > reduce_recv_lengths )
+ >
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+
+ integer col_size, row_size
+
+ common / partit_size / naa, nzz,
+ > npcols, nprows,
+ > proc_col, proc_row,
+ > firstrow,
+ > lastrow,
+ > firstcol,
+ > lastcol,
+ > exch_proc,
+ > exch_recv_length,
+ > send_start,
+ > send_len
+ integer naa, nzz,
+ > npcols, nprows,
+ > proc_col, proc_row,
+ > firstrow,
+ > lastrow,
+ > firstcol,
+ > lastcol,
+ > exch_proc,
+ > exch_recv_length,
+ > send_start,
+ > send_len
+
+ integer reduce_exch_proc(*)
+ integer reduce_send_starts(*)
+ integer reduce_send_lengths(*)
+ integer reduce_recv_starts(*)
+ integer reduce_recv_lengths(*)
+
+ integer i, j
+ integer div_factor
+ integer l2npcols
+
+
+ proc_row = me / npcols
+ proc_col = me - proc_row*npcols
+
+
+
+c---------------------------------------------------------------------
+c If naa evenly divisible by npcols, then it is evenly divisible
+c by nprows
+c---------------------------------------------------------------------
+
+ if( naa/npcols*npcols .eq. naa )then
+ col_size = naa/npcols
+ firstcol = proc_col*col_size + 1
+ lastcol = firstcol - 1 + col_size
+ row_size = naa/nprows
+ firstrow = proc_row*row_size + 1
+ lastrow = firstrow - 1 + row_size
+c---------------------------------------------------------------------
+c If naa not evenly divisible by npcols, then first subdivide for nprows
+c and then, if npcols not equal to nprows (i.e., not a sq number of procs),
+c get col subdivisions by dividing by 2 each row subdivision.
+c---------------------------------------------------------------------
+ else
+ if( proc_row .lt. naa - naa/nprows*nprows)then
+ row_size = naa/nprows+ 1
+ firstrow = proc_row*row_size + 1
+ lastrow = firstrow - 1 + row_size
+ else
+ row_size = naa/nprows
+ firstrow = (naa - naa/nprows*nprows)*(row_size+1)
+ > + (proc_row-(naa-naa/nprows*nprows))
+ > *row_size + 1
+ lastrow = firstrow - 1 + row_size
+ endif
+ if( npcols .eq. nprows )then
+ if( proc_col .lt. naa - naa/npcols*npcols )then
+ col_size = naa/npcols+ 1
+ firstcol = proc_col*col_size + 1
+ lastcol = firstcol - 1 + col_size
+ else
+ col_size = naa/npcols
+ firstcol = (naa - naa/npcols*npcols)*(col_size+1)
+ > + (proc_col-(naa-naa/npcols*npcols))
+ > *col_size + 1
+ lastcol = firstcol - 1 + col_size
+ endif
+ else
+ if( (proc_col/2) .lt.
+ > naa - naa/(npcols/2)*(npcols/2) )then
+ col_size = naa/(npcols/2) + 1
+ firstcol = (proc_col/2)*col_size + 1
+ lastcol = firstcol - 1 + col_size
+ else
+ col_size = naa/(npcols/2)
+ firstcol = (naa - naa/(npcols/2)*(npcols/2))
+ > *(col_size+1)
+ > + ((proc_col/2)-(naa-naa/(npcols/2)*(npcols/2)))
+ > *col_size + 1
+ lastcol = firstcol - 1 + col_size
+ endif
+CC write( *,* ) col_size,firstcol,lastcol
+ if( mod( me,2 ) .eq. 0 )then
+ lastcol = firstcol - 1 + (col_size-1)/2 + 1
+ else
+ firstcol = firstcol + (col_size-1)/2 + 1
+ lastcol = firstcol - 1 + col_size/2
+CC write( *,* ) firstcol,lastcol
+ endif
+ endif
+ endif
+
+
+
+ if( npcols .eq. nprows )then
+ send_start = 1
+ send_len = lastrow - firstrow + 1
+ else
+ if( mod( me,2 ) .eq. 0 )then
+ send_start = 1
+ send_len = (1 + lastrow-firstrow+1)/2
+ else
+ send_start = (1 + lastrow-firstrow+1)/2 + 1
+ send_len = (lastrow-firstrow+1)/2
+ endif
+ endif
+
+
+
+
+c---------------------------------------------------------------------
+c Transpose exchange processor
+c---------------------------------------------------------------------
+
+ if( npcols .eq. nprows )then
+ exch_proc = mod( me,nprows )*nprows + me/nprows
+ else
+ exch_proc = 2*(mod( me/2,nprows )*nprows + me/2/nprows)
+ > + mod( me,2 )
+ endif
+
+
+
+ i = npcols / 2
+ l2npcols = 0
+ do while( i .gt. 0 )
+ l2npcols = l2npcols + 1
+ i = i / 2
+ enddo
+
+
+c---------------------------------------------------------------------
+c Set up the reduce phase schedules...
+c---------------------------------------------------------------------
+
+ div_factor = npcols
+ do i = 1, l2npcols
+
+ j = mod( proc_col+div_factor/2, div_factor )
+ > + proc_col / div_factor * div_factor
+ reduce_exch_proc(i) = proc_row*npcols + j
+
+ div_factor = div_factor / 2
+
+ enddo
+
+
+ do i = l2npcols, 1, -1
+
+ if( nprows .eq. npcols )then
+ reduce_send_starts(i) = send_start
+ reduce_send_lengths(i) = send_len
+ reduce_recv_lengths(i) = lastrow - firstrow + 1
+ else
+ reduce_recv_lengths(i) = send_len
+ if( i .eq. l2npcols )then
+ reduce_send_lengths(i) = lastrow-firstrow+1 - send_len
+ if( me/2*2 .eq. me )then
+ reduce_send_starts(i) = send_start + send_len
+ else
+ reduce_send_starts(i) = 1
+ endif
+ else
+ reduce_send_lengths(i) = send_len
+ reduce_send_starts(i) = send_start
+ endif
+ endif
+ reduce_recv_starts(i) = send_start
+
+ enddo
+
+
+ exch_recv_length = lastcol - firstcol + 1
+
+
+ return
+ end
+
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ subroutine conj_grad ( colidx,
+ > rowstr,
+ > x,
+ > z,
+ > a,
+ > p,
+ > q,
+ > r,
+ > w,
+ > rnorm,
+ > l2npcols,
+ > reduce_exch_proc,
+ > reduce_send_starts,
+ > reduce_send_lengths,
+ > reduce_recv_starts,
+ > reduce_recv_lengths )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Floaging point arrays here are named as in NPB1 spec discussion of
+c CG algorithm
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+
+ integer status(MPI_STATUS_SIZE ), request
+
+
+ common / partit_size / naa, nzz,
+ > npcols, nprows,
+ > proc_col, proc_row,
+ > firstrow,
+ > lastrow,
+ > firstcol,
+ > lastcol,
+ > exch_proc,
+ > exch_recv_length,
+ > send_start,
+ > send_len
+ integer naa, nzz,
+ > npcols, nprows,
+ > proc_col, proc_row,
+ > firstrow,
+ > lastrow,
+ > firstcol,
+ > lastcol,
+ > exch_proc,
+ > exch_recv_length,
+ > send_start,
+ > send_len
+
+
+
+ double precision x(*),
+ > z(*),
+ > a(nzz)
+ integer colidx(nzz), rowstr(naa+1)
+
+ double precision p(*),
+ > q(*),
+ > r(*),
+ > w(*) ! used as work temporary
+
+ integer l2npcols
+ integer reduce_exch_proc(l2npcols)
+ integer reduce_send_starts(l2npcols)
+ integer reduce_send_lengths(l2npcols)
+ integer reduce_recv_starts(l2npcols)
+ integer reduce_recv_lengths(l2npcols)
+
+ integer i, j, k, ierr
+ integer cgit, cgitmax
+
+ double precision d, sum, rho, rho0, alpha, beta, rnorm
+
+ external timer_read
+ double precision timer_read
+
+ data cgitmax / 25 /
+
+
+c---------------------------------------------------------------------
+c Initialize the CG algorithm:
+c---------------------------------------------------------------------
+ do j=1,naa/nprows+1
+ q(j) = 0.0d0
+ z(j) = 0.0d0
+ r(j) = x(j)
+ p(j) = r(j)
+ w(j) = 0.0d0
+ enddo
+
+
+c---------------------------------------------------------------------
+c rho = r.r
+c Now, obtain the norm of r: First, sum squares of r elements locally...
+c---------------------------------------------------------------------
+ sum = 0.0d0
+ do j=1, lastcol-firstcol+1
+ sum = sum + r(j)*r(j)
+ enddo
+
+c---------------------------------------------------------------------
+c Exchange and sum with procs identified in reduce_exch_proc
+c (This is equivalent to mpi_allreduce.)
+c Sum the partial sums of rho, leaving rho on all processors
+c---------------------------------------------------------------------
+ do i = 1, l2npcols
+ call mpi_irecv( rho,
+ > 1,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > request,
+ > ierr )
+ call mpi_send( sum,
+ > 1,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > ierr )
+ call mpi_wait( request, status, ierr )
+
+ sum = sum + rho
+ enddo
+ rho = sum
+
+
+
+c---------------------------------------------------------------------
+c---->
+c The conj grad iteration loop
+c---->
+c---------------------------------------------------------------------
+ do cgit = 1, cgitmax
+
+
+c---------------------------------------------------------------------
+c q = A.p
+c The partition submatrix-vector multiply: use workspace w
+c---------------------------------------------------------------------
+ do j=1,lastrow-firstrow+1
+ sum = 0.d0
+ do k=rowstr(j),rowstr(j+1)-1
+ sum = sum + a(k)*p(colidx(k))
+ enddo
+ w(j) = sum
+ enddo
+
+c---------------------------------------------------------------------
+c Sum the partition submatrix-vec A.p's across rows
+c Exchange and sum piece of w with procs identified in reduce_exch_proc
+c---------------------------------------------------------------------
+ do i = l2npcols, 1, -1
+ call mpi_irecv( q(reduce_recv_starts(i)),
+ > reduce_recv_lengths(i),
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > request,
+ > ierr )
+ call mpi_send( w(reduce_send_starts(i)),
+ > reduce_send_lengths(i),
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > ierr )
+ call mpi_wait( request, status, ierr )
+ do j=send_start,send_start + reduce_recv_lengths(i) - 1
+ w(j) = w(j) + q(j)
+ enddo
+ enddo
+
+
+c---------------------------------------------------------------------
+c Exchange piece of q with transpose processor:
+c---------------------------------------------------------------------
+ if( l2npcols .ne. 0 )then
+ call mpi_irecv( q,
+ > exch_recv_length,
+ > dp_type,
+ > exch_proc,
+ > 1,
+ > mpi_comm_world,
+ > request,
+ > ierr )
+
+ call mpi_send( w(send_start),
+ > send_len,
+ > dp_type,
+ > exch_proc,
+ > 1,
+ > mpi_comm_world,
+ > ierr )
+ call mpi_wait( request, status, ierr )
+ else
+ do j=1,exch_recv_length
+ q(j) = w(j)
+ enddo
+ endif
+
+
+c---------------------------------------------------------------------
+c Clear w for reuse...
+c---------------------------------------------------------------------
+ do j=1, max( lastrow-firstrow+1, lastcol-firstcol+1 )
+ w(j) = 0.0d0
+ enddo
+
+
+c---------------------------------------------------------------------
+c Obtain p.q
+c---------------------------------------------------------------------
+ sum = 0.0d0
+ do j=1, lastcol-firstcol+1
+ sum = sum + p(j)*q(j)
+ enddo
+
+c---------------------------------------------------------------------
+c Obtain d with a sum-reduce
+c---------------------------------------------------------------------
+ do i = 1, l2npcols
+ call mpi_irecv( d,
+ > 1,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > request,
+ > ierr )
+ call mpi_send( sum,
+ > 1,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > ierr )
+
+ call mpi_wait( request, status, ierr )
+
+ sum = sum + d
+ enddo
+ d = sum
+
+
+c---------------------------------------------------------------------
+c Obtain alpha = rho / (p.q)
+c---------------------------------------------------------------------
+ alpha = rho / d
+
+c---------------------------------------------------------------------
+c Save a temporary of rho
+c---------------------------------------------------------------------
+ rho0 = rho
+
+c---------------------------------------------------------------------
+c Obtain z = z + alpha*p
+c and r = r - alpha*q
+c---------------------------------------------------------------------
+ do j=1, lastcol-firstcol+1
+ z(j) = z(j) + alpha*p(j)
+ r(j) = r(j) - alpha*q(j)
+ enddo
+
+c---------------------------------------------------------------------
+c rho = r.r
+c Now, obtain the norm of r: First, sum squares of r elements locally...
+c---------------------------------------------------------------------
+ sum = 0.0d0
+ do j=1, lastcol-firstcol+1
+ sum = sum + r(j)*r(j)
+ enddo
+
+c---------------------------------------------------------------------
+c Obtain rho with a sum-reduce
+c---------------------------------------------------------------------
+ do i = 1, l2npcols
+ call mpi_irecv( rho,
+ > 1,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > request,
+ > ierr )
+ call mpi_send( sum,
+ > 1,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > ierr )
+ call mpi_wait( request, status, ierr )
+
+ sum = sum + rho
+ enddo
+ rho = sum
+
+c---------------------------------------------------------------------
+c Obtain beta:
+c---------------------------------------------------------------------
+ beta = rho / rho0
+
+c---------------------------------------------------------------------
+c p = r + beta*p
+c---------------------------------------------------------------------
+ do j=1, lastcol-firstcol+1
+ p(j) = r(j) + beta*p(j)
+ enddo
+
+
+
+ enddo ! end of do cgit=1,cgitmax
+
+
+
+c---------------------------------------------------------------------
+c Compute residual norm explicitly: ||r|| = ||x - A.z||
+c First, form A.z
+c The partition submatrix-vector multiply
+c---------------------------------------------------------------------
+ do j=1,lastrow-firstrow+1
+ sum = 0.d0
+ do k=rowstr(j),rowstr(j+1)-1
+ sum = sum + a(k)*z(colidx(k))
+ enddo
+ w(j) = sum
+ enddo
+
+
+
+c---------------------------------------------------------------------
+c Sum the partition submatrix-vec A.z's across rows
+c---------------------------------------------------------------------
+ do i = l2npcols, 1, -1
+ call mpi_irecv( r(reduce_recv_starts(i)),
+ > reduce_recv_lengths(i),
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > request,
+ > ierr )
+ call mpi_send( w(reduce_send_starts(i)),
+ > reduce_send_lengths(i),
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > ierr )
+ call mpi_wait( request, status, ierr )
+
+ do j=send_start,send_start + reduce_recv_lengths(i) - 1
+ w(j) = w(j) + r(j)
+ enddo
+ enddo
+
+
+c---------------------------------------------------------------------
+c Exchange piece of q with transpose processor:
+c---------------------------------------------------------------------
+ if( l2npcols .ne. 0 )then
+ call mpi_irecv( r,
+ > exch_recv_length,
+ > dp_type,
+ > exch_proc,
+ > 1,
+ > mpi_comm_world,
+ > request,
+ > ierr )
+
+ call mpi_send( w(send_start),
+ > send_len,
+ > dp_type,
+ > exch_proc,
+ > 1,
+ > mpi_comm_world,
+ > ierr )
+ call mpi_wait( request, status, ierr )
+ else
+ do j=1,exch_recv_length
+ r(j) = w(j)
+ enddo
+ endif
+
+
+c---------------------------------------------------------------------
+c At this point, r contains A.z
+c---------------------------------------------------------------------
+ sum = 0.0d0
+ do j=1, lastcol-firstcol+1
+ d = x(j) - r(j)
+ sum = sum + d*d
+ enddo
+
+c---------------------------------------------------------------------
+c Obtain d with a sum-reduce
+c---------------------------------------------------------------------
+ do i = 1, l2npcols
+ call mpi_irecv( d,
+ > 1,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > request,
+ > ierr )
+ call mpi_send( sum,
+ > 1,
+ > dp_type,
+ > reduce_exch_proc(i),
+ > i,
+ > mpi_comm_world,
+ > ierr )
+ call mpi_wait( request, status, ierr )
+
+ sum = sum + d
+ enddo
+ d = sum
+
+
+ if( me .eq. root ) rnorm = sqrt( d )
+
+
+
+ return
+ end ! end of routine conj_grad
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ subroutine makea( n, nz, a, colidx, rowstr, nonzer,
+ > firstrow, lastrow, firstcol, lastcol,
+ > rcond, arow, acol, aelt, v, iv, shift )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ integer n, nz
+ integer firstrow, lastrow, firstcol, lastcol
+ integer colidx(nz), rowstr(n+1)
+ integer iv(2*n+1), arow(nz), acol(nz)
+ double precision v(n+1), aelt(nz)
+ double precision rcond, a(nz), shift
+
+c---------------------------------------------------------------------
+c generate the test problem for benchmark 6
+c makea generates a sparse matrix with a
+c prescribed sparsity distribution
+c
+c parameter type usage
+c
+c input
+c
+c n i number of cols/rows of matrix
+c nz i nonzeros as declared array size
+c rcond r*8 condition number
+c shift r*8 main diagonal shift
+c
+c output
+c
+c a r*8 array for nonzeros
+c colidx i col indices
+c rowstr i row pointers
+c
+c workspace
+c
+c iv, arow, acol i
+c v, aelt r*8
+c---------------------------------------------------------------------
+
+ integer i, nnza, iouter, ivelt, ivelt1, irow, nzv, NONZER
+
+c---------------------------------------------------------------------
+c nonzer is approximately (int(sqrt(nnza /n)));
+c---------------------------------------------------------------------
+
+ double precision size, ratio, scale
+ external sparse, sprnvc, vecset
+
+ size = 1.0D0
+ ratio = rcond ** (1.0D0 / dfloat(n))
+ nnza = 0
+
+c---------------------------------------------------------------------
+c Initialize iv(n+1 .. 2n) to zero.
+c Used by sprnvc to mark nonzero positions
+c---------------------------------------------------------------------
+
+ do i = 1, n
+ iv(n+i) = 0
+ enddo
+ do iouter = 1, n
+ nzv = nonzer
+ call sprnvc( n, nzv, v, colidx, iv(1), iv(n+1) )
+ call vecset( n, v, colidx, nzv, iouter, .5D0 )
+ do ivelt = 1, nzv
+ jcol = colidx(ivelt)
+ if (jcol.ge.firstcol .and. jcol.le.lastcol) then
+ scale = size * v(ivelt)
+ do ivelt1 = 1, nzv
+ irow = colidx(ivelt1)
+ if (irow.ge.firstrow .and. irow.le.lastrow) then
+ nnza = nnza + 1
+ if (nnza .gt. nz) goto 9999
+ acol(nnza) = jcol
+ arow(nnza) = irow
+ aelt(nnza) = v(ivelt1) * scale
+ endif
+ enddo
+ endif
+ enddo
+ size = size * ratio
+ enddo
+
+
+c---------------------------------------------------------------------
+c ... add the identity * rcond to the generated matrix to bound
+c the smallest eigenvalue from below by rcond
+c---------------------------------------------------------------------
+ do i = firstrow, lastrow
+ if (i.ge.firstcol .and. i.le.lastcol) then
+ iouter = n + i
+ nnza = nnza + 1
+ if (nnza .gt. nz) goto 9999
+ acol(nnza) = i
+ arow(nnza) = i
+ aelt(nnza) = rcond - shift
+ endif
+ enddo
+
+
+c---------------------------------------------------------------------
+c ... make the sparse matrix from list of elements with duplicates
+c (v and iv are used as workspace)
+c---------------------------------------------------------------------
+ call sparse( a, colidx, rowstr, n, arow, acol, aelt,
+ > firstrow, lastrow,
+ > v, iv(1), iv(n+1), nnza )
+ return
+
+ 9999 continue
+ write(*,*) 'Space for matrix elements exceeded in makea'
+ write(*,*) 'nnza, nzmax = ',nnza, nz
+ write(*,*) ' iouter = ',iouter
+
+ stop
+ end
+c-------end of makea------------------------------
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ subroutine sparse( a, colidx, rowstr, n, arow, acol, aelt,
+ > firstrow, lastrow,
+ > x, mark, nzloc, nnza )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit logical (a-z)
+ integer colidx(*), rowstr(*)
+ integer firstrow, lastrow
+ integer n, arow(*), acol(*), nnza
+ double precision a(*), aelt(*)
+
+c---------------------------------------------------------------------
+c rows range from firstrow to lastrow
+c the rowstr pointers are defined for nrows = lastrow-firstrow+1 values
+c---------------------------------------------------------------------
+ integer nzloc(n), nrows
+ double precision x(n)
+ logical mark(n)
+
+c---------------------------------------------------
+c generate a sparse matrix from a list of
+c [col, row, element] tri
+c---------------------------------------------------
+
+ integer i, j, jajp1, nza, k, nzrow
+ double precision xi
+
+c---------------------------------------------------------------------
+c how many rows of result
+c---------------------------------------------------------------------
+ nrows = lastrow - firstrow + 1
+
+c---------------------------------------------------------------------
+c ...count the number of triples in each row
+c---------------------------------------------------------------------
+ do j = 1, n
+ rowstr(j) = 0
+ mark(j) = .false.
+ enddo
+ rowstr(n+1) = 0
+
+ do nza = 1, nnza
+ j = (arow(nza) - firstrow + 1) + 1
+ rowstr(j) = rowstr(j) + 1
+ enddo
+
+ rowstr(1) = 1
+ do j = 2, nrows+1
+ rowstr(j) = rowstr(j) + rowstr(j-1)
+ enddo
+
+
+c---------------------------------------------------------------------
+c ... rowstr(j) now is the location of the first nonzero
+c of row j of a
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c ... do a bucket sort of the triples on the row index
+c---------------------------------------------------------------------
+ do nza = 1, nnza
+ j = arow(nza) - firstrow + 1
+ k = rowstr(j)
+ a(k) = aelt(nza)
+ colidx(k) = acol(nza)
+ rowstr(j) = rowstr(j) + 1
+ enddo
+
+
+c---------------------------------------------------------------------
+c ... rowstr(j) now points to the first element of row j+1
+c---------------------------------------------------------------------
+ do j = nrows, 1, -1
+ rowstr(j+1) = rowstr(j)
+ enddo
+ rowstr(1) = 1
+
+
+c---------------------------------------------------------------------
+c ... generate the actual output rows by adding elements
+c---------------------------------------------------------------------
+ nza = 0
+ do i = 1, n
+ x(i) = 0.0
+ mark(i) = .false.
+ enddo
+
+ jajp1 = rowstr(1)
+ do j = 1, nrows
+ nzrow = 0
+
+c---------------------------------------------------------------------
+c ...loop over the jth row of a
+c---------------------------------------------------------------------
+ do k = jajp1 , rowstr(j+1)-1
+ i = colidx(k)
+ x(i) = x(i) + a(k)
+ if ( (.not. mark(i)) .and. (x(i) .ne. 0.D0)) then
+ mark(i) = .true.
+ nzrow = nzrow + 1
+ nzloc(nzrow) = i
+ endif
+ enddo
+
+c---------------------------------------------------------------------
+c ... extract the nonzeros of this row
+c---------------------------------------------------------------------
+ do k = 1, nzrow
+ i = nzloc(k)
+ mark(i) = .false.
+ xi = x(i)
+ x(i) = 0.D0
+ if (xi .ne. 0.D0) then
+ nza = nza + 1
+ a(nza) = xi
+ colidx(nza) = i
+ endif
+ enddo
+ jajp1 = rowstr(j+1)
+ rowstr(j+1) = nza + rowstr(1)
+ enddo
+CC write (*, 11000) nza
+ return
+11000 format ( //,'final nonzero count in sparse ',
+ 1 /,'number of nonzeros = ', i16 )
+ end
+c-------end of sparse-----------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ subroutine sprnvc( n, nz, v, iv, nzloc, mark )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit logical (a-z)
+ double precision v(*)
+ integer n, nz, iv(*), nzloc(n), nn1
+ integer mark(n)
+ common /urando/ amult, tran
+ double precision amult, tran
+
+
+c---------------------------------------------------------------------
+c generate a sparse n-vector (v, iv)
+c having nzv nonzeros
+c
+c mark(i) is set to 1 if position i is nonzero.
+c mark is all zero on entry and is reset to all zero before exit
+c this corrects a performance bug found by John G. Lewis, caused by
+c reinitialization of mark on every one of the n calls to sprnvc
+c---------------------------------------------------------------------
+
+ integer nzrow, nzv, ii, i, icnvrt
+
+ external randlc, icnvrt
+ double precision randlc, vecelt, vecloc
+
+
+ nzv = 0
+ nzrow = 0
+ nn1 = 1
+ 50 continue
+ nn1 = 2 * nn1
+ if (nn1 .lt. n) goto 50
+
+c---------------------------------------------------------------------
+c nn1 is the smallest power of two not less than n
+c---------------------------------------------------------------------
+
+100 continue
+ if (nzv .ge. nz) goto 110
+ vecelt = randlc( tran, amult )
+
+c---------------------------------------------------------------------
+c generate an integer between 1 and n in a portable manner
+c---------------------------------------------------------------------
+ vecloc = randlc(tran, amult)
+ i = icnvrt(vecloc, nn1) + 1
+ if (i .gt. n) goto 100
+
+c---------------------------------------------------------------------
+c was this integer generated already?
+c---------------------------------------------------------------------
+ if (mark(i) .eq. 0) then
+ mark(i) = 1
+ nzrow = nzrow + 1
+ nzloc(nzrow) = i
+ nzv = nzv + 1
+ v(nzv) = vecelt
+ iv(nzv) = i
+ endif
+ goto 100
+110 continue
+ do ii = 1, nzrow
+ i = nzloc(ii)
+ mark(i) = 0
+ enddo
+ return
+ end
+c-------end of sprnvc-----------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ function icnvrt(x, ipwr2)
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit logical (a-z)
+ double precision x
+ integer ipwr2, icnvrt
+
+c---------------------------------------------------------------------
+c scale a double precision number x in (0,1) by a power of 2 and chop it
+c---------------------------------------------------------------------
+ icnvrt = int(ipwr2 * x)
+
+ return
+ end
+c-------end of icnvrt-----------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ subroutine vecset(n, v, iv, nzv, i, val)
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit logical (a-z)
+ integer n, iv(*), nzv, i, k
+ double precision v(*), val
+
+c---------------------------------------------------------------------
+c set ith element of sparse vector (v, iv) with
+c nzv nonzeros to val
+c---------------------------------------------------------------------
+
+ logical set
+
+ set = .false.
+ do k = 1, nzv
+ if (iv(k) .eq. i) then
+ v(k) = val
+ set = .true.
+ endif
+ enddo
+ if (.not. set) then
+ nzv = nzv + 1
+ v(nzv) = val
+ iv(nzv) = i
+ endif
+ return
+ end
+c-------end of vecset-----------------------------
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'mpif.h'
+
+ integer me, nprocs, root, dp_type
+ common /mpistuff/ me, nprocs, root, dp_type
+
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "DGraph.h"
+
+DGArc *newArc(DGNode *tl,DGNode *hd){
+ DGArc *ar=(DGArc *)malloc(sizeof(DGArc));
+ ar->tail=tl;
+ ar->head=hd;
+ return ar;
+}
+void arcShow(DGArc *ar){
+ DGNode *tl=(DGNode *)ar->tail,
+ *hd=(DGNode *)ar->head;
+ fprintf(stderr,"%d. |%s ->%s\n",ar->id,tl->name,hd->name);
+}
+
+DGNode *newNode(char *nm){
+ DGNode *nd=(DGNode *)malloc(sizeof(DGNode));
+ nd->attribute=0;
+ nd->color=0;
+ nd->inDegree=0;
+ nd->outDegree=0;
+ nd->maxInDegree=SMALL_BLOCK_SIZE;
+ nd->maxOutDegree=SMALL_BLOCK_SIZE;
+ nd->inArc=(DGArc **)malloc(nd->maxInDegree*sizeof(DGArc*));
+ nd->outArc=(DGArc **)malloc(nd->maxOutDegree*sizeof(DGArc*));
+ nd->name=strdup(nm);
+ nd->feat=NULL;
+ return nd;
+}
+void nodeShow(DGNode* nd){
+ fprintf( stderr,"%3d.%s: (%d,%d)\n",
+ nd->id,nd->name,nd->inDegree,nd->outDegree);
+/*
+ if(nd->verified==1) fprintf(stderr,"%ld.%s\t: usable.",nd->id,nd->name);
+ else if(nd->verified==0) fprintf(stderr,"%ld.%s\t: unusable.",nd->id,nd->name);
+ else fprintf(stderr,"%ld.%s\t: notverified.",nd->id,nd->name);
+*/
+}
+
+DGraph* newDGraph(char* nm){
+ DGraph *dg=(DGraph *)malloc(sizeof(DGraph));
+ dg->numNodes=0;
+ dg->numArcs=0;
+ dg->maxNodes=BLOCK_SIZE;
+ dg->maxArcs=BLOCK_SIZE;
+ dg->node=(DGNode **)malloc(dg->maxNodes*sizeof(DGNode*));
+ dg->arc=(DGArc **)malloc(dg->maxArcs*sizeof(DGArc*));
+ dg->name=strdup(nm);
+ return dg;
+}
+int AttachNode(DGraph* dg, DGNode* nd) {
+ int i=0,j,len=0;
+ DGNode **nds =NULL, *tmpnd=NULL;
+ DGArc **ar=NULL;
+
+ if (dg->numNodes == dg->maxNodes-1 ) {
+ dg->maxNodes += BLOCK_SIZE;
+ nds =(DGNode **) calloc(dg->maxNodes,sizeof(DGNode*));
+ memcpy(nds,dg->node,(dg->maxNodes-BLOCK_SIZE)*sizeof(DGNode*));
+ free(dg->node);
+ dg->node=nds;
+ }
+
+ len = strlen( nd->name);
+ for (i = 0; i < dg->numNodes; i++) {
+ tmpnd =dg->node[ i];
+ ar=NULL;
+ if ( strlen( tmpnd->name) != len ) continue;
+ if ( strncmp( nd->name, tmpnd->name, len) ) continue;
+ if ( nd->inDegree > 0 ) {
+ tmpnd->maxInDegree += nd->maxInDegree;
+ ar =(DGArc **) calloc(tmpnd->maxInDegree,sizeof(DGArc*));
+ memcpy(ar,tmpnd->inArc,(tmpnd->inDegree)*sizeof(DGArc*));
+ free(tmpnd->inArc);
+ tmpnd->inArc=ar;
+ for (j = 0; j < nd->inDegree; j++ ) {
+ nd->inArc[ j]->head = tmpnd;
+ }
+ memcpy( &(tmpnd->inArc[ tmpnd->inDegree]), nd->inArc, nd->inDegree*sizeof( DGArc *));
+ tmpnd->inDegree += nd->inDegree;
+ }
+ if ( nd->outDegree > 0 ) {
+ tmpnd->maxOutDegree += nd->maxOutDegree;
+ ar =(DGArc **) calloc(tmpnd->maxOutDegree,sizeof(DGArc*));
+ memcpy(ar,tmpnd->outArc,(tmpnd->outDegree)*sizeof(DGArc*));
+ free(tmpnd->outArc);
+ tmpnd->outArc=ar;
+ for (j = 0; j < nd->outDegree; j++ ) {
+ nd->outArc[ j]->tail = tmpnd;
+ }
+ memcpy( &(tmpnd->outArc[tmpnd->outDegree]),nd->outArc,nd->outDegree*sizeof( DGArc *));
+ tmpnd->outDegree += nd->outDegree;
+ }
+ free(nd);
+ return i;
+ }
+ nd->id = dg->numNodes;
+ dg->node[dg->numNodes] = nd;
+ dg->numNodes++;
+return nd->id;
+}
+int AttachArc(DGraph *dg,DGArc* nar){
+int arcId = -1;
+int i=0,newNumber=0;
+DGNode *head = nar->head,
+ *tail = nar->tail;
+DGArc **ars=NULL,*probe=NULL;
+/*fprintf(stderr,"AttachArc %ld\n",dg->numArcs); */
+ if ( !tail || !head ) return arcId;
+ if ( dg->numArcs == dg->maxArcs-1 ) {
+ dg->maxArcs += BLOCK_SIZE;
+ ars =(DGArc **) calloc(dg->maxArcs,sizeof(DGArc*));
+ memcpy(ars,dg->arc,(dg->maxArcs-BLOCK_SIZE)*sizeof(DGArc*));
+ free(dg->arc);
+ dg->arc=ars;
+ }
+ for(i = 0; i < tail->outDegree; i++ ) { /* parallel arc */
+ probe = tail->outArc[ i];
+ if(probe->head == head
+ &&
+ probe->length == nar->length
+ ){
+ free(nar);
+ return probe->id;
+ }
+ }
+
+ nar->id = dg->numArcs;
+ arcId=dg->numArcs;
+ dg->arc[dg->numArcs] = nar;
+ dg->numArcs++;
+
+ head->inArc[ head->inDegree] = nar;
+ head->inDegree++;
+ if ( head->inDegree >= head->maxInDegree ) {
+ newNumber = head->maxInDegree + SMALL_BLOCK_SIZE;
+ ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+ memcpy(ars,head->inArc,(head->inDegree)*sizeof(DGArc*));
+ free(head->inArc);
+ head->inArc=ars;
+ head->maxInDegree = newNumber;
+ }
+ tail->outArc[ tail->outDegree] = nar;
+ tail->outDegree++;
+ if(tail->outDegree >= tail->maxOutDegree ) {
+ newNumber = tail->maxOutDegree + SMALL_BLOCK_SIZE;
+ ars =(DGArc **) calloc(newNumber,sizeof(DGArc*));
+ memcpy(ars,tail->outArc,(tail->outDegree)*sizeof(DGArc*));
+ free(tail->outArc);
+ tail->outArc=ars;
+ tail->maxOutDegree = newNumber;
+ }
+/*fprintf(stderr,"AttachArc: head->in=%d tail->out=%ld\n",head->inDegree,tail->outDegree);*/
+return arcId;
+}
+void graphShow(DGraph *dg,int DetailsLevel){
+ int i=0,j=0;
+ fprintf(stderr,"%d.%s: (%d,%d)\n",dg->id,dg->name,dg->numNodes,dg->numArcs);
+ if ( DetailsLevel < 1) return;
+ for (i = 0; i < dg->numNodes; i++ ) {
+ DGNode *focusNode = dg->node[ i];
+ if(DetailsLevel >= 2) {
+ for (j = 0; j < focusNode->inDegree; j++ ) {
+ fprintf(stderr,"\t ");
+ nodeShow(focusNode->inArc[ j]->tail);
+ }
+ }
+ nodeShow(focusNode);
+ if ( DetailsLevel < 2) continue;
+ for (j = 0; j < focusNode->outDegree; j++ ) {
+ fprintf(stderr, "\t ");
+ nodeShow(focusNode->outArc[ j]->head);
+ }
+ fprintf(stderr, "---\n");
+ }
+ fprintf(stderr,"----------------------------------------\n");
+ if ( DetailsLevel < 3) return;
+}
+
+
+
--- /dev/null
+#ifndef _DGRAPH
+#define _DGRAPH
+
+#define BLOCK_SIZE 128
+#define SMALL_BLOCK_SIZE 32
+
+typedef struct{
+ int id;
+ void *tail,*head;
+ int length,width,attribute,maxWidth;
+}DGArc;
+
+typedef struct{
+ int maxInDegree,maxOutDegree;
+ int inDegree,outDegree;
+ int id;
+ char *name;
+ DGArc **inArc,**outArc;
+ int depth,height,width;
+ int color,attribute,address,verified;
+ void *feat;
+}DGNode;
+
+typedef struct{
+ int maxNodes,maxArcs;
+ int id;
+ char *name;
+ int numNodes,numArcs;
+ DGNode **node;
+ DGArc **arc;
+} DGraph;
+
+DGArc *newArc(DGNode *tl,DGNode *hd);
+void arcShow(DGArc *ar);
+DGNode *newNode(char *nm);
+void nodeShow(DGNode* nd);
+
+DGraph* newDGraph(char *nm);
+int AttachNode(DGraph *dg,DGNode *nd);
+int AttachArc(DGraph *dg,DGArc* nar);
+void graphShow(DGraph *dg,int DetailsLevel);
+
+#endif
--- /dev/null
+SHELL=/bin/sh
+BENCHMARK=dt
+BENCHMARKU=DT
+
+include ../config/make.def
+
+include ../sys/make.common
+#Override PROGRAM
+DTPROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS)
+
+OBJS = dt.o DGraph.o \
+ ${COMMON}/c_print_results.o ${COMMON}/c_timers.o ${COMMON}/c_randdp.o
+
+
+${PROGRAM}: config ${OBJS}
+ ${CLINK} ${CLINKFLAGS} -o ${DTPROGRAM} ${OBJS} ${CMPI_LIB}
+
+.c.o:
+ ${CCOMPILE} $<
+
+dt.o: dt.c npbparams.h
+DGraph.o: DGraph.c DGraph.h
+
+clean:
+ - rm -f *.o *~ mputil*
+ - rm -f dt npbparams.h core
--- /dev/null
+Data Traffic benchmark DT is new in the NPB suite
+(released as part of NPB3.x-MPI package).
+----------------------------------------------------
+
+DT is written in C and same executable can run on any number of processors,
+provided this number is not less than the number of nodes in the communication
+graph. DT benchmark takes one argument: BH, WH, or SH. This argument
+specifies the communication graph Black Hole, White Hole, or SHuffle
+respectively. The current release contains verification numbers for
+CLASSES S, W, A, and B only. Classes C and D are defined, but verification
+numbers are not provided in this release.
+
+The following table summarizes the number of nodes in the communication
+graph based on CLASS and graph TYPE.
+
+CLASS N_Source N_Nodes(BH,WH) N_Nodes(SH)
+ S 4 5 12
+ W 8 11 32
+ A 16 21 80
+ B 32 43 192
+ C 64 85 448
+ D 128 171 1024
--- /dev/null
+/*************************************************************************
+ * *
+ * N A S P A R A L L E L B E N C H M A R K S 3.3 *
+ * *
+ * D T *
+ * *
+ *************************************************************************
+ * *
+ * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
+ * *
+ * Permission to use, copy, distribute and modify this software *
+ * for any purpose with or without fee is hereby granted. We *
+ * request, however, that all derived work reference the NAS *
+ * Parallel Benchmarks 3.3. This software is provided "as is" *
+ * without express or implied warranty. *
+ * *
+ * Information on NPB 3.3, including the technical report, the *
+ * original specifications, source code, results and information *
+ * on how to submit new results, is available at: *
+ * *
+ * http: www.nas.nasa.gov/Software/NPB *
+ * *
+ * Send comments or suggestions to npb@nas.nasa.gov *
+ * Send bug reports to npb-bugs@nas.nasa.gov *
+ * *
+ * NAS Parallel Benchmarks Group *
+ * NASA Ames Research Center *
+ * Mail Stop: T27A-1 *
+ * Moffett Field, CA 94035-1000 *
+ * *
+ * E-mail: npb@nas.nasa.gov *
+ * Fax: (650) 604-3957 *
+ * *
+ *************************************************************************
+ * *
+ * Author: M. Frumkin * *
+ * *
+ *************************************************************************/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "mpi.h"
+#include "npbparams.h"
+
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS 1
+#endif
+
+//int passed_verification;
+extern double randlc( double *X, double *A );
+extern
+void c_print_results( char *name,
+ char class,
+ int n1,
+ int n2,
+ int n3,
+ int niter,
+ int nprocs_compiled,
+ int nprocs_total,
+ double t,
+ double mops,
+ char *optype,
+ int passed_verification,
+ char *npbversion,
+ char *compiletime,
+ char *mpicc,
+ char *clink,
+ char *cmpi_lib,
+ char *cmpi_inc,
+ char *cflags,
+ char *clinkflags );
+
+void timer_clear( int n );
+void timer_start( int n );
+void timer_stop( int n );
+double timer_read( int n );
+int timer_on=0,timers_tot=64;
+
+int verify(char *bmname,double rnm2){
+ double verify_value=0.0;
+ double epsilon=1.0E-8;
+ char cls=CLASS;
+ int verified=-1;
+ if (cls != 'U') {
+ if(cls=='S') {
+ if(strstr(bmname,"BH")){
+ verify_value=30892725.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value=67349758.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value=58875767.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ }
+ verified = 0;
+ }else if(cls=='W') {
+ if(strstr(bmname,"BH")){
+ verify_value = 4102461.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value = 204280762.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value = 186944764.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ }
+ verified = 0;
+ }else if(cls=='A') {
+ if(strstr(bmname,"BH")){
+ verify_value = 17809491.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value = 1289925229.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value = 610856482.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ }
+ verified = 0;
+ }else if(cls=='B') {
+ if(strstr(bmname,"BH")){
+ verify_value = 4317114.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value = 7877279917.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value = 1836863082.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ verified = 0;
+ }
+ }else if(cls=='C') {
+ if(strstr(bmname,"BH")){
+ verify_value = 0.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value = 0.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value = 0.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ verified = -1;
+ }
+ }else if(cls=='D') {
+ if(strstr(bmname,"BH")){
+ verify_value = 0.0;
+ }else if(strstr(bmname,"WH")){
+ verify_value = 0.0;
+ }else if(strstr(bmname,"SH")){
+ verify_value = 0.0;
+ }else{
+ fprintf(stderr,"No such benchmark as %s.\n",bmname);
+ }
+ verified = -1;
+ }else{
+ fprintf(stderr,"No such class as %c.\n",cls);
+ }
+ fprintf(stderr," %s L2 Norm = %f\n",bmname,rnm2);
+ if(verified==-1){
+ fprintf(stderr," No verification was performed.\n");
+ }else if( rnm2 - verify_value < epsilon &&
+ rnm2 - verify_value > -epsilon) { /* abs here does not work on ALTIX */
+ verified = 1;
+ fprintf(stderr," Deviation = %f\n",(rnm2 - verify_value));
+ }else{
+ verified = 0;
+ fprintf(stderr," The correct verification value = %f\n",verify_value);
+ fprintf(stderr," Got value = %f\n",rnm2);
+ }
+ }else{
+ verified = -1;
+ }
+ return verified;
+ }
+
+int ipowMod(int a,long long int n,int md){
+ int seed=1,q=a,r=1;
+ if(n<0){
+ fprintf(stderr,"ipowMod: exponent must be nonnegative exp=%lld\n",n);
+ n=-n; /* temp fix */
+/* return 1; */
+ }
+ if(md<=0){
+ fprintf(stderr,"ipowMod: module must be positive mod=%d",md);
+ return 1;
+ }
+ if(n==0) return 1;
+ while(n>1){
+ int n2 = n/2;
+ if (n2*2==n){
+ seed = (q*q)%md;
+ q=seed;
+ n = n2;
+ }else{
+ seed = (r*q)%md;
+ r=seed;
+ n = n-1;
+ }
+ }
+ seed = (r*q)%md;
+ return seed;
+}
+
+#include "DGraph.h"
+DGraph *buildSH(char cls){
+/*
+ Nodes of the graph must be topologically sorted
+ to avoid MPI deadlock.
+*/
+ DGraph *dg;
+ int numSources=NUM_SOURCES; /* must be power of 2 */
+ int numOfLayers=0,tmpS=numSources>>1;
+ int firstLayerNode=0;
+ DGArc *ar=NULL;
+ DGNode *nd=NULL;
+ int mask=0x0,ndid=0,ndoff=0;
+ int i=0,j=0;
+ char nm[BLOCK_SIZE];
+
+ sprintf(nm,"DT_SH.%c",cls);
+ dg=newDGraph(nm);
+
+ while(tmpS>1){
+ numOfLayers++;
+ tmpS>>=1;
+ }
+ for(i=0;i<numSources;i++){
+ sprintf(nm,"Source.%d",i);
+ nd=newNode(nm);
+ AttachNode(dg,nd);
+ }
+ for(j=0;j<numOfLayers;j++){
+ mask=0x00000001<<j;
+ for(i=0;i<numSources;i++){
+ sprintf(nm,"Comparator.%d",(i+j*firstLayerNode));
+ nd=newNode(nm);
+ AttachNode(dg,nd);
+ ndoff=i&(~mask);
+ ndid=firstLayerNode+ndoff;
+ ar=newArc(dg->node[ndid],nd);
+ AttachArc(dg,ar);
+ ndoff+=mask;
+ ndid=firstLayerNode+ndoff;
+ ar=newArc(dg->node[ndid],nd);
+ AttachArc(dg,ar);
+ }
+ firstLayerNode+=numSources;
+ }
+ mask=0x00000001<<numOfLayers;
+ for(i=0;i<numSources;i++){
+ sprintf(nm,"Sink.%d",i);
+ nd=newNode(nm);
+ AttachNode(dg,nd);
+ ndoff=i&(~mask);
+ ndid=firstLayerNode+ndoff;
+ ar=newArc(dg->node[ndid],nd);
+ AttachArc(dg,ar);
+ ndoff+=mask;
+ ndid=firstLayerNode+ndoff;
+ ar=newArc(dg->node[ndid],nd);
+ AttachArc(dg,ar);
+ }
+return dg;
+}
+DGraph *buildWH(char cls){
+/*
+ Nodes of the graph must be topologically sorted
+ to avoid MPI deadlock.
+*/
+ int i=0,j=0;
+ int numSources=NUM_SOURCES,maxInDeg=4;
+ int numLayerNodes=numSources,firstLayerNode=0;
+ int totComparators=0;
+ int numPrevLayerNodes=numLayerNodes;
+ int id=0,sid=0;
+ DGraph *dg;
+ DGNode *nd=NULL,*source=NULL,*tmp=NULL,*snd=NULL;
+ DGArc *ar=NULL;
+ char nm[BLOCK_SIZE];
+
+ sprintf(nm,"DT_WH.%c",cls);
+ dg=newDGraph(nm);
+
+ for(i=0;i<numSources;i++){
+ sprintf(nm,"Sink.%d",i);
+ nd=newNode(nm);
+ AttachNode(dg,nd);
+ }
+ totComparators=0;
+ numPrevLayerNodes=numLayerNodes;
+ while(numLayerNodes>maxInDeg){
+ numLayerNodes=numLayerNodes/maxInDeg;
+ if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+ for(i=0;i<numLayerNodes;i++){
+ sprintf(nm,"Comparator.%d",totComparators);
+ totComparators++;
+ nd=newNode(nm);
+ id=AttachNode(dg,nd);
+ for(j=0;j<maxInDeg;j++){
+ sid=i*maxInDeg+j;
+ if(sid>=numPrevLayerNodes) break;
+ snd=dg->node[firstLayerNode+sid];
+ ar=newArc(dg->node[id],snd);
+ AttachArc(dg,ar);
+ }
+ }
+ firstLayerNode+=numPrevLayerNodes;
+ numPrevLayerNodes=numLayerNodes;
+ }
+ source=newNode("Source");
+ AttachNode(dg,source);
+ for(i=0;i<numPrevLayerNodes;i++){
+ nd=dg->node[firstLayerNode+i];
+ ar=newArc(source,nd);
+ AttachArc(dg,ar);
+ }
+
+ for(i=0;i<dg->numNodes/2;i++){ /* Topological sorting */
+ tmp=dg->node[i];
+ dg->node[i]=dg->node[dg->numNodes-1-i];
+ dg->node[i]->id=i;
+ dg->node[dg->numNodes-1-i]=tmp;
+ dg->node[dg->numNodes-1-i]->id=dg->numNodes-1-i;
+ }
+return dg;
+}
+DGraph *buildBH(char cls){
+/*
+ Nodes of the graph must be topologically sorted
+ to avoid MPI deadlock.
+*/
+ int i=0,j=0;
+ int numSources=NUM_SOURCES,maxInDeg=4;
+ int numLayerNodes=numSources,firstLayerNode=0;
+ DGraph *dg;
+ DGNode *nd=NULL, *snd=NULL, *sink=NULL;
+ DGArc *ar=NULL;
+ int totComparators=0;
+ int numPrevLayerNodes=numLayerNodes;
+ int id=0, sid=0;
+ char nm[BLOCK_SIZE];
+
+ sprintf(nm,"DT_BH.%c",cls);
+ dg=newDGraph(nm);
+
+ for(i=0;i<numSources;i++){
+ sprintf(nm,"Source.%d",i);
+ nd=newNode(nm);
+ AttachNode(dg,nd);
+ }
+ while(numLayerNodes>maxInDeg){
+ numLayerNodes=numLayerNodes/maxInDeg;
+ if(numLayerNodes*maxInDeg<numPrevLayerNodes)numLayerNodes++;
+ for(i=0;i<numLayerNodes;i++){
+ sprintf(nm,"Comparator.%d",totComparators);
+ totComparators++;
+ nd=newNode(nm);
+ id=AttachNode(dg,nd);
+ for(j=0;j<maxInDeg;j++){
+ sid=i*maxInDeg+j;
+ if(sid>=numPrevLayerNodes) break;
+ snd=dg->node[firstLayerNode+sid];
+ ar=newArc(snd,dg->node[id]);
+ AttachArc(dg,ar);
+ }
+ }
+ firstLayerNode+=numPrevLayerNodes;
+ numPrevLayerNodes=numLayerNodes;
+ }
+ sink=newNode("Sink");
+ AttachNode(dg,sink);
+ for(i=0;i<numPrevLayerNodes;i++){
+ nd=dg->node[firstLayerNode+i];
+ ar=newArc(nd,sink);
+ AttachArc(dg,ar);
+ }
+return dg;
+}
+
+typedef struct{
+ int len;
+ double* val;
+} Arr;
+Arr *newArr(int len){
+ Arr *arr=(Arr *)malloc(sizeof(Arr));
+ arr->len=len;
+ arr->val=(double *)malloc(len*sizeof(double));
+ return arr;
+}
+void arrShow(Arr* a){
+ if(!a) fprintf(stderr,"-- NULL array\n");
+ else{
+ fprintf(stderr,"-- length=%d\n",a->len);
+ }
+}
+double CheckVal(Arr *feat){
+ double csum=0.0;
+ int i=0;
+ for(i=0;i<feat->len;i++){
+ csum+=feat->val[i]*feat->val[i]/feat->len; /* The truncation does not work since
+ result will be 0 for large len */
+ }
+ return csum;
+}
+int GetFNumDPar(int* mean, int* stdev){
+ *mean=NUM_SAMPLES;
+ *stdev=STD_DEVIATION;
+ return 0;
+}
+int GetFeatureNum(char *mbname,int id){
+ double tran=314159265.0;
+ double A=2*id+1;
+ double denom=randlc(&tran,&A);
+ char cval='S';
+ int mean=NUM_SAMPLES,stdev=128;
+ int rtfs=0,len=0;
+ GetFNumDPar(&mean,&stdev);
+ rtfs=ipowMod((int)(1/denom)*(int)cval,(long long int) (2*id+1),2*stdev);
+ if(rtfs<0) rtfs=-rtfs;
+ len=mean-stdev+rtfs;
+ return len;
+}
+Arr* RandomFeatures(char *bmname,int fdim,int id){
+ int len=GetFeatureNum(bmname,id)*fdim;
+ Arr* feat=newArr(len);
+ int nxg=2,nyg=2,nzg=2,nfg=5;
+ int nx=421,ny=419,nz=1427,nf=3527;
+ long long int expon=(len*(id+1))%3141592;
+ int seedx=ipowMod(nxg,expon,nx),
+ seedy=ipowMod(nyg,expon,ny),
+ seedz=ipowMod(nzg,expon,nz),
+ seedf=ipowMod(nfg,expon,nf);
+ int i=0;
+ if(timer_on){
+ timer_clear(id+1);
+ timer_start(id+1);
+ }
+ for(i=0;i<len;i+=fdim){
+ seedx=(seedx*nxg)%nx;
+ seedy=(seedy*nyg)%ny;
+ seedz=(seedz*nzg)%nz;
+ seedf=(seedf*nfg)%nf;
+ feat->val[i]=seedx;
+ feat->val[i+1]=seedy;
+ feat->val[i+2]=seedz;
+ feat->val[i+3]=seedf;
+ }
+ if(timer_on){
+ timer_stop(id+1);
+ fprintf(stderr,"** RandomFeatures time in node %d = %f\n",id,timer_read(id+1));
+ }
+ return feat;
+}
+void Resample(Arr *a,int blen){
+ long long int i=0,j=0,jlo=0,jhi=0;
+ double avval=0.0;
+ double *nval=(double *)malloc(blen*sizeof(double));
+ Arr *tmp=newArr(10);
+ for(i=0;i<blen;i++) nval[i]=0.0;
+ for(i=1;i<a->len-1;i++){
+ jlo=(int)(0.5*(2*i-1)*(blen/a->len));
+ jhi=(int)(0.5*(2*i+1)*(blen/a->len));
+
+ avval=a->val[i]/(jhi-jlo+1);
+ for(j=jlo;j<=jhi;j++){
+ nval[j]+=avval;
+ }
+ }
+ nval[0]=a->val[0];
+ nval[blen-1]=a->val[a->len-1];
+ free(a->val);
+ a->val=nval;
+ a->len=blen;
+}
+#define fielddim 4
+Arr* WindowFilter(Arr *a, Arr* b,int w){
+ int i=0,j=0,k=0;
+ double rms0=0.0,rms1=0.0,rmsm1=0.0;
+ double weight=((double) (w+1))/(w+2);
+
+ w+=1;
+ if(timer_on){
+ timer_clear(w);
+ timer_start(w);
+ }
+ if(a->len<b->len) Resample(a,b->len);
+ if(a->len>b->len) Resample(b,a->len);
+ for(i=fielddim;i<a->len-fielddim;i+=fielddim){
+ rms0=(a->val[i]-b->val[i])*(a->val[i]-b->val[i])
+ +(a->val[i+1]-b->val[i+1])*(a->val[i+1]-b->val[i+1])
+ +(a->val[i+2]-b->val[i+2])*(a->val[i+2]-b->val[i+2])
+ +(a->val[i+3]-b->val[i+3])*(a->val[i+3]-b->val[i+3]);
+ j=i+fielddim;
+ rms1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+ +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+ +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+ +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+ j=i-fielddim;
+ rmsm1=(a->val[j]-b->val[j])*(a->val[j]-b->val[j])
+ +(a->val[j+1]-b->val[j+1])*(a->val[j+1]-b->val[j+1])
+ +(a->val[j+2]-b->val[j+2])*(a->val[j+2]-b->val[j+2])
+ +(a->val[j+3]-b->val[j+3])*(a->val[j+3]-b->val[j+3]);
+ k=0;
+ if(rms1<rms0){
+ k=1;
+ rms0=rms1;
+ }
+ if(rmsm1<rms0) k=-1;
+ if(k==0){
+ j=i+fielddim;
+ a->val[i]=weight*b->val[i];
+ a->val[i+1]=weight*b->val[i+1];
+ a->val[i+2]=weight*b->val[i+2];
+ a->val[i+3]=weight*b->val[i+3];
+ }else if(k==1){
+ j=i+fielddim;
+ a->val[i]=weight*b->val[j];
+ a->val[i+1]=weight*b->val[j+1];
+ a->val[i+2]=weight*b->val[j+2];
+ a->val[i+3]=weight*b->val[j+3];
+ }else { /*if(k==-1)*/
+ j=i-fielddim;
+ a->val[i]=weight*b->val[j];
+ a->val[i+1]=weight*b->val[j+1];
+ a->val[i+2]=weight*b->val[j+2];
+ a->val[i+3]=weight*b->val[j+3];
+ }
+ }
+ if(timer_on){
+ timer_stop(w);
+ fprintf(stderr,"** WindowFilter time in node %d = %f\n",(w-1),timer_read(w));
+ }
+ return a;
+}
+
+int SendResults(DGraph *dg,DGNode *nd,Arr *feat){
+ int i=0,tag=0;
+ DGArc *ar=NULL;
+ DGNode *head=NULL;
+ if(!feat) return 0;
+ for(i=0;i<nd->outDegree;i++){
+ ar=nd->outArc[i];
+ if(ar->tail!=nd) continue;
+ head=ar->head;
+ tag=ar->id;
+ if(head->address!=nd->address){
+ MPI_Send(&feat->len,1,MPI_INT,head->address,tag,MPI_COMM_WORLD);
+ MPI_Send(feat->val,feat->len,MPI_DOUBLE,head->address,tag,MPI_COMM_WORLD);
+ }
+ }
+ return 1;
+}
+Arr* CombineStreams(DGraph *dg,DGNode *nd){
+ Arr *resfeat=newArr(NUM_SAMPLES*fielddim);
+ int i=0,len=0,tag=0;
+ DGArc *ar=NULL;
+ DGNode *tail=NULL;
+ MPI_Status status;
+ Arr *feat=NULL,*featp=NULL;
+
+ if(nd->inDegree==0) return NULL;
+ for(i=0;i<nd->inDegree;i++){
+ ar=nd->inArc[i];
+ if(ar->head!=nd) continue;
+ tail=ar->tail;
+ if(tail->address!=nd->address){
+ len=0;
+ tag=ar->id;
+ MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+ feat=newArr(len);
+ MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+ resfeat=WindowFilter(resfeat,feat,nd->id);
+ free(feat);
+ }else{
+ featp=(Arr *)tail->feat;
+ feat=newArr(featp->len);
+ memcpy(feat->val,featp->val,featp->len*sizeof(double));
+ resfeat=WindowFilter(resfeat,feat,nd->id);
+ free(feat);
+ }
+ }
+ for(i=0;i<resfeat->len;i++) resfeat->val[i]=((int)resfeat->val[i])/nd->inDegree;
+ nd->feat=resfeat;
+ return nd->feat;
+}
+double Reduce(Arr *a,int w){
+ double retv=0.0;
+ if(timer_on){
+ timer_clear(w);
+ timer_start(w);
+ }
+ retv=(int)(w*CheckVal(a));/* The casting needed for node
+ and array dependent verifcation */
+ if(timer_on){
+ timer_stop(w);
+ fprintf(stderr,"** Reduce time in node %d = %f\n",(w-1),timer_read(w));
+ }
+ return retv;
+}
+
+double ReduceStreams(DGraph *dg,DGNode *nd){
+ double csum=0.0;
+ int i=0,len=0,tag=0;
+ DGArc *ar=NULL;
+ DGNode *tail=NULL;
+ Arr *feat=NULL;
+ double retv=0.0;
+
+ for(i=0;i<nd->inDegree;i++){
+ ar=nd->inArc[i];
+ if(ar->head!=nd) continue;
+ tail=ar->tail;
+ if(tail->address!=nd->address){
+ MPI_Status status;
+ len=0;
+ tag=ar->id;
+ MPI_Recv(&len,1,MPI_INT,tail->address,tag,MPI_COMM_WORLD,&status);
+ feat=newArr(len);
+ MPI_Recv(feat->val,feat->len,MPI_DOUBLE,tail->address,tag,MPI_COMM_WORLD,&status);
+ csum+=Reduce(feat,(nd->id+1));
+ free(feat);
+ }else{
+ csum+=Reduce(tail->feat,(nd->id+1));
+ }
+ }
+ if(nd->inDegree>0)csum=(((long long int)csum)/nd->inDegree);
+ retv=(nd->id+1)*csum;
+ return retv;
+}
+
+int ProcessNodes(DGraph *dg,int me){
+ double chksum=0.0;
+ Arr *feat=NULL;
+ int i=0,verified=0,tag;
+ DGNode *nd=NULL;
+ double rchksum=0.0;
+ MPI_Status status;
+
+ for(i=0;i<dg->numNodes;i++){
+ nd=dg->node[i];
+ if(nd->address!=me) continue;
+ if(strstr(nd->name,"Source")){
+ nd->feat=RandomFeatures(dg->name,fielddim,nd->id);
+ SendResults(dg,nd,nd->feat);
+ }else if(strstr(nd->name,"Sink")){
+ chksum=ReduceStreams(dg,nd);
+ tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
+ MPI_Send(&chksum,1,MPI_DOUBLE,0,tag,MPI_COMM_WORLD);
+ }else{
+ feat=CombineStreams(dg,nd);
+ SendResults(dg,nd,feat);
+ }
+ }
+ if(me==0){ /* Report node */
+ rchksum=0.0;
+ chksum=0.0;
+ for(i=0;i<dg->numNodes;i++){
+ nd=dg->node[i];
+ if(!strstr(nd->name,"Sink")) continue;
+ tag=dg->numArcs+nd->id; /* make these to avoid clash with arc tags */
+ MPI_Recv(&rchksum,1,MPI_DOUBLE,nd->address,tag,MPI_COMM_WORLD,&status);
+ chksum+=rchksum;
+ }
+ verified=verify(dg->name,chksum);
+ }
+return verified;
+}
+
+int main(int argc,char **argv ){
+ int my_rank,comm_size;
+ int i;
+ DGraph *dg=NULL;
+ int verified=0, featnum=0;
+ double bytes_sent=2.0,tot_time=0.0;
+
+ MPI_Init( &argc, &argv );
+ MPI_Comm_rank( MPI_COMM_WORLD, &my_rank );
+ MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+
+ if(argc!=2||
+ ( strncmp(argv[1],"BH",2)!=0
+ &&strncmp(argv[1],"WH",2)!=0
+ &&strncmp(argv[1],"SH",2)!=0
+ )
+ ){
+ if(my_rank==0){
+ fprintf(stderr,"** Usage: mpirun -np N ../bin/dt.S GraphName\n");
+ fprintf(stderr,"** Where \n - N is integer number of MPI processes\n");
+ fprintf(stderr," - S is the class S, W, or A \n");
+ fprintf(stderr," - GraphName is the communication graph name BH, WH, or SH.\n");
+ fprintf(stderr," - the number of MPI processes N should not be be less than \n");
+ fprintf(stderr," the number of nodes in the graph\n");
+ }
+ MPI_Finalize();
+ exit(0);
+ }
+ if(strncmp(argv[1],"BH",2)==0){
+ dg=buildBH(CLASS);
+ }else if(strncmp(argv[1],"WH",2)==0){
+ dg=buildWH(CLASS);
+ }else if(strncmp(argv[1],"SH",2)==0){
+ dg=buildSH(CLASS);
+ }
+
+ if(timer_on&&dg->numNodes+1>timers_tot){
+ timer_on=0;
+ if(my_rank==0)
+ fprintf(stderr,"Not enough timers. Node timeing is off. \n");
+ }
+ if(dg->numNodes>comm_size){
+ if(my_rank==0){
+ fprintf(stderr,"** The number of MPI processes should not be less than \n");
+ fprintf(stderr,"** the number of nodes in the graph\n");
+ fprintf(stderr,"** Number of MPI processes = %d\n",comm_size);
+ fprintf(stderr,"** Number nodes in the graph = %d\n",dg->numNodes);
+ }
+ MPI_Finalize();
+ exit(0);
+ }
+ for(i=0;i<dg->numNodes;i++){
+ dg->node[i]->address=i;
+ }
+ if( my_rank == 0 ){
+ printf( "\n\n NAS Parallel Benchmarks 3.3 -- DT Benchmark\n\n" );
+ graphShow(dg,0);
+ timer_clear(0);
+ timer_start(0);
+ }
+ verified=ProcessNodes(dg,my_rank);
+
+ featnum=NUM_SAMPLES*fielddim;
+ bytes_sent=featnum*dg->numArcs;
+ bytes_sent/=1048576;
+ if(my_rank==0){
+ timer_stop(0);
+ tot_time=timer_read(0);
+ c_print_results( dg->name,
+ CLASS,
+ featnum,
+ 0,
+ 0,
+ dg->numNodes,
+ 0,
+ comm_size,
+ tot_time,
+ bytes_sent/tot_time,
+ "bytes transmitted",
+ verified,
+ NPBVERSION,
+ COMPILETIME,
+ MPICC,
+ CLINK,
+ CMPI_LIB,
+ CMPI_INC,
+ CFLAGS,
+ CLINKFLAGS );
+ }
+ MPI_Finalize();
+ return 1;
+}
--- /dev/null
+SHELL=/bin/sh
+BENCHMARK=ep
+BENCHMARKU=EP
+
+include ../config/make.def
+
+#OBJS = ep.o ${COMMON}/print_results.o ${COMMON}/${RAND}.o ${COMMON}/timers.o
+OBJS = ep.o randlc.o
+
+include ../sys/make.common
+
+${PROGRAM}: config ${OBJS}
+# ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+ ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
+
+
+#ep.o: ep.f mpinpb.h npbparams.h
+# ${FCOMPILE} ep.f
+
+ep.o: ep.c randlc.c mpinpb.h npbparams.h
+ ${CCOMPILE} ep.c
+
+clean:
+ - rm -f *.o *~
+ - rm -f npbparams.h core
+
+
+
--- /dev/null
+This code implements the random-number generator described in the
+NAS Parallel Benchmark document RNR Technical Report RNR-94-007.
+The code is "embarrassingly" parallel in that no communication is
+required for the generation of the random numbers itself. There is
+no special requirement on the number of processors used for running
+the benchmark.
--- /dev/null
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include "mpi.h"
+#include "npbparams.h"
+
+#include "randlc.h"
+
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS 1
+#endif
+#define true 1
+#define false 0
+
+
+//---NOTE : all the timers function have been modified to
+// avoid global timers (privatize these).
+ // ----------------------- timers ---------------------
+ void timer_clear(double *onetimer) {
+ //elapsed[n] = 0.0;
+ *onetimer = 0.0;
+ }
+
+ void timer_start(double *onetimer) {
+ *onetimer = MPI_Wtime();
+ }
+
+ void timer_stop(int n,double *elapsed,double *start) {
+ double t, now;
+
+ now = MPI_Wtime();
+ t = now - start[n];
+ elapsed[n] += t;
+ }
+
+ double timer_read(int n, double *elapsed) { /* ok, useless, but jsut to keep function call */
+ return(elapsed[n]);
+ }
+ /********************************************************************
+ ***************** V R A N L C ******************
+ ***************** *****************/
+ double vranlc(int n, double x, double a, double *y)
+ {
+ int i;
+ long i246m1=0x00003FFFFFFFFFFF;
+ long LLx, Lx, La;
+ double d2m46;
+
+// This doesn't work, because the compiler does the calculation in 32
+// bits and overflows. No standard way (without f90 stuff) to specify
+// that the rhs should be done in 64 bit arithmetic.
+// parameter(i246m1=2**46-1)
+
+ d2m46=pow(0.5,46);
+
+// c Note that the v6 compiler on an R8000 does something stupid with
+// c the above. Using the following instead (or various other things)
+// c makes the calculation run almost 10 times as fast.
+//
+// c save d2m46
+// c data d2m46/0.0d0/
+// c if (d2m46 .eq. 0.0d0) then
+// c d2m46 = 0.5d0**46
+// c endif
+
+ Lx = (long)x;
+ La = (long)a;
+ //fprintf(stdout,("================== Vranlc ================");
+ //fprintf(stdout,("Before Loop: Lx = " + Lx + ", La = " + La);
+ LLx = Lx;
+ for (i=0; i< n; i++) {
+ Lx = Lx*La & i246m1 ;
+ LLx = Lx;
+ y[i] = d2m46 * (double)LLx;
+ /*
+ if(i == 0) {
+ fprintf(stdout,("After loop 0:");
+ fprintf(stdout,("Lx = " + Lx + ", La = " + La);
+ fprintf(stdout,("d2m46 = " + d2m46);
+ fprintf(stdout,("LLX(Lx) = " + LLX.doubleValue());
+ fprintf(stdout,("Y[0]" + y[0]);
+ }
+ */
+ }
+
+ x = (double)LLx;
+ /*
+ fprintf(stdout,("Change: Lx = " + Lx);
+ fprintf(stdout,("=============End Vranlc ================");
+ */
+ return x;
+ }
+
+
+
+//-------------- the core (unique function) -----------
+ void doTest(int argc, char **argv) {
+ double dum[3] = {1.,1.,1.};
+ double x1, x2, sx, sy, tm, an, tt, gc;
+ double Mops;
+ double epsilon=1.0E-8, a = 1220703125., s=271828183.;
+ double t1, t2, t3, t4;
+ double sx_verify_value, sy_verify_value, sx_err, sy_err;
+
+#include "npbparams.h"
+ int mk=16,
+ // --> set by make : in npbparams.h
+ //m=28, // for CLASS=A
+ //m=30, // for CLASS=B
+ //npm=2, // NPROCS
+ mm = m-mk,
+ nn = (int)(pow(2,mm)),
+ nk = (int)(pow(2,mk)),
+ nq=10,
+ np,
+ node,
+ no_nodes,
+ i,
+ ik,
+ kk,
+ l,
+ k, nit, no_large_nodes,
+ np_add, k_offset, j;
+ int me, nprocs, root=0, dp_type;
+ int verified,
+ timers_enabled=true;
+ char size[500]; // mind the size of the string to represent a big number
+
+ //Use in randlc..
+ int KS = 0;
+ double R23, R46, T23, T46;
+
+ double *qq = (double *) malloc (10000*sizeof(double));
+ double *start = (double *) malloc (64*sizeof(double));
+ double *elapsed = (double *) malloc (64*sizeof(double));
+
+ double *x = (double *) malloc (2*nk*sizeof(double));
+ double *q = (double *) malloc (nq*sizeof(double));
+
+ MPI_Init( &argc, &argv );
+ MPI_Comm_size( MPI_COMM_WORLD, &no_nodes);
+ MPI_Comm_rank( MPI_COMM_WORLD, &node);
+
+#ifdef USE_MPE
+ MPE_Init_log();
+#endif
+ root = 0;
+ if (node == root ) {
+
+ /* Because the size of the problem is too large to store in a 32-bit
+ * integer for some classes, we put it into a string (for printing).
+ * Have to strip off the decimal point put in there by the floating
+ * point print statement (internal file)
+ */
+ fprintf(stdout," NAS Parallel Benchmarks 3.2 -- EP Benchmark");
+ sprintf(size,"%d",pow(2,m+1));
+ //size = size.replace('.', ' ');
+ fprintf(stdout," Number of random numbers generated: %s\n",size);
+ fprintf(stdout," Number of active processes: %d\n",no_nodes);
+
+ }
+ verified = false;
+
+ /* c Compute the number of "batches" of random number pairs generated
+ c per processor. Adjust if the number of processors does not evenly
+ c divide the total number
+*/
+
+ np = nn / no_nodes;
+ no_large_nodes = nn % no_nodes;
+ if (node < no_large_nodes) np_add = 1;
+ else np_add = 0;
+ np = np + np_add;
+
+ if (np == 0) {
+ fprintf(stdout,"Too many nodes: %d %d",no_nodes,nn);
+ MPI_Abort(MPI_COMM_WORLD,1);
+ exit(0);
+ }
+
+/* c Call the random number generator functions and initialize
+ c the x-array to reduce the effects of paging on the timings.
+ c Also, call all mathematical functions that are used. Make
+ c sure these initializations cannot be eliminated as dead code.
+*/
+
+ //call vranlc(0, dum[1], dum[2], dum[3]);
+ // Array indexes start at 1 in Fortran, 0 in Java
+ vranlc(0, dum[0], dum[1], &(dum[2]));
+
+ dum[0] = randlc(&(dum[1]),&(dum[2]));
+ /////////////////////////////////
+ for (i=0;i<2*nk;i++) {
+ x[i] = -1e99;
+ }
+ Mops = log(sqrt(abs(1)));
+
+ /*
+ c---------------------------------------------------------------------
+ c Synchronize before placing time stamp
+ c---------------------------------------------------------------------
+ */
+ MPI_Barrier( MPI_COMM_WORLD );
+
+ timer_clear(&(elapsed[1]));
+ timer_clear(&(elapsed[2]));
+ timer_clear(&(elapsed[3]));
+ timer_start(&(start[1]));
+
+ t1 = a;
+ //fprintf(stdout,("(ep.f:160) t1 = " + t1);
+ t1 = vranlc(0, t1, a, x);
+ //fprintf(stdout,("(ep.f:161) t1 = " + t1);
+
+
+/* c Compute AN = A ^ (2 * NK) (mod 2^46). */
+
+ t1 = a;
+ //fprintf(stdout,("(ep.f:165) t1 = " + t1);
+ for (i=1; i <= mk+1; i++) {
+ t2 = randlc(&t1, &t1);
+ //fprintf(stdout,("(ep.f:168)[loop i=" + i +"] t1 = " + t1);
+ }
+ an = t1;
+ //fprintf(stdout,("(ep.f:172) s = " + s);
+ tt = s;
+ gc = 0.;
+ sx = 0.;
+ sy = 0.;
+ for (i=0; i < nq ; i++) {
+ q[i] = 0.;
+ }
+
+/*
+ Each instance of this loop may be performed independently. We compute
+ the k offsets separately to take into account the fact that some nodes
+ have more numbers to generate than others
+*/
+
+ if (np_add == 1)
+ k_offset = node * np -1;
+ else
+ k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1;
+
+ int stop = false;
+ for(k = 1; k <= np; k++) {
+ stop = false;
+ kk = k_offset + k ;
+ t1 = s;
+ //fprintf(stdout,("(ep.f:193) t1 = " + t1);
+ t2 = an;
+
+// Find starting seed t1 for this kk.
+
+ for (i=1;i<=100 && !stop;i++) {
+ ik = kk / 2;
+ //fprintf(stdout,("(ep.f:199) ik = " +ik+", kk = " + kk);
+ if (2 * ik != kk) {
+ t3 = randlc(&t1, &t2);
+ //fprintf(stdout,("(ep.f:200) t1= " +t1 );
+ }
+ if (ik==0)
+ stop = true;
+ else {
+ t3 = randlc(&t2, &t2);
+ kk = ik;
+ }
+ }
+// Compute uniform pseudorandom numbers.
+
+ //if (timers_enabled) timer_start(3);
+ timer_start(&(start[3]));
+ //call vranlc(2 * nk, t1, a, x) --> t1 and y are modified
+
+ //fprintf(stdout,">>>>>>>>>>>Before vranlc(l.210)<<<<<<<<<<<<<");
+ //fprintf(stdout,"2*nk = " + (2*nk));
+ //fprintf(stdout,"t1 = " + t1);
+ //fprintf(stdout,"a = " + a);
+ //fprintf(stdout,"x[0] = " + x[0]);
+ //fprintf(stdout,">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
+
+ t1 = vranlc(2 * nk, t1, a, x);
+
+ //fprintf(stdout,(">>>>>>>>>>>After Enter vranlc (l.210)<<<<<<");
+ //fprintf(stdout,("2*nk = " + (2*nk));
+ //fprintf(stdout,("t1 = " + t1);
+ //fprintf(stdout,("a = " + a);
+ //fprintf(stdout,("x[0] = " + x[0]);
+ //fprintf(stdout,(">>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<");
+
+ //if (timers_enabled) timer_stop(3);
+ timer_stop(3,elapsed,start);
+
+/* Compute Gaussian deviates by acceptance-rejection method and
+ * tally counts in concentric square annuli. This loop is not
+ * vectorizable.
+ */
+ //if (timers_enabled) timer_start(2);
+ timer_start(&(start[2]));
+ for(i=1; i<=nk;i++) {
+ x1 = 2. * x[2*i-2] -1.0;
+ x2 = 2. * x[2*i-1] - 1.0;
+ t1 = x1*x1 + x2*x2;
+ if (t1 <= 1.) {
+ t2 = sqrt(-2. * log(t1) / t1);
+ t3 = (x1 * t2);
+ t4 = (x2 * t2);
+ l = (int)(abs(t3) > abs(t4) ? abs(t3) : abs(t4));
+ q[l] = q[l] + 1.;
+ sx = sx + t3;
+ sy = sy + t4;
+ }
+ /*
+ if(i == 1) {
+ fprintf(stdout,"x1 = " + x1);
+ fprintf(stdout,"x2 = " + x2);
+ fprintf(stdout,"t1 = " + t1);
+ fprintf(stdout,"t2 = " + t2);
+ fprintf(stdout,"t3 = " + t3);
+ fprintf(stdout,"t4 = " + t4);
+ fprintf(stdout,"l = " + l);
+ fprintf(stdout,"q[l] = " + q[l]);
+ fprintf(stdout,"sx = " + sx);
+ fprintf(stdout,"sy = " + sy);
+ }
+ */
+ }
+ //if (timers_enabled) timer_stop(2);
+ timer_stop(2,elapsed,start);
+ }
+
+ //int MPI_Allreduce(void *sbuf, void *rbuf, int count, MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
+ MPI_Allreduce(&sx, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+ sx = x[0]; //FIXME : x[0] or x[1] => x[0] because fortran starts with 1
+ MPI_Allreduce(&sy, x, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+ sy = x[0];
+ MPI_Allreduce(q, x, nq, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+
+ for(i = 0; i < nq; i++) {
+ q[i] = x[i];
+ }
+ for(i = 0; i < nq; i++) {
+ gc += q[i];
+ }
+
+ timer_stop(1,elapsed,start);
+ tm = timer_read(1,elapsed);
+ MPI_Allreduce(&tm, x, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+ tm = x[0];
+
+ if(node == root) {
+ nit = 0;
+ verified = true;
+
+ if(m == 24) {
+ sx_verify_value = -3.247834652034740E3;
+ sy_verify_value = -6.958407078382297E3;
+ } else if(m == 25) {
+ sx_verify_value = -2.863319731645753E3;
+ sy_verify_value = -6.320053679109499E3;
+ } else if(m == 28) {
+ sx_verify_value = -4.295875165629892E3;
+ sy_verify_value = -1.580732573678431E4;
+ } else if(m == 30) {
+ sx_verify_value = 4.033815542441498E4;
+ sy_verify_value = -2.660669192809235E4;
+ } else if(m == 32) {
+ sx_verify_value = 4.764367927995374E4;
+ sy_verify_value = -8.084072988043731E4;
+ } else if(m == 36) {
+ sx_verify_value = 1.982481200946593E5;
+ sy_verify_value = -1.020596636361769E5;
+ } else {
+ verified = false;
+ }
+
+ /*
+ fprintf(stdout,("sx = " + sx);
+ fprintf(stdout,("sx_verify = " + sx_verify_value);
+ fprintf(stdout,("sy = " + sy);
+ fprintf(stdout,("sy_verify = " + sy_verify_value);
+ */
+ if(verified) {
+ sx_err = abs((sx - sx_verify_value)/sx_verify_value);
+ sy_err = abs((sy - sy_verify_value)/sy_verify_value);
+ /*
+ fprintf(stdout,("sx_err = " + sx_err);
+ fprintf(stdout,("sy_err = " + sx_err);
+ fprintf(stdout,("epsilon= " + epsilon);
+ */
+ verified = ((sx_err < epsilon) && (sy_err < epsilon));
+ }
+
+ Mops = (pow(2.0, m+1))/tm/1000;
+
+ fprintf(stdout,"EP Benchmark Results:\n");
+ fprintf(stdout,"CPU Time=%d\n",tm);
+ fprintf(stdout,"N = 2^%d\n",m);
+ fprintf(stdout,"No. Gaussain Pairs =%d\n",gc);
+ fprintf(stdout,"Sum = %lf %ld\n",sx,sy);
+ fprintf(stdout,"Count:");
+ for(i = 0; i < nq; i++) {
+ fprintf(stdout,"%d\t %ld\n",i,q[i]);
+ }
+
+ /*
+ print_results("EP", _class, m+1, 0, 0, nit, npm, no_nodes, tm, Mops,
+ "Random numbers generated", verified, npbversion,
+ compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) */
+ fprintf(stdout,"\nEP Benchmark Completed\n");
+ fprintf(stdout,"Class = %s\n", _class);
+ fprintf(stdout,"Size = %s\n", size);
+ fprintf(stdout,"Iteration = %d\n", nit);
+ fprintf(stdout,"Time in seconds = %lf\n",(tm/1000));
+ fprintf(stdout,"Total processes = %d\n",no_nodes);
+ fprintf(stdout,"Mops/s total = %lf\n",Mops);
+ fprintf(stdout,"Mops/s/process = %lf\n", Mops/no_nodes);
+ fprintf(stdout,"Operation type = Random number generated\n");
+ if(verified) {
+ fprintf(stdout,"Verification = SUCCESSFUL\n");
+ } else {
+ fprintf(stdout,"Verification = UNSUCCESSFUL\n");
+ }
+ fprintf(stdout,"Total time: %lf\n",(timer_read(1,elapsed)/1000));
+ fprintf(stdout,"Gaussian pairs: %lf\n",(timer_read(2,elapsed)/1000));
+ fprintf(stdout,"Random numbers: %lf\n",(timer_read(3,elapsed)/1000));
+ }
+#ifdef USE_MPE
+ MPE_Finish_log(argv[0]);
+#endif
+
+ MPI_Finalize();
+ }
+
+ int main(int argc, char **argv) {
+ doTest(argc,argv);
+ }
--- /dev/null
+!-------------------------------------------------------------------------!
+! !
+! N A S P A R A L L E L B E N C H M A R K S 3.3 !
+! !
+! E P !
+! !
+!-------------------------------------------------------------------------!
+! !
+! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
+! It is described in NAS Technical Reports 95-020 and 02-007 !
+! !
+! Permission to use, copy, distribute and modify this software !
+! for any purpose with or without fee is hereby granted. We !
+! request, however, that all derived work reference the NAS !
+! Parallel Benchmarks 3.3. This software is provided "as is" !
+! without express or implied warranty. !
+! !
+! Information on NPB 3.3, including the technical report, the !
+! original specifications, source code, results and information !
+! on how to submit new results, is available at: !
+! !
+! http://www.nas.nasa.gov/Software/NPB/ !
+! !
+! Send comments or suggestions to npb@nas.nasa.gov !
+! !
+! NAS Parallel Benchmarks Group !
+! NASA Ames Research Center !
+! Mail Stop: T27A-1 !
+! Moffett Field, CA 94035-1000 !
+! !
+! E-mail: npb@nas.nasa.gov !
+! Fax: (650) 604-3957 !
+! !
+!-------------------------------------------------------------------------!
+
+
+c---------------------------------------------------------------------
+c
+c Authors: P. O. Frederickson
+c D. H. Bailey
+c A. C. Woo
+c R. F. Van der Wijngaart
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+ program EMBAR
+c---------------------------------------------------------------------
+C
+c This is the MPI version of the APP Benchmark 1,
+c the "embarassingly parallel" benchmark.
+c
+c
+c M is the Log_2 of the number of complex pairs of uniform (0, 1) random
+c numbers. MK is the Log_2 of the size of each batch of uniform random
+c numbers. MK can be set for convenience on a given system, since it does
+c not affect the results.
+
+ implicit none
+
+ include 'npbparams.h'
+ include 'mpinpb.h'
+
+ double precision Mops, epsilon, a, s, t1, t2, t3, t4, x, x1,
+ > x2, q, sx, sy, tm, an, tt, gc, dum(3),
+ > timer_read
+ double precision sx_verify_value, sy_verify_value, sx_err, sy_err
+ integer mk, mm, nn, nk, nq, np, ierr, node, no_nodes,
+ > i, ik, kk, l, k, nit, ierrcode, no_large_nodes,
+ > np_add, k_offset, j
+ logical verified, timers_enabled
+ parameter (timers_enabled = .false.)
+ external randlc, timer_read
+ double precision randlc, qq
+ character*15 size
+
+ parameter (mk = 16, mm = m - mk, nn = 2 ** mm,
+ > nk = 2 ** mk, nq = 10, epsilon=1.d-8,
+ > a = 1220703125.d0, s = 271828183.d0)
+
+ common/storage/ x(2*nk), q(0:nq-1), qq(10000)
+ data dum /1.d0, 1.d0, 1.d0/
+
+ call mpi_init(ierr)
+ call mpi_comm_rank(MPI_COMM_WORLD,node,ierr)
+ call mpi_comm_size(MPI_COMM_WORLD,no_nodes,ierr)
+
+ root = 0
+
+ if (.not. convertdouble) then
+ dp_type = MPI_DOUBLE_PRECISION
+ else
+ dp_type = MPI_REAL
+ endif
+
+ if (node.eq.root) then
+
+c Because the size of the problem is too large to store in a 32-bit
+c integer for some classes, we put it into a string (for printing).
+c Have to strip off the decimal point put in there by the floating
+c point print statement (internal file)
+
+ write(*, 1000)
+ write(size, '(f15.0)' ) 2.d0**(m+1)
+ j = 15
+ if (size(j:j) .eq. '.') j = j - 1
+ write (*,1001) size(1:j)
+ write(*, 1003) no_nodes
+
+ 1000 format(/,' NAS Parallel Benchmarks 3.3 -- EP Benchmark',/)
+ 1001 format(' Number of random numbers generated: ', a15)
+ 1003 format(' Number of active processes: ', 2x, i13, /)
+
+ endif
+
+ verified = .false.
+
+c Compute the number of "batches" of random number pairs generated
+c per processor. Adjust if the number of processors does not evenly
+c divide the total number
+
+ np = nn / no_nodes
+ no_large_nodes = mod(nn, no_nodes)
+ if (node .lt. no_large_nodes) then
+ np_add = 1
+ else
+ np_add = 0
+ endif
+ np = np + np_add
+
+ if (np .eq. 0) then
+ write (6, 1) no_nodes, nn
+ 1 format ('Too many nodes:',2i6)
+ call mpi_abort(MPI_COMM_WORLD,ierrcode,ierr)
+ stop
+ endif
+
+c Call the random number generator functions and initialize
+c the x-array to reduce the effects of paging on the timings.
+c Also, call all mathematical functions that are used. Make
+c sure these initializations cannot be eliminated as dead code.
+
+ call vranlc(0, dum(1), dum(2), dum(3))
+ dum(1) = randlc(dum(2), dum(3))
+ do 5 i = 1, 2*nk
+ x(i) = -1.d99
+ 5 continue
+ Mops = log(sqrt(abs(max(1.d0,1.d0))))
+
+c---------------------------------------------------------------------
+c Synchronize before placing time stamp
+c---------------------------------------------------------------------
+ call mpi_barrier(MPI_COMM_WORLD, ierr)
+
+ call timer_clear(1)
+ call timer_clear(2)
+ call timer_clear(3)
+ call timer_start(1)
+
+ t1 = a
+ call vranlc(0, t1, a, x)
+
+c Compute AN = A ^ (2 * NK) (mod 2^46).
+
+ t1 = a
+
+ do 100 i = 1, mk + 1
+ t2 = randlc(t1, t1)
+ 100 continue
+
+ an = t1
+ tt = s
+ gc = 0.d0
+ sx = 0.d0
+ sy = 0.d0
+
+ do 110 i = 0, nq - 1
+ q(i) = 0.d0
+ 110 continue
+
+c Each instance of this loop may be performed independently. We compute
+c the k offsets separately to take into account the fact that some nodes
+c have more numbers to generate than others
+
+ if (np_add .eq. 1) then
+ k_offset = node * np -1
+ else
+ k_offset = no_large_nodes*(np+1) + (node-no_large_nodes)*np -1
+ endif
+
+ do 150 k = 1, np
+ kk = k_offset + k
+ t1 = s
+ t2 = an
+
+c Find starting seed t1 for this kk.
+
+ do 120 i = 1, 100
+ ik = kk / 2
+ if (2 * ik .ne. kk) t3 = randlc(t1, t2)
+ if (ik .eq. 0) goto 130
+ t3 = randlc(t2, t2)
+ kk = ik
+ 120 continue
+
+c Compute uniform pseudorandom numbers.
+ 130 continue
+
+ if (timers_enabled) call timer_start(3)
+ call vranlc(2 * nk, t1, a, x)
+ if (timers_enabled) call timer_stop(3)
+
+c Compute Gaussian deviates by acceptance-rejection method and
+c tally counts in concentric square annuli. This loop is not
+c vectorizable.
+
+ if (timers_enabled) call timer_start(2)
+
+ do 140 i = 1, nk
+ x1 = 2.d0 * x(2*i-1) - 1.d0
+ x2 = 2.d0 * x(2*i) - 1.d0
+ t1 = x1 ** 2 + x2 ** 2
+ if (t1 .le. 1.d0) then
+ t2 = sqrt(-2.d0 * log(t1) / t1)
+ t3 = (x1 * t2)
+ t4 = (x2 * t2)
+ l = max(abs(t3), abs(t4))
+ q(l) = q(l) + 1.d0
+ sx = sx + t3
+ sy = sy + t4
+ endif
+ 140 continue
+
+ if (timers_enabled) call timer_stop(2)
+
+ 150 continue
+
+ call mpi_allreduce(sx, x, 1, dp_type,
+ > MPI_SUM, MPI_COMM_WORLD, ierr)
+ sx = x(1)
+ call mpi_allreduce(sy, x, 1, dp_type,
+ > MPI_SUM, MPI_COMM_WORLD, ierr)
+ sy = x(1)
+ call mpi_allreduce(q, x, nq, dp_type,
+ > MPI_SUM, MPI_COMM_WORLD, ierr)
+
+ do i = 1, nq
+ q(i-1) = x(i)
+ enddo
+
+ do 160 i = 0, nq - 1
+ gc = gc + q(i)
+ 160 continue
+
+ call timer_stop(1)
+ tm = timer_read(1)
+
+ call mpi_allreduce(tm, x, 1, dp_type,
+ > MPI_MAX, MPI_COMM_WORLD, ierr)
+ tm = x(1)
+
+ if (node.eq.root) then
+ nit=0
+ verified = .true.
+ if (m.eq.24) then
+ sx_verify_value = -3.247834652034740D+3
+ sy_verify_value = -6.958407078382297D+3
+ elseif (m.eq.25) then
+ sx_verify_value = -2.863319731645753D+3
+ sy_verify_value = -6.320053679109499D+3
+ elseif (m.eq.28) then
+ sx_verify_value = -4.295875165629892D+3
+ sy_verify_value = -1.580732573678431D+4
+ elseif (m.eq.30) then
+ sx_verify_value = 4.033815542441498D+4
+ sy_verify_value = -2.660669192809235D+4
+ elseif (m.eq.32) then
+ sx_verify_value = 4.764367927995374D+4
+ sy_verify_value = -8.084072988043731D+4
+ elseif (m.eq.36) then
+ sx_verify_value = 1.982481200946593D+5
+ sy_verify_value = -1.020596636361769D+5
+ elseif (m.eq.40) then
+ sx_verify_value = -5.319717441530D+05
+ sy_verify_value = -3.688834557731D+05
+ else
+ verified = .false.
+ endif
+ if (verified) then
+ sx_err = abs((sx - sx_verify_value)/sx_verify_value)
+ sy_err = abs((sy - sy_verify_value)/sy_verify_value)
+ verified = ((sx_err.le.epsilon) .and. (sy_err.le.epsilon))
+ endif
+ Mops = 2.d0**(m+1)/tm/1000000.d0
+
+ write (6,11) tm, m, gc, sx, sy, (i, q(i), i = 0, nq - 1)
+ 11 format ('EP Benchmark Results:'//'CPU Time =',f10.4/'N = 2^',
+ > i5/'No. Gaussian Pairs =',f15.0/'Sums = ',1p,2d25.15/
+ > 'Counts:'/(i3,0p,f15.0))
+
+ call print_results('EP', class, m+1, 0, 0, nit, npm,
+ > no_nodes, tm, Mops,
+ > 'Random numbers generated',
+ > verified, npbversion, compiletime, cs1,
+ > cs2, cs3, cs4, cs5, cs6, cs7)
+
+ endif
+
+ if (timers_enabled .and. (node .eq. root)) then
+ print *, 'Total time: ', timer_read(1)
+ print *, 'Gaussian pairs: ', timer_read(2)
+ print *, 'Random numbers: ', timer_read(3)
+ endif
+
+ call mpi_finalize(ierr)
+
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'mpif.h'
+
+ integer me, nprocs, root, dp_type
+ common /mpistuff/ me, nprocs, root, dp_type
+
--- /dev/null
+
+/*
+ * FUNCTION RANDLC (X, A)
+ *
+ * This routine returns a uniform pseudorandom double precision number in the
+ * range (0, 1) by using the linear congruential generator
+ *
+ * x_{k+1} = a x_k (mod 2^46)
+ *
+ * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
+ * before repeating. The argument A is the same as 'a' in the above formula,
+ * and X is the same as x_0. A and X must be odd double precision integers
+ * in the range (1, 2^46). The returned value RANDLC is normalized to be
+ * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
+ * the new seed x_1, so that subsequent calls to RANDLC using the same
+ * arguments will generate a continuous sequence.
+ *
+ * This routine should produce the same results on any computer with at least
+ * 48 mantissa bits in double precision floating point data. On Cray systems,
+ * double precision should be disabled.
+ *
+ * David H. Bailey October 26, 1990
+ *
+ * IMPLICIT DOUBLE PRECISION (A-H, O-Z)
+ * SAVE KS, R23, R46, T23, T46
+ * DATA KS/0/
+ *
+ * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
+ * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than
+ * by merely using the ** operator, in order to insure that the results are
+ * exact on all systems. This code assumes that 0.5D0 is represented exactly.
+ */
+
+
+/*****************************************************************/
+/************* R A N D L C ************/
+/************* ************/
+/************* portable random number generator ************/
+/*****************************************************************/
+
+double randlc( double *X, double *A )
+{
+ static int KS=0;
+ static double R23, R46, T23, T46;
+ double T1, T2, T3, T4;
+ double A1;
+ double A2;
+ double X1;
+ double X2;
+ double Z;
+ int i, j;
+
+ if (KS == 0)
+ {
+ R23 = 1.0;
+ R46 = 1.0;
+ T23 = 1.0;
+ T46 = 1.0;
+
+ for (i=1; i<=23; i++)
+ {
+ R23 = 0.50 * R23;
+ T23 = 2.0 * T23;
+ }
+ for (i=1; i<=46; i++)
+ {
+ R46 = 0.50 * R46;
+ T46 = 2.0 * T46;
+ }
+ KS = 1;
+ }
+
+/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */
+
+ T1 = R23 * *A;
+ j = T1;
+ A1 = j;
+ A2 = *A - T23 * A1;
+
+/* Break X into two parts such that X = 2^23 * X1 + X2, compute
+ Z = A1 * X2 + A2 * X1 (mod 2^23), and then
+ X = 2^23 * Z + A2 * X2 (mod 2^46). */
+
+ T1 = R23 * *X;
+ j = T1;
+ X1 = j;
+ X2 = *X - T23 * X1;
+ T1 = A1 * X2 + A2 * X1;
+
+ j = R23 * T1;
+ T2 = j;
+ Z = T1 - T23 * T2;
+ T3 = T23 * Z + A2 * X2;
+ j = R46 * T3;
+ T4 = j;
+ *X = T3 - T46 * T4;
+ return(R46 * *X);
+}
+
+
+
+/*****************************************************************/
+/************ F I N D _ M Y _ S E E D ************/
+/************ ************/
+/************ returns parallel random number seq seed ************/
+/*****************************************************************/
+
--- /dev/null
+
+double randlc( double *X, double *A );
+
--- /dev/null
+SHELL=/bin/sh
+BENCHMARK=ft
+BENCHMARKU=FT
+
+include ../config/make.def
+
+include ../sys/make.common
+
+OBJS = ft.o ${COMMON}/${RAND}.o ${COMMON}/print_results.o ${COMMON}/timers.o
+
+${PROGRAM}: config ${OBJS}
+ ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+
+
+
+.f.o:
+ ${FCOMPILE} $<
+
+ft.o: ft.f global.h mpinpb.h npbparams.h
+
+clean:
+ - rm -f *.o *~ mputil*
+ - rm -f ft npbparams.h core
--- /dev/null
+This code implements the time integration of a three-dimensional
+partial differential equation using the Fast Fourier Transform.
+Some of the dimension statements are not F77 conforming and will
+not work using the g77 compiler. All dimension statements,
+however, are legal F90.
\ No newline at end of file
--- /dev/null
+!-------------------------------------------------------------------------!
+! !
+! N A S P A R A L L E L B E N C H M A R K S 3.3 !
+! !
+! F T !
+! !
+!-------------------------------------------------------------------------!
+! !
+! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
+! It is described in NAS Technical Reports 95-020 and 02-007 !
+! !
+! Permission to use, copy, distribute and modify this software !
+! for any purpose with or without fee is hereby granted. We !
+! request, however, that all derived work reference the NAS !
+! Parallel Benchmarks 3.3. This software is provided "as is" !
+! without express or implied warranty. !
+! !
+! Information on NPB 3.3, including the technical report, the !
+! original specifications, source code, results and information !
+! on how to submit new results, is available at: !
+! !
+! http://www.nas.nasa.gov/Software/NPB/ !
+! !
+! Send comments or suggestions to npb@nas.nasa.gov !
+! !
+! NAS Parallel Benchmarks Group !
+! NASA Ames Research Center !
+! Mail Stop: T27A-1 !
+! Moffett Field, CA 94035-1000 !
+! !
+! E-mail: npb@nas.nasa.gov !
+! Fax: (650) 604-3957 !
+! !
+!-------------------------------------------------------------------------!
+
+!TO REDUCE THE AMOUNT OF MEMORY REQUIRED BY THE BENCHMARK WE NO LONGER
+!STORE THE ENTIRE TIME EVOLUTION ARRAY "EX" FOR ALL TIME STEPS, BUT
+!JUST FOR THE FIRST. ALSO, IT IS STORED ONLY FOR THE PART OF THE GRID
+!FOR WHICH THE CALLING PROCESSOR IS RESPONSIBLE, SO THAT THE MEMORY
+!USAGE BECOMES SCALABLE. THIS NEW ARRAY IS CALLED "TWIDDLE" (SEE
+!NPB3.0-SER)
+
+!TO AVOID PROBLEMS WITH VERY LARGE ARRAY SIZES THAT ARE COMPUTED BY
+!MULTIPLYING GRID DIMENSIONS (CAUSING INTEGER OVERFLOW IN THE VARIABLE
+!NTOTAL) AND SUBSEQUENTLY DIVIDING BY THE NUMBER OF PROCESSORS, WE
+!COMPUTE THE SIZE OF ARRAY PARTITIONS MORE CONSERVATIVELY AS
+!((NX*NY)/NP)*NZ, WHERE NX, NY, AND NZ ARE GRID DIMENSIONS AND NP IS
+!THE NUMBER OF PROCESSORS, THE RESULT IS STORED IN "NTDIVNP". FOR THE
+!PERFORMANCE CALCULATION WE STORE THE TOTAL NUMBER OF GRID POINTS IN A
+!FLOATING POINT NUMBER "NTOTAL_F" INSTEAD OF AN INTEGER.
+!THIS FIX WILL FAIL IF THE NUMBER OF PROCESSORS IS SMALL.
+
+!UGLY HACK OF SUBROUTINE IPOW46: FOR VERY LARGE GRIDS THE SINGLE EXPONENT
+!FROM NPB2.3 MAY NOT FIT IN A 32-BIT INTEGER. HOWEVER, WE KNOW THAT THE
+!"EXPONENT" ARGUMENT OF THIS ROUTINE CAN ALWAYS BE FACTORED INTO A TERM
+!DIVISIBLE BY NX (EXP_1) AND ANOTHER TERM (EXP_2). NX IS USUALLY A POWER
+!OF TWO, SO WE CAN KEEP HALVING IT UNTIL THE PRODUCT OF EXP_1
+!AND EXP_2 IS SMALL ENOUGH (NAMELY EXP_2 ITSELF). THIS UPDATED VERSION
+!OF IPWO46, WHICH NOW TAKES THE TWO FACTORS OF "EXPONENT" AS SEPARATE
+!ARGUMENTS, MAY BREAK DOWN IF EXP_1 DOES NOT CONTAIN A LARGE POWER OF TWO.
+
+c---------------------------------------------------------------------
+c
+c Authors: D. Bailey
+c W. Saphir
+c R. F. Van der Wijngaart
+c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c FT benchmark
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ program ft
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpif.h'
+ include 'global.h'
+ integer i, ierr
+
+c---------------------------------------------------------------------
+c u0, u1, u2 are the main arrays in the problem.
+c Depending on the decomposition, these arrays will have different
+c dimensions. To accomodate all possibilities, we allocate them as
+c one-dimensional arrays and pass them to subroutines for different
+c views
+c - u0 contains the initial (transformed) initial condition
+c - u1 and u2 are working arrays
+c---------------------------------------------------------------------
+
+ double complex u0(ntdivnp),
+ > u1(ntdivnp),
+ > u2(ntdivnp)
+ double precision twiddle(ntdivnp)
+c---------------------------------------------------------------------
+c Large arrays are in common so that they are allocated on the
+c heap rather than the stack. This common block is not
+c referenced directly anywhere else. Padding is to avoid accidental
+c cache problems, since all array sizes are powers of two.
+c---------------------------------------------------------------------
+
+ double complex pad1(3), pad2(3), pad3(3)
+ common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
+
+ integer iter
+ double precision total_time, mflops
+ logical verified
+ character class
+
+ call MPI_Init(ierr)
+
+c---------------------------------------------------------------------
+c Run the entire problem once to make sure all data is touched.
+c This reduces variable startup costs, which is important for such a
+c short benchmark. The other NPB 2 implementations are similar.
+c---------------------------------------------------------------------
+ do i = 1, t_max
+ call timer_clear(i)
+ end do
+
+ call setup()
+ call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
+ call compute_initial_conditions(u1, dims(1,1), dims(2,1),
+ > dims(3,1))
+ call fft_init (dims(1,1))
+ call fft(1, u1, u0)
+
+c---------------------------------------------------------------------
+c Start over from the beginning. Note that all operations must
+c be timed, in contrast to other benchmarks.
+c---------------------------------------------------------------------
+ do i = 1, t_max
+ call timer_clear(i)
+ end do
+ call MPI_Barrier(MPI_COMM_WORLD, ierr)
+
+ call timer_start(T_total)
+ if (timers_enabled) call timer_start(T_setup)
+
+ call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
+ call compute_initial_conditions(u1, dims(1,1), dims(2,1),
+ > dims(3,1))
+ call fft_init (dims(1,1))
+
+ if (timers_enabled) call synchup()
+ if (timers_enabled) call timer_stop(T_setup)
+
+ if (timers_enabled) call timer_start(T_fft)
+ call fft(1, u1, u0)
+ if (timers_enabled) call timer_stop(T_fft)
+
+ do iter = 1, niter
+ if (timers_enabled) call timer_start(T_evolve)
+ call evolve(u0, u1, twiddle, dims(1,1), dims(2,1), dims(3,1))
+ if (timers_enabled) call timer_stop(T_evolve)
+ if (timers_enabled) call timer_start(T_fft)
+ call fft(-1, u1, u2)
+ if (timers_enabled) call timer_stop(T_fft)
+ if (timers_enabled) call synchup()
+ if (timers_enabled) call timer_start(T_checksum)
+ call checksum(iter, u2, dims(1,1), dims(2,1), dims(3,1))
+ if (timers_enabled) call timer_stop(T_checksum)
+ end do
+
+ call verify(nx, ny, nz, niter, verified, class)
+ call timer_stop(t_total)
+ if (np .ne. np_min) verified = .false.
+ total_time = timer_read(t_total)
+
+ if( total_time .ne. 0. ) then
+ mflops = 1.0d-6*ntotal_f *
+ > (14.8157+7.19641*log(ntotal_f)
+ > + (5.23518+7.21113*log(ntotal_f))*niter)
+ > /total_time
+ else
+ mflops = 0.0
+ endif
+ if (me .eq. 0) then
+ call print_results('FT', class, nx, ny, nz, niter, np_min, np,
+ > total_time, mflops, ' floating point', verified,
+ > npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+ endif
+ if (timers_enabled) call print_timers()
+ call MPI_Finalize(ierr)
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine evolve(u0, u1, twiddle, d1, d2, d3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c evolve u0 -> u1 (t time steps) in fourier space
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ integer d1, d2, d3
+ double precision exi
+ double complex u0(d1,d2,d3)
+ double complex u1(d1,d2,d3)
+ double precision twiddle(d1,d2,d3)
+ integer i, j, k
+
+ do k = 1, d3
+ do j = 1, d2
+ do i = 1, d1
+ u0(i,j,k) = u0(i,j,k)*(twiddle(i,j,k))
+ u1(i,j,k) = u0(i,j,k)
+ end do
+ end do
+ end do
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine compute_initial_conditions(u0, d1, d2, d3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Fill in array u0 with initial conditions from
+c random number generator
+c---------------------------------------------------------------------
+ implicit none
+ include 'global.h'
+ integer d1, d2, d3
+ double complex u0(d1, d2, d3)
+ integer k
+ double precision x0, start, an, dummy
+
+c---------------------------------------------------------------------
+c 0-D and 1-D layouts are easy because each processor gets a contiguous
+c chunk of the array, in the Fortran ordering sense.
+c For a 2-D layout, it's a bit more complicated. We always
+c have entire x-lines (contiguous) in processor.
+c We can do ny/np1 of them at a time since we have
+c ny/np1 contiguous in y-direction. But then we jump
+c by z-planes (nz/np2 of them, total).
+c For the 0-D and 1-D layouts we could do larger chunks, but
+c this turns out to have no measurable impact on performance.
+c---------------------------------------------------------------------
+
+
+ start = seed
+c---------------------------------------------------------------------
+c Jump to the starting element for our first plane.
+c---------------------------------------------------------------------
+ call ipow46(a, 2*nx, (zstart(1)-1)*ny + (ystart(1)-1), an)
+ dummy = randlc(start, an)
+ call ipow46(a, 2*nx, ny, an)
+
+c---------------------------------------------------------------------
+c Go through by z planes filling in one square at a time.
+c---------------------------------------------------------------------
+ do k = 1, dims(3, 1) ! nz/np2
+ x0 = start
+ call vranlc(2*nx*dims(2, 1), x0, a, u0(1, 1, k))
+ if (k .ne. dims(3, 1)) dummy = randlc(start, an)
+ end do
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine ipow46(a, exp_1, exp_2, result)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute a^exponent mod 2^46
+c---------------------------------------------------------------------
+
+ implicit none
+ double precision a, result, dummy, q, r
+ integer exp_1, exp_2, n, n2, ierr
+ external randlc
+ double precision randlc
+ logical two_pow
+c---------------------------------------------------------------------
+c Use
+c a^n = a^(n/2)*a^(n/2) if n even else
+c a^n = a*a^(n-1) if n odd
+c---------------------------------------------------------------------
+ result = 1
+ if (exp_2 .eq. 0 .or. exp_1 .eq. 0) return
+ q = a
+ r = 1
+ n = exp_1
+ two_pow = .true.
+
+ do while (two_pow)
+ n2 = n/2
+ if (n2 * 2 .eq. n) then
+ dummy = randlc(q, q)
+ n = n2
+ else
+ n = n * exp_2
+ two_pow = .false.
+ endif
+ end do
+
+ do while (n .gt. 1)
+ n2 = n/2
+ if (n2 * 2 .eq. n) then
+ dummy = randlc(q, q)
+ n = n2
+ else
+ dummy = randlc(r, q)
+ n = n-1
+ endif
+ end do
+ dummy = randlc(r, q)
+ result = r
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine setup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'mpinpb.h'
+ include 'global.h'
+
+ integer ierr, i, j, fstatus
+ debug = .FALSE.
+
+ call MPI_Comm_size(MPI_COMM_WORLD, np, ierr)
+ call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
+
+ if (.not. convertdouble) then
+ dc_type = MPI_DOUBLE_COMPLEX
+ else
+ dc_type = MPI_COMPLEX
+ endif
+
+
+ if (me .eq. 0) then
+ write(*, 1000)
+ open (unit=2,file='inputft.data',status='old', iostat=fstatus)
+
+ if (fstatus .eq. 0) then
+ write(*,233)
+ 233 format(' Reading from input file inputft.data')
+ read (2,*) niter
+ read (2,*) layout_type
+ read (2,*) np1, np2
+ close(2)
+
+c---------------------------------------------------------------------
+c check to make sure input data is consistent
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c 1. product of processor grid dims must equal number of processors
+c---------------------------------------------------------------------
+
+ if (np1 * np2 .ne. np) then
+ write(*, 238)
+ 238 format(' np1 and np2 given in input file are not valid.')
+ write(*, 239) np1*np2, np
+ 239 format(' Product is ', i5, ' and should be ', i5)
+ call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
+ endif
+
+c---------------------------------------------------------------------
+c 2. layout type must be valid
+c---------------------------------------------------------------------
+
+ if (layout_type .ne. layout_0D .and.
+ > layout_type .ne. layout_1D .and.
+ > layout_type .ne. layout_2D) then
+ write(*, 240)
+ 240 format(' Layout type specified in inputft.data is
+ > invalid ')
+ call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
+ endif
+
+c---------------------------------------------------------------------
+c 3. 0D layout must be 1x1 grid
+c---------------------------------------------------------------------
+
+ if (layout_type .eq. layout_0D .and.
+ > (np1 .ne.1 .or. np2 .ne. 1)) then
+ write(*, 241)
+ 241 format(' For 0D layout, both np1 and np2 must be 1 ')
+ call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
+ endif
+c---------------------------------------------------------------------
+c 4. 1D layout must be 1xN grid
+c---------------------------------------------------------------------
+
+ if (layout_type .eq. layout_1D .and. np1 .ne. 1) then
+ write(*, 242)
+ 242 format(' For 1D layout, np1 must be 1 ')
+ call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
+ endif
+
+ else
+ write(*,234)
+ niter = niter_default
+ if (np .eq. 1) then
+ np1 = 1
+ np2 = 1
+ layout_type = layout_0D
+ else if (np .le. nz) then
+ np1 = 1
+ np2 = np
+ layout_type = layout_1D
+ else
+ np1 = nz
+ np2 = np/nz
+ layout_type = layout_2D
+ endif
+ endif
+
+ if (np .lt. np_min) then
+ write(*, 10) np_min
+ 10 format(' Error: Compiled for ', I5, ' processors. ')
+ write(*, 11) np
+ 11 format(' Only ', i5, ' processors found ')
+ call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
+ endif
+
+ 234 format(' No input file inputft.data. Using compiled defaults')
+ write(*, 1001) nx, ny, nz
+ write(*, 1002) niter
+ write(*, 1004) np
+ write(*, 1005) np1, np2
+ if (np .ne. np_min) write(*, 1006) np_min
+
+ if (layout_type .eq. layout_0D) then
+ write(*, 1010) '0D'
+ else if (layout_type .eq. layout_1D) then
+ write(*, 1010) '1D'
+ else
+ write(*, 1010) '2D'
+ endif
+
+ 1000 format(//,' NAS Parallel Benchmarks 3.3 -- FT Benchmark',/)
+ 1001 format(' Size : ', i4, 'x', i4, 'x', i4)
+ 1002 format(' Iterations : ', 7x, i7)
+ 1004 format(' Number of processes : ', 7x, i7)
+ 1005 format(' Processor array : ', 5x, i4, 'x', i4)
+ 1006 format(' WARNING: compiled for ', i5, ' processes. ',
+ > ' Will not verify. ')
+ 1010 format(' Layout type : ', 9x, A5)
+ endif
+
+
+c---------------------------------------------------------------------
+c Since np1, np2 and layout_type are in a common block,
+c this sends all three.
+c---------------------------------------------------------------------
+ call MPI_BCAST(np1, 3, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+ call MPI_BCAST(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
+
+ if (np1 .eq. 1 .and. np2 .eq. 1) then
+ layout_type = layout_0D
+ else if (np1 .eq. 1) then
+ layout_type = layout_1D
+ else
+ layout_type = layout_2D
+ endif
+
+ if (layout_type .eq. layout_0D) then
+ do i = 1, 3
+ dims(1, i) = nx
+ dims(2, i) = ny
+ dims(3, i) = nz
+ end do
+ else if (layout_type .eq. layout_1D) then
+ dims(1, 1) = nx
+ dims(2, 1) = ny
+ dims(3, 1) = nz
+
+ dims(1, 2) = nx
+ dims(2, 2) = ny
+ dims(3, 2) = nz
+
+ dims(1, 3) = nz
+ dims(2, 3) = nx
+ dims(3, 3) = ny
+ else if (layout_type .eq. layout_2D) then
+ dims(1, 1) = nx
+ dims(2, 1) = ny
+ dims(3, 1) = nz
+
+ dims(1, 2) = ny
+ dims(2, 2) = nx
+ dims(3, 2) = nz
+
+ dims(1, 3) = nz
+ dims(2, 3) = nx
+ dims(3, 3) = ny
+
+ endif
+ do i = 1, 3
+ dims(2, i) = dims(2, i) / np1
+ dims(3, i) = dims(3, i) / np2
+ end do
+
+
+c---------------------------------------------------------------------
+c Determine processor coordinates of this processor
+c Processor grid is np1xnp2.
+c Arrays are always (n1, n2/np1, n3/np2)
+c Processor coords are zero-based.
+c---------------------------------------------------------------------
+ me2 = mod(me, np2) ! goes from 0...np2-1
+ me1 = me/np2 ! goes from 0...np1-1
+c---------------------------------------------------------------------
+c Communicators for rows/columns of processor grid.
+c commslice1 is communicator of all procs with same me1, ranked as me2
+c commslice2 is communicator of all procs with same me2, ranked as me1
+c mpi_comm_split(comm, color, key, ...)
+c---------------------------------------------------------------------
+ call MPI_Comm_split(MPI_COMM_WORLD, me1, me2, commslice1, ierr)
+ call MPI_Comm_split(MPI_COMM_WORLD, me2, me1, commslice2, ierr)
+ if (timers_enabled) call synchup()
+
+ if (debug) print *, 'proc coords: ', me, me1, me2
+
+c---------------------------------------------------------------------
+c Determine which section of the grid is owned by this
+c processor.
+c---------------------------------------------------------------------
+ if (layout_type .eq. layout_0d) then
+
+ do i = 1, 3
+ xstart(i) = 1
+ xend(i) = nx
+ ystart(i) = 1
+ yend(i) = ny
+ zstart(i) = 1
+ zend(i) = nz
+ end do
+
+ else if (layout_type .eq. layout_1d) then
+
+ xstart(1) = 1
+ xend(1) = nx
+ ystart(1) = 1
+ yend(1) = ny
+ zstart(1) = 1 + me2 * nz/np2
+ zend(1) = (me2+1) * nz/np2
+
+ xstart(2) = 1
+ xend(2) = nx
+ ystart(2) = 1
+ yend(2) = ny
+ zstart(2) = 1 + me2 * nz/np2
+ zend(2) = (me2+1) * nz/np2
+
+ xstart(3) = 1
+ xend(3) = nx
+ ystart(3) = 1 + me2 * ny/np2
+ yend(3) = (me2+1) * ny/np2
+ zstart(3) = 1
+ zend(3) = nz
+
+ else if (layout_type .eq. layout_2d) then
+
+ xstart(1) = 1
+ xend(1) = nx
+ ystart(1) = 1 + me1 * ny/np1
+ yend(1) = (me1+1) * ny/np1
+ zstart(1) = 1 + me2 * nz/np2
+ zend(1) = (me2+1) * nz/np2
+
+ xstart(2) = 1 + me1 * nx/np1
+ xend(2) = (me1+1)*nx/np1
+ ystart(2) = 1
+ yend(2) = ny
+ zstart(2) = zstart(1)
+ zend(2) = zend(1)
+
+ xstart(3) = xstart(2)
+ xend(3) = xend(2)
+ ystart(3) = 1 + me2 *ny/np2
+ yend(3) = (me2+1)*ny/np2
+ zstart(3) = 1
+ zend(3) = nz
+ endif
+
+c---------------------------------------------------------------------
+c Set up info for blocking of ffts and transposes. This improves
+c performance on cache-based systems. Blocking involves
+c working on a chunk of the problem at a time, taking chunks
+c along the first, second, or third dimension.
+c
+c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
+c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
+
+c Since 1st dim is always in processor, we'll assume it's long enough
+c (default blocking factor is 16 so min size for 1st dim is 16)
+c The only case we have to worry about is cffts1 in a 2d decomposition.
+c so the blocking factor should not be larger than the 2nd dimension.
+c---------------------------------------------------------------------
+
+ fftblock = fftblock_default
+ fftblockpad = fftblockpad_default
+
+ if (layout_type .eq. layout_2d) then
+ if (dims(2, 1) .lt. fftblock) fftblock = dims(2, 1)
+ if (dims(2, 2) .lt. fftblock) fftblock = dims(2, 2)
+ if (dims(2, 3) .lt. fftblock) fftblock = dims(2, 3)
+ endif
+
+ if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine compute_indexmap(twiddle, d1, d2, d3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
+c for time evolution exponent.
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'mpinpb.h'
+ include 'global.h'
+ integer d1, d2, d3
+ integer i, j, k, ii, ii2, jj, ij2, kk
+ double precision ap, twiddle(d1, d2, d3)
+
+c---------------------------------------------------------------------
+c this function is very different depending on whether
+c we are in the 0d, 1d or 2d layout. Compute separately.
+c basically we want to convert the fortran indices
+c 1 2 3 4 5 6 7 8
+c to
+c 0 1 2 3 -4 -3 -2 -1
+c The following magic formula does the trick:
+c mod(i-1+n/2, n) - n/2
+c---------------------------------------------------------------------
+
+ ap = - 4.d0 * alpha * pi *pi
+
+ if (layout_type .eq. layout_0d) then ! xyz layout
+ do i = 1, dims(1,3)
+ ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
+ ii2 = ii*ii
+ do j = 1, dims(2,3)
+ jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
+ ij2 = jj*jj+ii2
+ do k = 1, dims(3,3)
+ kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
+ twiddle(i,j,k) = dexp(ap*dfloat(kk*kk+ij2))
+ end do
+ end do
+ end do
+ else if (layout_type .eq. layout_1d) then ! zxy layout
+ do i = 1,dims(2,3)
+ ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
+ ii2 = ii*ii
+ do j = 1,dims(3,3)
+ jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
+ ij2 = jj*jj+ii2
+ do k = 1,dims(1,3)
+ kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
+ twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
+ end do
+ end do
+ end do
+ else if (layout_type .eq. layout_2d) then ! zxy layout
+ do i = 1,dims(2,3)
+ ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
+ ii2 = ii*ii
+ do j = 1, dims(3,3)
+ jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
+ ij2 = jj*jj+ii2
+ do k =1,dims(1,3)
+ kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
+ twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
+ end do
+ end do
+ end do
+ else
+ print *, ' Unknown layout type ', layout_type
+ stop
+ endif
+
+ return
+ end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine print_timers()
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ integer i
+ include 'global.h'
+ character*25 tstrings(T_max)
+ data tstrings / ' total ',
+ > ' setup ',
+ > ' fft ',
+ > ' evolve ',
+ > ' checksum ',
+ > ' fftlow ',
+ > ' fftcopy ',
+ > ' transpose ',
+ > ' transpose1_loc ',
+ > ' transpose1_glo ',
+ > ' transpose1_fin ',
+ > ' transpose2_loc ',
+ > ' transpose2_glo ',
+ > ' transpose2_fin ',
+ > ' sync ' /
+
+ if (me .ne. 0) return
+ do i = 1, t_max
+ if (timer_read(i) .ne. 0.0d0) then
+ write(*, 100) i, tstrings(i), timer_read(i)
+ endif
+ end do
+ 100 format(' timer ', i2, '(', A16, ') :', F10.6)
+ return
+ end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine fft(dir, x1, x2)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ integer dir
+ double complex x1(ntdivnp), x2(ntdivnp)
+
+ double complex scratch(fftblockpad_default*maxdim*2)
+
+c---------------------------------------------------------------------
+c note: args x1, x2 must be different arrays
+c note: args for cfftsx are (direction, layout, xin, xout, scratch)
+c xin/xout may be the same and it can be somewhat faster
+c if they are
+c note: args for transpose are (layout1, layout2, xin, xout)
+c xin/xout must be different
+c---------------------------------------------------------------------
+
+ if (dir .eq. 1) then
+ if (layout_type .eq. layout_0d) then
+ call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
+ > x1, x1, scratch)
+ call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
+ > x1, x1, scratch)
+ call cffts3(1, dims(1,3), dims(2,3), dims(3,3),
+ > x1, x2, scratch)
+ else if (layout_type .eq. layout_1d) then
+ call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
+ > x1, x1, scratch)
+ call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
+ > x1, x1, scratch)
+ if (timers_enabled) call timer_start(T_transpose)
+ call transpose_xy_z(2, 3, x1, x2)
+ if (timers_enabled) call timer_stop(T_transpose)
+ call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
+ > x2, x2, scratch)
+ else if (layout_type .eq. layout_2d) then
+ call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
+ > x1, x1, scratch)
+ if (timers_enabled) call timer_start(T_transpose)
+ call transpose_x_y(1, 2, x1, x2)
+ if (timers_enabled) call timer_stop(T_transpose)
+ call cffts1(1, dims(1,2), dims(2,2), dims(3,2),
+ > x2, x2, scratch)
+ if (timers_enabled) call timer_start(T_transpose)
+ call transpose_x_z(2, 3, x2, x1)
+ if (timers_enabled) call timer_stop(T_transpose)
+ call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
+ > x1, x2, scratch)
+ endif
+ else
+ if (layout_type .eq. layout_0d) then
+ call cffts3(-1, dims(1,3), dims(2,3), dims(3,3),
+ > x1, x1, scratch)
+ call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
+ > x1, x1, scratch)
+ call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
+ > x1, x2, scratch)
+ else if (layout_type .eq. layout_1d) then
+ call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
+ > x1, x1, scratch)
+ if (timers_enabled) call timer_start(T_transpose)
+ call transpose_x_yz(3, 2, x1, x2)
+ if (timers_enabled) call timer_stop(T_transpose)
+ call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
+ > x2, x2, scratch)
+ call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
+ > x2, x2, scratch)
+ else if (layout_type .eq. layout_2d) then
+ call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
+ > x1, x1, scratch)
+ if (timers_enabled) call timer_start(T_transpose)
+ call transpose_x_z(3, 2, x1, x2)
+ if (timers_enabled) call timer_stop(T_transpose)
+ call cffts1(-1, dims(1,2), dims(2,2), dims(3,2),
+ > x2, x2, scratch)
+ if (timers_enabled) call timer_start(T_transpose)
+ call transpose_x_y(2, 1, x2, x1)
+ if (timers_enabled) call timer_stop(T_transpose)
+ call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
+ > x1, x2, scratch)
+ endif
+ endif
+ return
+ end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine cffts1(is, d1, d2, d3, x, xout, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'global.h'
+ integer is, d1, d2, d3, logd1
+ double complex x(d1,d2,d3)
+ double complex xout(d1,d2,d3)
+ double complex y(fftblockpad, d1, 2)
+ integer i, j, k, jj
+
+ logd1 = ilog2(d1)
+
+ do k = 1, d3
+ do jj = 0, d2 - fftblock, fftblock
+ if (timers_enabled) call timer_start(T_fftcopy)
+ do j = 1, fftblock
+ do i = 1, d1
+ y(j,i,1) = x(i,j+jj,k)
+ enddo
+ enddo
+ if (timers_enabled) call timer_stop(T_fftcopy)
+
+ if (timers_enabled) call timer_start(T_fftlow)
+ call cfftz (is, logd1, d1, y, y(1,1,2))
+ if (timers_enabled) call timer_stop(T_fftlow)
+
+ if (timers_enabled) call timer_start(T_fftcopy)
+ do j = 1, fftblock
+ do i = 1, d1
+ xout(i,j+jj,k) = y(j,i,1)
+ enddo
+ enddo
+ if (timers_enabled) call timer_stop(T_fftcopy)
+ enddo
+ enddo
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine cffts2(is, d1, d2, d3, x, xout, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'global.h'
+ integer is, d1, d2, d3, logd2
+ double complex x(d1,d2,d3)
+ double complex xout(d1,d2,d3)
+ double complex y(fftblockpad, d2, 2)
+ integer i, j, k, ii
+
+ logd2 = ilog2(d2)
+
+ do k = 1, d3
+ do ii = 0, d1 - fftblock, fftblock
+ if (timers_enabled) call timer_start(T_fftcopy)
+ do j = 1, d2
+ do i = 1, fftblock
+ y(i,j,1) = x(i+ii,j,k)
+ enddo
+ enddo
+ if (timers_enabled) call timer_stop(T_fftcopy)
+
+ if (timers_enabled) call timer_start(T_fftlow)
+ call cfftz (is, logd2, d2, y, y(1, 1, 2))
+ if (timers_enabled) call timer_stop(T_fftlow)
+
+ if (timers_enabled) call timer_start(T_fftcopy)
+ do j = 1, d2
+ do i = 1, fftblock
+ xout(i+ii,j,k) = y(i,j,1)
+ enddo
+ enddo
+ if (timers_enabled) call timer_stop(T_fftcopy)
+ enddo
+ enddo
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine cffts3(is, d1, d2, d3, x, xout, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'global.h'
+ integer is, d1, d2, d3, logd3
+ double complex x(d1,d2,d3)
+ double complex xout(d1,d2,d3)
+ double complex y(fftblockpad, d3, 2)
+ integer i, j, k, ii
+
+ logd3 = ilog2(d3)
+
+ do j = 1, d2
+ do ii = 0, d1 - fftblock, fftblock
+ if (timers_enabled) call timer_start(T_fftcopy)
+ do k = 1, d3
+ do i = 1, fftblock
+ y(i,k,1) = x(i+ii,j,k)
+ enddo
+ enddo
+ if (timers_enabled) call timer_stop(T_fftcopy)
+
+ if (timers_enabled) call timer_start(T_fftlow)
+ call cfftz (is, logd3, d3, y, y(1, 1, 2))
+ if (timers_enabled) call timer_stop(T_fftlow)
+
+ if (timers_enabled) call timer_start(T_fftcopy)
+ do k = 1, d3
+ do i = 1, fftblock
+ xout(i+ii,j,k) = y(i,k,1)
+ enddo
+ enddo
+ if (timers_enabled) call timer_stop(T_fftcopy)
+ enddo
+ enddo
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine fft_init (n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute the roots-of-unity array that will be used for subsequent FFTs.
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+
+ integer m,n,nu,ku,i,j,ln
+ double precision t, ti
+
+
+c---------------------------------------------------------------------
+c Initialize the U array with sines and cosines in a manner that permits
+c stride one access at each FFT iteration.
+c---------------------------------------------------------------------
+ nu = n
+ m = ilog2(n)
+ u(1) = m
+ ku = 2
+ ln = 1
+
+ do j = 1, m
+ t = pi / ln
+
+ do i = 0, ln - 1
+ ti = i * t
+ u(i+ku) = dcmplx (cos (ti), sin(ti))
+ enddo
+
+ ku = ku + ln
+ ln = 2 * ln
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine cfftz (is, m, n, x, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Computes NY N-point complex-to-complex FFTs of X using an algorithm due
+c to Swarztrauber. X is both the input and the output array, while Y is a
+c scratch array. It is assumed that N = 2^M. Before calling CFFTZ to
+c perform FFTs, the array U must be initialized by calling CFFTZ with IS
+c set to 0 and M set to MX, where MX is the maximum value of M for any
+c subsequent call.
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+
+ integer is,m,n,i,j,l,mx
+ double complex x, y
+
+ dimension x(fftblockpad,n), y(fftblockpad,n)
+
+c---------------------------------------------------------------------
+c Check if input parameters are invalid.
+c---------------------------------------------------------------------
+ mx = u(1)
+ if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx)
+ > then
+ write (*, 1) is, m, mx
+ 1 format ('CFFTZ: Either U has not been initialized, or else'/
+ > 'one of the input parameters is invalid', 3I5)
+ stop
+ endif
+
+c---------------------------------------------------------------------
+c Perform one variant of the Stockham FFT.
+c---------------------------------------------------------------------
+ do l = 1, m, 2
+ call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
+ if (l .eq. m) goto 160
+ call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
+ enddo
+
+ goto 180
+
+c---------------------------------------------------------------------
+c Copy Y to X.
+c---------------------------------------------------------------------
+ 160 do j = 1, n
+ do i = 1, fftblock
+ x(i,j) = y(i,j)
+ enddo
+ enddo
+
+ 180 continue
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Performs the L-th iteration of the second variant of the Stockham FFT.
+c---------------------------------------------------------------------
+
+ implicit none
+
+ integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
+ double complex u,x,y,u1,x11,x21
+ dimension u(n), x(ny1,n), y(ny1,n)
+
+
+c---------------------------------------------------------------------
+c Set initial parameters.
+c---------------------------------------------------------------------
+
+ n1 = n / 2
+ lk = 2 ** (l - 1)
+ li = 2 ** (m - l)
+ lj = 2 * lk
+ ku = li + 1
+
+ do i = 0, li - 1
+ i11 = i * lk + 1
+ i12 = i11 + n1
+ i21 = i * lj + 1
+ i22 = i21 + lk
+ if (is .ge. 1) then
+ u1 = u(ku+i)
+ else
+ u1 = dconjg (u(ku+i))
+ endif
+
+c---------------------------------------------------------------------
+c This loop is vectorizable.
+c---------------------------------------------------------------------
+ do k = 0, lk - 1
+ do j = 1, ny
+ x11 = x(j,i11+k)
+ x21 = x(j,i12+k)
+ y(j,i21+k) = x11 + x21
+ y(j,i22+k) = u1 * (x11 - x21)
+ enddo
+ enddo
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ integer function ilog2(n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ integer n, nn, lg
+ if (n .eq. 1) then
+ ilog2=0
+ return
+ endif
+ lg = 1
+ nn = 2
+ do while (nn .lt. n)
+ nn = nn*2
+ lg = lg+1
+ end do
+ ilog2 = lg
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose_x_yz(l1, l2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ integer l1, l2
+ double complex xin(ntdivnp), xout(ntdivnp)
+
+ call transpose2_local(dims(1,l1),dims(2, l1)*dims(3, l1),
+ > xin, xout)
+
+ call transpose2_global(xout, xin)
+
+ call transpose2_finish(dims(1,l1),dims(2, l1)*dims(3, l1),
+ > xin, xout)
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose_xy_z(l1, l2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ integer l1, l2
+ double complex xin(ntdivnp), xout(ntdivnp)
+
+ call transpose2_local(dims(1,l1)*dims(2, l1),dims(3, l1),
+ > xin, xout)
+ call transpose2_global(xout, xin)
+ call transpose2_finish(dims(1,l1)*dims(2, l1),dims(3, l1),
+ > xin, xout)
+
+ return
+ end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose2_local(n1, n2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'mpinpb.h'
+ include 'global.h'
+ integer n1, n2
+ double complex xin(n1, n2), xout(n2, n1)
+
+ double complex z(transblockpad, transblock)
+
+ integer i, j, ii, jj
+
+ if (timers_enabled) call timer_start(T_transxzloc)
+
+c---------------------------------------------------------------------
+c If possible, block the transpose for cache memory systems.
+c How much does this help? Example: R8000 Power Challenge (90 MHz)
+c Blocked version decreases time spend in this routine
+c from 14 seconds to 5.2 seconds on 8 nodes class A.
+c---------------------------------------------------------------------
+
+ if (n1 .lt. transblock .or. n2 .lt. transblock) then
+ if (n1 .ge. n2) then
+ do j = 1, n2
+ do i = 1, n1
+ xout(j, i) = xin(i, j)
+ end do
+ end do
+ else
+ do i = 1, n1
+ do j = 1, n2
+ xout(j, i) = xin(i, j)
+ end do
+ end do
+ endif
+ else
+ do j = 0, n2-1, transblock
+ do i = 0, n1-1, transblock
+
+c---------------------------------------------------------------------
+c Note: compiler should be able to take j+jj out of inner loop
+c---------------------------------------------------------------------
+ do jj = 1, transblock
+ do ii = 1, transblock
+ z(jj,ii) = xin(i+ii, j+jj)
+ end do
+ end do
+
+ do ii = 1, transblock
+ do jj = 1, transblock
+ xout(j+jj, i+ii) = z(jj,ii)
+ end do
+ end do
+
+ end do
+ end do
+ endif
+ if (timers_enabled) call timer_stop(T_transxzloc)
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose2_global(xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ include 'mpinpb.h'
+ double complex xin(ntdivnp)
+ double complex xout(ntdivnp)
+ integer ierr
+
+ if (timers_enabled) call synchup()
+
+ if (timers_enabled) call timer_start(T_transxzglo)
+ call mpi_alltoall(xin, ntdivnp/np, dc_type,
+ > xout, ntdivnp/np, dc_type,
+ > commslice1, ierr)
+ if (timers_enabled) call timer_stop(T_transxzglo)
+
+ return
+ end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose2_finish(n1, n2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ integer n1, n2, ioff
+ double complex xin(n2, n1/np2, 0:np2-1), xout(n2*np2, n1/np2)
+
+ integer i, j, p
+
+ if (timers_enabled) call timer_start(T_transxzfin)
+ do p = 0, np2-1
+ ioff = p*n2
+ do j = 1, n1/np2
+ do i = 1, n2
+ xout(i+ioff, j) = xin(i, j, p)
+ end do
+ end do
+ end do
+ if (timers_enabled) call timer_stop(T_transxzfin)
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose_x_z(l1, l2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ integer l1, l2
+ double complex xin(ntdivnp), xout(ntdivnp)
+
+ call transpose_x_z_local(dims(1,l1),dims(2,l1),dims(3,l1),
+ > xin, xout)
+ call transpose_x_z_global(dims(1,l1),dims(2,l1),dims(3,l1),
+ > xout, xin)
+ call transpose_x_z_finish(dims(1,l2),dims(2,l2),dims(3,l2),
+ > xin, xout)
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose_x_z_local(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ integer d1, d2, d3
+ double complex xin(d1,d2,d3)
+ double complex xout(d3,d2,d1)
+ integer block1, block3
+ integer i, j, k, kk, ii, i1, k1
+
+ double complex buf(transblockpad, maxdim)
+ if (timers_enabled) call timer_start(T_transxzloc)
+ if (d1 .lt. 32) goto 100
+ block3 = d3
+ if (block3 .eq. 1) goto 100
+ if (block3 .gt. transblock) block3 = transblock
+ block1 = d1
+ if (block1*block3 .gt. transblock*transblock)
+ > block1 = transblock*transblock/block3
+c---------------------------------------------------------------------
+c blocked transpose
+c---------------------------------------------------------------------
+ do j = 1, d2
+ do kk = 0, d3-block3, block3
+ do ii = 0, d1-block1, block1
+
+ do k = 1, block3
+ k1 = k + kk
+ do i = 1, block1
+ buf(k, i) = xin(i+ii, j, k1)
+ end do
+ end do
+
+ do i = 1, block1
+ i1 = i + ii
+ do k = 1, block3
+ xout(k+kk, j, i1) = buf(k, i)
+ end do
+ end do
+
+ end do
+ end do
+ end do
+ goto 200
+
+
+c---------------------------------------------------------------------
+c basic transpose
+c---------------------------------------------------------------------
+ 100 continue
+
+ do j = 1, d2
+ do k = 1, d3
+ do i = 1, d1
+ xout(k, j, i) = xin(i, j, k)
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c all done
+c---------------------------------------------------------------------
+ 200 continue
+
+ if (timers_enabled) call timer_stop(T_transxzloc)
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose_x_z_global(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ include 'mpinpb.h'
+ integer d1, d2, d3
+ double complex xin(d3,d2,d1)
+ double complex xout(d3,d2,d1) ! not real layout, but right size
+ integer ierr
+
+ if (timers_enabled) call synchup()
+
+c---------------------------------------------------------------------
+c do transpose among all processes with same 1-coord (me1)
+c---------------------------------------------------------------------
+ if (timers_enabled)call timer_start(T_transxzglo)
+ call mpi_alltoall(xin, d1*d2*d3/np2, dc_type,
+ > xout, d1*d2*d3/np2, dc_type,
+ > commslice1, ierr)
+ if (timers_enabled) call timer_stop(T_transxzglo)
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose_x_z_finish(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ integer d1, d2, d3
+ double complex xin(d1/np2, d2, d3, 0:np2-1)
+ double complex xout(d1,d2,d3)
+ integer i, j, k, p, ioff
+ if (timers_enabled) call timer_start(T_transxzfin)
+c---------------------------------------------------------------------
+c this is the most straightforward way of doing it. the
+c calculation in the inner loop doesn't help.
+c do i = 1, d1/np2
+c do j = 1, d2
+c do k = 1, d3
+c do p = 0, np2-1
+c ii = i + p*d1/np2
+c xout(ii, j, k) = xin(i, j, k, p)
+c end do
+c end do
+c end do
+c end do
+c---------------------------------------------------------------------
+
+ do p = 0, np2-1
+ ioff = p*d1/np2
+ do k = 1, d3
+ do j = 1, d2
+ do i = 1, d1/np2
+ xout(i+ioff, j, k) = xin(i, j, k, p)
+ end do
+ end do
+ end do
+ end do
+ if (timers_enabled) call timer_stop(T_transxzfin)
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose_x_y(l1, l2, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ integer l1, l2
+ double complex xin(ntdivnp), xout(ntdivnp)
+
+c---------------------------------------------------------------------
+c xy transpose is a little tricky, since we don't want
+c to touch 3rd axis. But alltoall must involve 3rd axis (most
+c slowly varying) to be efficient. So we do
+c (nx, ny/np1, nz/np2) -> (ny/np1, nz/np2, nx) (local)
+c (ny/np1, nz/np2, nx) -> ((ny/np1*nz/np2)*np1, nx/np1) (global)
+c then local finish.
+c---------------------------------------------------------------------
+
+
+ call transpose_x_y_local(dims(1,l1),dims(2,l1),dims(3,l1),
+ > xin, xout)
+ call transpose_x_y_global(dims(1,l1),dims(2,l1),dims(3,l1),
+ > xout, xin)
+ call transpose_x_y_finish(dims(1,l2),dims(2,l2),dims(3,l2),
+ > xin, xout)
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose_x_y_local(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ integer d1, d2, d3
+ double complex xin(d1, d2, d3)
+ double complex xout(d2, d3, d1)
+ integer i, j, k
+ if (timers_enabled) call timer_start(T_transxyloc)
+
+ do k = 1, d3
+ do i = 1, d1
+ do j = 1, d2
+ xout(j,k,i)=xin(i,j,k)
+ end do
+ end do
+ end do
+ if (timers_enabled) call timer_stop(T_transxyloc)
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose_x_y_global(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ include 'mpinpb.h'
+ integer d1, d2, d3
+c---------------------------------------------------------------------
+c array is in form (ny/np1, nz/np2, nx)
+c---------------------------------------------------------------------
+ double complex xin(d2,d3,d1)
+ double complex xout(d2,d3,d1) ! not real layout but right size
+ integer ierr
+
+ if (timers_enabled) call synchup()
+
+c---------------------------------------------------------------------
+c do transpose among all processes with same 1-coord (me1)
+c---------------------------------------------------------------------
+ if (timers_enabled) call timer_start(T_transxyglo)
+ call mpi_alltoall(xin, d1*d2*d3/np1, dc_type,
+ > xout, d1*d2*d3/np1, dc_type,
+ > commslice2, ierr)
+ if (timers_enabled) call timer_stop(T_transxyglo)
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine transpose_x_y_finish(d1, d2, d3, xin, xout)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ integer d1, d2, d3
+ double complex xin(d1/np1, d3, d2, 0:np1-1)
+ double complex xout(d1,d2,d3)
+ integer i, j, k, p, ioff
+ if (timers_enabled) call timer_start(T_transxyfin)
+c---------------------------------------------------------------------
+c this is the most straightforward way of doing it. the
+c calculation in the inner loop doesn't help.
+c do i = 1, d1/np1
+c do j = 1, d2
+c do k = 1, d3
+c do p = 0, np1-1
+c ii = i + p*d1/np1
+c note order is screwy bcz we have (ny/np1, nz/np2, nx) -> (ny, nx/np1, nz/np2)
+c xout(ii, j, k) = xin(i, k, j, p)
+c end do
+c end do
+c end do
+c end do
+c---------------------------------------------------------------------
+
+ do p = 0, np1-1
+ ioff = p*d1/np1
+ do k = 1, d3
+ do j = 1, d2
+ do i = 1, d1/np1
+ xout(i+ioff, j, k) = xin(i, k, j, p)
+ end do
+ end do
+ end do
+ end do
+ if (timers_enabled) call timer_stop(T_transxyfin)
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine checksum(i, u1, d1, d2, d3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ include 'mpinpb.h'
+ integer i, d1, d2, d3
+ double complex u1(d1, d2, d3)
+ integer j, q,r,s, ierr
+ double complex chk,allchk
+ chk = (0.0,0.0)
+
+ do j=1,1024
+ q = mod(j, nx)+1
+ if (q .ge. xstart(1) .and. q .le. xend(1)) then
+ r = mod(3*j,ny)+1
+ if (r .ge. ystart(1) .and. r .le. yend(1)) then
+ s = mod(5*j,nz)+1
+ if (s .ge. zstart(1) .and. s .le. zend(1)) then
+ chk=chk+u1(q-xstart(1)+1,r-ystart(1)+1,s-zstart(1)+1)
+ end if
+ end if
+ end if
+ end do
+ chk = chk/ntotal_f
+
+ call MPI_Reduce(chk, allchk, 1, dc_type, MPI_SUM,
+ > 0, MPI_COMM_WORLD, ierr)
+ if (me .eq. 0) then
+ write (*, 30) i, allchk
+ 30 format (' T =',I5,5X,'Checksum =',1P2D22.12)
+ endif
+
+c sums(i) = allchk
+c If we compute the checksum for diagnostic purposes, we let i be
+c negative, so the result will not be stored in an array
+ if (i .gt. 0) sums(i) = allchk
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine synchup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ include 'mpinpb.h'
+ integer ierr
+ call timer_start(T_synch)
+ call mpi_barrier(MPI_COMM_WORLD, ierr)
+ call timer_stop(T_synch)
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine verify (d1, d2, d3, nt, verified, class)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'global.h'
+ include 'mpinpb.h'
+ integer d1, d2, d3, nt
+ character class
+ logical verified
+ integer ierr, size, i
+ double precision err, epsilon
+
+c---------------------------------------------------------------------
+c Reference checksums
+c---------------------------------------------------------------------
+ double complex csum_ref(25)
+
+
+ class = 'U'
+
+ if (me .ne. 0) return
+
+ epsilon = 1.0d-12
+ verified = .FALSE.
+
+ if (d1 .eq. 64 .and.
+ > d2 .eq. 64 .and.
+ > d3 .eq. 64 .and.
+ > nt .eq. 6) then
+c---------------------------------------------------------------------
+c Sample size reference checksums
+c---------------------------------------------------------------------
+ class = 'S'
+ csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
+ csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
+ csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
+ csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
+ csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
+ csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
+
+ else if (d1 .eq. 128 .and.
+ > d2 .eq. 128 .and.
+ > d3 .eq. 32 .and.
+ > nt .eq. 6) then
+c---------------------------------------------------------------------
+c Class W size reference checksums
+c---------------------------------------------------------------------
+ class = 'W'
+ csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
+ csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
+ csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
+ csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
+ csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
+ csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
+
+ else if (d1 .eq. 256 .and.
+ > d2 .eq. 256 .and.
+ > d3 .eq. 128 .and.
+ > nt .eq. 6) then
+c---------------------------------------------------------------------
+c Class A size reference checksums
+c---------------------------------------------------------------------
+ class = 'A'
+ csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
+ csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
+ csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
+ csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
+ csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
+ csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
+
+ else if (d1 .eq. 512 .and.
+ > d2 .eq. 256 .and.
+ > d3 .eq. 256 .and.
+ > nt .eq. 20) then
+c---------------------------------------------------------------------
+c Class B size reference checksums
+c---------------------------------------------------------------------
+ class = 'B'
+ csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02)
+ csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02)
+ csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02)
+ csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02)
+ csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02)
+ csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02)
+ csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02)
+ csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02)
+ csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02)
+ csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
+ csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
+ csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
+ csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
+ csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
+ csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
+ csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
+ csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
+ csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
+ csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
+ csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
+
+ else if (d1 .eq. 512 .and.
+ > d2 .eq. 512 .and.
+ > d3 .eq. 512 .and.
+ > nt .eq. 20) then
+c---------------------------------------------------------------------
+c Class C size reference checksums
+c---------------------------------------------------------------------
+ class = 'C'
+ csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02)
+ csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02)
+ csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02)
+ csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02)
+ csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02)
+ csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02)
+ csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02)
+ csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02)
+ csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02)
+ csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
+ csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
+ csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
+ csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
+ csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
+ csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
+ csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
+ csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
+ csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
+ csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
+ csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
+
+ else if (d1 .eq. 2048 .and.
+ > d2 .eq. 1024 .and.
+ > d3 .eq. 1024 .and.
+ > nt .eq. 25) then
+c---------------------------------------------------------------------
+c Class D size reference checksums
+c---------------------------------------------------------------------
+ class = 'D'
+ csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02)
+ csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02)
+ csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02)
+ csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02)
+ csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02)
+ csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02)
+ csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02)
+ csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02)
+ csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02)
+ csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
+ csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
+ csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
+ csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
+ csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
+ csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
+ csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
+ csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
+ csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
+ csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
+ csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
+ csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
+ csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
+ csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
+ csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
+ csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
+
+ else if (d1 .eq. 4096 .and.
+ > d2 .eq. 2048 .and.
+ > d3 .eq. 2048 .and.
+ > nt .eq. 25) then
+c---------------------------------------------------------------------
+c Class E size reference checksums
+c---------------------------------------------------------------------
+ class = 'E'
+ csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02)
+ csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02)
+ csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02)
+ csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02)
+ csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02)
+ csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02)
+ csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02)
+ csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02)
+ csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02)
+ csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
+ csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
+ csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
+ csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
+ csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
+ csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
+ csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
+ csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
+ csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
+ csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
+ csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
+ csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
+ csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
+ csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
+ csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
+ csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
+
+ endif
+
+
+ if (class .ne. 'U') then
+
+ do i = 1, nt
+ err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
+ if (.not.(err .le. epsilon)) goto 100
+ end do
+ verified = .TRUE.
+ 100 continue
+
+ endif
+
+ call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
+ if (size .ne. np) then
+ write(*, 4010) np
+ write(*, 4011)
+ write(*, 4012)
+c---------------------------------------------------------------------
+c multiple statements because some Fortran compilers have
+c problems with long strings.
+c---------------------------------------------------------------------
+ 4010 format( ' Warning: benchmark was compiled for ', i5,
+ > 'processors')
+ 4011 format( ' Must be run on this many processors for official',
+ > ' verification')
+ 4012 format( ' so memory access is repeatable')
+ verified = .false.
+ endif
+
+ if (class .ne. 'U') then
+ if (verified) then
+ write(*,2000)
+ 2000 format(' Result verification successful')
+ else
+ write(*,2001)
+ 2001 format(' Result verification failed')
+ endif
+ endif
+ print *, 'class = ', class
+
+ return
+ end
+
+
--- /dev/null
+ include 'npbparams.h'
+
+c 2D processor array -> 2D grid decomposition (by pencils)
+c If processor array is 1xN or -> 1D grid decomposition (by planes)
+c If processor array is 1x1 -> 0D grid decomposition
+c For simplicity, do not treat Nx1 (np2 = 1) specially
+ integer np1, np2, np
+
+c basic decomposition strategy
+ integer layout_type
+ integer layout_0D, layout_1D, layout_2D
+ parameter (layout_0D = 0, layout_1D = 1, layout_2D = 2)
+
+ common /procgrid/ np1, np2, layout_type, np
+
+
+c Cache blocking params. These values are good for most
+c RISC processors.
+c FFT parameters:
+c fftblock controls how many ffts are done at a time.
+c The default is appropriate for most cache-based machines
+c On vector machines, the FFT can be vectorized with vector
+c length equal to the block size, so the block size should
+c be as large as possible. This is the size of the smallest
+c dimension of the problem: 128 for class A, 256 for class B and
+c 512 for class C.
+c Transpose parameters:
+c transblock is the blocking factor for the transposes when there
+c is a 1-D layout. On vector machines it should probably be
+c large (largest dimension of the problem).
+
+
+ integer fftblock_default, fftblockpad_default
+ parameter (fftblock_default=16, fftblockpad_default=18)
+ integer transblock, transblockpad
+ parameter(transblock=32, transblockpad=34)
+
+ integer fftblock, fftblockpad
+ common /blockinfo/ fftblock, fftblockpad
+
+c we need a bunch of logic to keep track of how
+c arrays are laid out.
+c coords of this processor
+ integer me, me1, me2
+ common /coords/ me, me1, me2
+c need a communicator for row/col in processor grid
+ integer commslice1, commslice2
+ common /comms/ commslice1, commslice2
+
+
+
+c There are basically three stages
+c 1: x-y-z layout
+c 2: after x-transform (before y)
+c 3: after y-transform (before z)
+c The computation proceeds logically as
+
+c set up initial conditions
+c fftx(1)
+c transpose (1->2)
+c ffty(2)
+c transpose (2->3)
+c fftz(3)
+c time evolution
+c fftz(3)
+c transpose (3->2)
+c ffty(2)
+c transpose (2->1)
+c fftx(1)
+c compute residual(1)
+
+c for the 0D, 1D, 2D strategies, the layouts look like xxx
+c
+c 0D 1D 2D
+c 1: xyz xyz xyz
+c 2: xyz xyz yxz
+c 3: xyz zyx zxy
+
+c the array dimensions are stored in dims(coord, phase)
+ integer dims(3, 3)
+ integer xstart(3), ystart(3), zstart(3)
+ integer xend(3), yend(3), zend(3)
+ common /layout/ dims,
+ > xstart, ystart, zstart,
+ > xend, yend, zend
+
+ integer T_total, T_setup, T_fft, T_evolve, T_checksum,
+ > T_fftlow, T_fftcopy, T_transpose,
+ > T_transxzloc, T_transxzglo, T_transxzfin,
+ > T_transxyloc, T_transxyglo, T_transxyfin,
+ > T_synch, T_max
+ parameter (T_total = 1, T_setup = 2, T_fft = 3,
+ > T_evolve = 4, T_checksum = 5,
+ > T_fftlow = 6, T_fftcopy = 7, T_transpose = 8,
+ > T_transxzloc = 9, T_transxzglo = 10, T_transxzfin = 11,
+ > T_transxyloc = 12, T_transxyglo = 13,
+ > T_transxyfin = 14, T_synch = 15, T_max = 15)
+
+
+
+ logical timers_enabled
+ parameter (timers_enabled = .false.)
+
+
+ external timer_read
+ double precision timer_read
+ external ilog2
+ integer ilog2
+
+ external randlc
+ double precision randlc
+
+
+c other stuff
+ logical debug, debugsynch
+ common /dbg/ debug, debugsynch
+
+ double precision seed, a, pi, alpha
+ parameter (seed = 314159265.d0, a = 1220703125.d0,
+ > pi = 3.141592653589793238d0, alpha=1.0d-6)
+
+c roots of unity array
+c relies on x being largest dimension?
+ double complex u(nx)
+ common /ucomm/ u
+
+
+c for checksum data
+ double complex sums(0:niter_default)
+ common /sumcomm/ sums
+
+c number of iterations
+ integer niter
+ common /iter/ niter
--- /dev/null
+6 ! number of iterations
+2 ! layout type. 0 = 0d, 1 = 1d, 2 = 2d
+2 4 ! processor layout. 0d must be "1 1"; 1d must be "1 N"
--- /dev/null
+ include 'mpif.h'
+c mpi data types
+ integer dc_type
+ common /mpistuff/ dc_type
--- /dev/null
+SHELL=/bin/sh
+BENCHMARK=is
+BENCHMARKU=IS
+
+include ../config/make.def
+
+include ../sys/make.common
+
+OBJS = is.o ${COMMON}/c_print_results.o
+
+
+${PROGRAM}: config ${OBJS}
+ ${CLINK} ${CLINKFLAGS} -o ${PROGRAM} ${OBJS} ${CMPI_LIB}
+
+.c.o:
+ ${CCOMPILE} $<
+
+is.o: is.c npbparams.h
+
+
+clean:
+ - rm -f *.o *~ mputil*
+ - rm -f is npbparams.h core
--- /dev/null
+/*************************************************************************
+ * *
+ * N A S P A R A L L E L B E N C H M A R K S 3.3 *
+ * *
+ * I S *
+ * *
+ *************************************************************************
+ * *
+ * This benchmark is part of the NAS Parallel Benchmark 3.3 suite. *
+ * It is described in NAS Technical Report 95-020. *
+ * *
+ * Permission to use, copy, distribute and modify this software *
+ * for any purpose with or without fee is hereby granted. We *
+ * request, however, that all derived work reference the NAS *
+ * Parallel Benchmarks 3.3. This software is provided "as is" *
+ * without express or implied warranty. *
+ * *
+ * Information on NPB 3.3, including the technical report, the *
+ * original specifications, source code, results and information *
+ * on how to submit new results, is available at: *
+ * *
+ * http://www.nas.nasa.gov/Software/NPB *
+ * *
+ * Send comments or suggestions to npb@nas.nasa.gov *
+ * Send bug reports to npb-bugs@nas.nasa.gov *
+ * *
+ * NAS Parallel Benchmarks Group *
+ * NASA Ames Research Center *
+ * Mail Stop: T27A-1 *
+ * Moffett Field, CA 94035-1000 *
+ * *
+ * E-mail: npb@nas.nasa.gov *
+ * Fax: (650) 604-3957 *
+ * *
+ *************************************************************************
+ * *
+ * Author: M. Yarrow *
+ * H. Jin *
+ * *
+ *************************************************************************/
+
+#include "mpi.h"
+#include "npbparams.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+/******************/
+/* default values */
+/******************/
+#ifndef CLASS
+#define CLASS 'S'
+#define NUM_PROCS 1
+#endif
+#define MIN_PROCS 1
+
+
+/*************/
+/* CLASS S */
+/*************/
+#if CLASS == 'S'
+#define TOTAL_KEYS_LOG_2 16
+#define MAX_KEY_LOG_2 11
+#define NUM_BUCKETS_LOG_2 9
+#endif
+
+
+/*************/
+/* CLASS W */
+/*************/
+#if CLASS == 'W'
+#define TOTAL_KEYS_LOG_2 20
+#define MAX_KEY_LOG_2 16
+#define NUM_BUCKETS_LOG_2 10
+#endif
+
+/*************/
+/* CLASS A */
+/*************/
+#if CLASS == 'A'
+#define TOTAL_KEYS_LOG_2 23
+#define MAX_KEY_LOG_2 19
+#define NUM_BUCKETS_LOG_2 10
+#endif
+
+
+/*************/
+/* CLASS B */
+/*************/
+#if CLASS == 'B'
+#define TOTAL_KEYS_LOG_2 25
+#define MAX_KEY_LOG_2 21
+#define NUM_BUCKETS_LOG_2 10
+#endif
+
+
+/*************/
+/* CLASS C */
+/*************/
+#if CLASS == 'C'
+#define TOTAL_KEYS_LOG_2 27
+#define MAX_KEY_LOG_2 23
+#define NUM_BUCKETS_LOG_2 10
+#endif
+
+
+/*************/
+/* CLASS D */
+/*************/
+#if CLASS == 'D'
+#define TOTAL_KEYS_LOG_2 29
+#define MAX_KEY_LOG_2 27
+#define NUM_BUCKETS_LOG_2 10
+#undef MIN_PROCS
+#define MIN_PROCS 4
+#endif
+
+
+#define TOTAL_KEYS (1 << TOTAL_KEYS_LOG_2)
+#define MAX_KEY (1 << MAX_KEY_LOG_2)
+#define NUM_BUCKETS (1 << NUM_BUCKETS_LOG_2)
+#define NUM_KEYS (TOTAL_KEYS/NUM_PROCS*MIN_PROCS)
+
+/*****************************************************************/
+/* On larger number of processors, since the keys are (roughly) */
+/* gaussian distributed, the first and last processor sort keys */
+/* in a large interval, requiring array sizes to be larger. Note */
+/* that for large NUM_PROCS, NUM_KEYS is, however, a small number*/
+/* The required array size also depends on the bucket size used. */
+/* The following values are validated for the 1024-bucket setup. */
+/*****************************************************************/
+#if NUM_PROCS < 256
+#define SIZE_OF_BUFFERS 3*NUM_KEYS/2
+#elif NUM_PROCS < 512
+#define SIZE_OF_BUFFERS 5*NUM_KEYS/2
+#elif NUM_PROCS < 1024
+#define SIZE_OF_BUFFERS 4*NUM_KEYS
+#else
+#define SIZE_OF_BUFFERS 13*NUM_KEYS/2
+#endif
+
+/*****************************************************************/
+/* NOTE: THIS CODE CANNOT BE RUN ON ARBITRARILY LARGE NUMBERS OF */
+/* PROCESSORS. THE LARGEST VERIFIED NUMBER IS 1024. INCREASE */
+/* MAX_PROCS AT YOUR PERIL */
+/*****************************************************************/
+#if CLASS == 'S'
+#define MAX_PROCS 128
+#else
+#define MAX_PROCS 1024
+#endif
+
+#define MAX_ITERATIONS 10
+#define TEST_ARRAY_SIZE 5
+
+
+/***********************************/
+/* Enable separate communication, */
+/* computation timing and printout */
+/***********************************/
+/* #define TIMING_ENABLED */
+
+
+/*************************************/
+/* Typedef: if necessary, change the */
+/* size of int here by changing the */
+/* int type to, say, long */
+/*************************************/
+typedef int INT_TYPE;
+typedef long INT_TYPE2;
+#define MP_KEY_TYPE MPI_INT
+
+
+typedef struct {
+
+/********************/
+/* MPI properties: */
+/********************/
+int my_rank,
+ comm_size;
+
+
+/********************/
+/* Some global info */
+/********************/
+INT_TYPE *key_buff_ptr_global, /* used by full_verify to get */
+ total_local_keys, /* copies of rank info */
+ total_lesser_keys;
+
+
+int passed_verification;
+
+
+
+/************************************/
+/* These are the three main arrays. */
+/* See SIZE_OF_BUFFERS def above */
+/************************************/
+INT_TYPE key_array[SIZE_OF_BUFFERS],
+ key_buff1[SIZE_OF_BUFFERS],
+ key_buff2[SIZE_OF_BUFFERS],
+ bucket_size[NUM_BUCKETS+TEST_ARRAY_SIZE], /* Top 5 elements for */
+ bucket_size_totals[NUM_BUCKETS+TEST_ARRAY_SIZE], /* part. ver. vals */
+ bucket_ptrs[NUM_BUCKETS],
+ process_bucket_distrib_ptr1[NUM_BUCKETS+TEST_ARRAY_SIZE],
+ process_bucket_distrib_ptr2[NUM_BUCKETS+TEST_ARRAY_SIZE];
+int send_count[MAX_PROCS], recv_count[MAX_PROCS],
+ send_displ[MAX_PROCS], recv_displ[MAX_PROCS];
+
+
+/**********************/
+/* Partial verif info */
+/**********************/
+INT_TYPE2 test_index_array[TEST_ARRAY_SIZE],
+ test_rank_array[TEST_ARRAY_SIZE];
+
+/**********/
+/* Timers */
+/**********/
+double start[64], elapsed[64];
+
+} global_data;
+
+
+const INT_TYPE2
+ S_test_index_array[TEST_ARRAY_SIZE] =
+ {48427,17148,23627,62548,4431},
+ S_test_rank_array[TEST_ARRAY_SIZE] =
+ {0,18,346,64917,65463},
+
+ W_test_index_array[TEST_ARRAY_SIZE] =
+ {357773,934767,875723,898999,404505},
+ W_test_rank_array[TEST_ARRAY_SIZE] =
+ {1249,11698,1039987,1043896,1048018},
+
+ A_test_index_array[TEST_ARRAY_SIZE] =
+ {2112377,662041,5336171,3642833,4250760},
+ A_test_rank_array[TEST_ARRAY_SIZE] =
+ {104,17523,123928,8288932,8388264},
+
+ B_test_index_array[TEST_ARRAY_SIZE] =
+ {41869,812306,5102857,18232239,26860214},
+ B_test_rank_array[TEST_ARRAY_SIZE] =
+ {33422937,10244,59149,33135281,99},
+
+ C_test_index_array[TEST_ARRAY_SIZE] =
+ {44172927,72999161,74326391,129606274,21736814},
+ C_test_rank_array[TEST_ARRAY_SIZE] =
+ {61147,882988,266290,133997595,133525895},
+
+ D_test_index_array[TEST_ARRAY_SIZE] =
+ {1317351170,995930646,1157283250,1503301535,1453734525},
+ D_test_rank_array[TEST_ARRAY_SIZE] =
+ {1,36538729,1978098519,2145192618,2147425337};
+
+
+
+/***********************/
+/* function prototypes */
+/***********************/
+double randlc( double *X, double *A );
+
+void full_verify( global_data* gd );
+
+void c_print_results( char *name,
+ char class,
+ int n1,
+ int n2,
+ int n3,
+ int niter,
+ int nprocs_compiled,
+ int nprocs_total,
+ double t,
+ double mops,
+ char *optype,
+ int passed_verification,
+ char *npbversion,
+ char *compiletime,
+ char *mpicc,
+ char *clink,
+ char *cmpi_lib,
+ char *cmpi_inc,
+ char *cflags,
+ char *clinkflags );
+
+void timer_clear(global_data* gd, int n );
+void timer_start(global_data* gd, int n );
+void timer_stop(global_data* gd, int n );
+double timer_read(global_data* gd, int n );
+
+void timer_clear(global_data* gd, int n ) {
+ gd->elapsed[n] = 0.0;
+}
+
+void timer_start(global_data* gd, int n ) {
+ gd->start[n] = MPI_Wtime();
+}
+
+void timer_stop(global_data* gd, int n ) {
+ gd->elapsed[n] += MPI_Wtime() - gd->start[n];
+}
+
+double timer_read(global_data* gd, int n ) {
+ return gd->elapsed[n];
+}
+
+
+/*
+ * FUNCTION RANDLC (X, A)
+ *
+ * This routine returns a uniform pseudorandom double precision number in the
+ * range (0, 1) by using the linear congruential generator
+ *
+ * x_{k+1} = a x_k (mod 2^46)
+ *
+ * where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
+ * before repeating. The argument A is the same as 'a' in the above formula,
+ * and X is the same as x_0. A and X must be odd double precision integers
+ * in the range (1, 2^46). The returned value RANDLC is normalized to be
+ * between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
+ * the new seed x_1, so that subsequent calls to RANDLC using the same
+ * arguments will generate a continuous sequence.
+ *
+ * This routine should produce the same results on any computer with at least
+ * 48 mantissa bits in double precision floating point data. On Cray systems,
+ * double precision should be disabled.
+ *
+ * David H. Bailey October 26, 1990
+ *
+ * IMPLICIT DOUBLE PRECISION (A-H, O-Z)
+ * SAVE KS, R23, R46, T23, T46
+ * DATA KS/0/
+ *
+ * If this is the first call to RANDLC, compute R23 = 2 ^ -23, R46 = 2 ^ -46,
+ * T23 = 2 ^ 23, and T46 = 2 ^ 46. These are computed in loops, rather than
+ * by merely using the ** operator, in order to insure that the results are
+ * exact on all systems. This code assumes that 0.5D0 is represented exactly.
+ */
+
+
+/*****************************************************************/
+/************* R A N D L C ************/
+/************* ************/
+/************* portable random number generator ************/
+/*****************************************************************/
+
+double randlc( double *X, double *A )
+{
+ static int KS=0;
+ static double R23, R46, T23, T46;
+ double T1, T2, T3, T4;
+ double A1;
+ double A2;
+ double X1;
+ double X2;
+ double Z;
+ int i, j;
+
+ if (KS == 0)
+ {
+ R23 = 1.0;
+ R46 = 1.0;
+ T23 = 1.0;
+ T46 = 1.0;
+
+ for (i=1; i<=23; i++)
+ {
+ R23 = 0.50 * R23;
+ T23 = 2.0 * T23;
+ }
+ for (i=1; i<=46; i++)
+ {
+ R46 = 0.50 * R46;
+ T46 = 2.0 * T46;
+ }
+ KS = 1;
+ }
+
+/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */
+
+ T1 = R23 * *A;
+ j = T1;
+ A1 = j;
+ A2 = *A - T23 * A1;
+
+/* Break X into two parts such that X = 2^23 * X1 + X2, compute
+ Z = A1 * X2 + A2 * X1 (mod 2^23), and then
+ X = 2^23 * Z + A2 * X2 (mod 2^46). */
+
+ T1 = R23 * *X;
+ j = T1;
+ X1 = j;
+ X2 = *X - T23 * X1;
+ T1 = A1 * X2 + A2 * X1;
+
+ j = R23 * T1;
+ T2 = j;
+ Z = T1 - T23 * T2;
+ T3 = T23 * Z + A2 * X2;
+ j = R46 * T3;
+ T4 = j;
+ *X = T3 - T46 * T4;
+ return(R46 * *X);
+}
+
+
+
+/*****************************************************************/
+/************ F I N D _ M Y _ S E E D ************/
+/************ ************/
+/************ returns parallel random number seq seed ************/
+/*****************************************************************/
+
+/*
+ * Create a random number sequence of total length nn residing
+ * on np number of processors. Each processor will therefore have a
+ * subsequence of length nn/np. This routine returns that random
+ * number which is the first random number for the subsequence belonging
+ * to processor rank kn, and which is used as seed for proc kn ran # gen.
+ */
+
+double find_my_seed( int kn, /* my processor rank, 0<=kn<=num procs */
+ int np, /* np = num procs */
+ long nn, /* total num of ran numbers, all procs */
+ double s, /* Ran num seed, for ex.: 314159265.00 */
+ double a ) /* Ran num gen mult, try 1220703125.00 */
+{
+
+ long i;
+
+ double t1,t2,t3,an;
+ long mq,nq,kk,ik;
+
+
+
+ nq = nn / np;
+
+ for( mq=0; nq>1; mq++,nq/=2 )
+ ;
+
+ t1 = a;
+
+ for( i=1; i<=mq; i++ )
+ t2 = randlc( &t1, &t1 );
+
+ an = t1;
+
+ kk = kn;
+ t1 = s;
+ t2 = an;
+
+ for( i=1; i<=100; i++ )
+ {
+ ik = kk / 2;
+ if( 2 * ik != kk )
+ t3 = randlc( &t1, &t2 );
+ if( ik == 0 )
+ break;
+ t3 = randlc( &t2, &t2 );
+ kk = ik;
+ }
+
+ return( t1 );
+
+}
+
+
+
+
+/*****************************************************************/
+/************* C R E A T E _ S E Q ************/
+/*****************************************************************/
+
+void create_seq( global_data* gd, double seed, double a )
+{
+ double x;
+ int i, k;
+
+ k = MAX_KEY/4;
+
+ for (i=0; i<NUM_KEYS; i++)
+ {
+ x = randlc(&seed, &a);
+ x += randlc(&seed, &a);
+ x += randlc(&seed, &a);
+ x += randlc(&seed, &a);
+
+ gd->key_array[i] = k*x;
+ }
+}
+
+
+
+
+/*****************************************************************/
+/************* F U L L _ V E R I F Y ************/
+/*****************************************************************/
+
+
+void full_verify( global_data* gd )
+{
+ MPI_Status status;
+ MPI_Request request;
+
+ INT_TYPE i, j;
+ INT_TYPE k, last_local_key;
+
+
+/* Now, finally, sort the keys: */
+ for( i=0; i<gd->total_local_keys; i++ )
+ gd->key_array[--gd->key_buff_ptr_global[gd->key_buff2[i]]-
+ gd->total_lesser_keys] = gd->key_buff2[i];
+ last_local_key = (gd->total_local_keys<1)? 0 : (gd->total_local_keys-1);
+
+/* Send largest key value to next processor */
+ if( gd->my_rank > 0 )
+ MPI_Irecv( &k,
+ 1,
+ MP_KEY_TYPE,
+ gd->my_rank-1,
+ 1000,
+ MPI_COMM_WORLD,
+ &request );
+ if( gd->my_rank < gd->comm_size-1 )
+ MPI_Send( &gd->key_array[last_local_key],
+ 1,
+ MP_KEY_TYPE,
+ gd->my_rank+1,
+ 1000,
+ MPI_COMM_WORLD );
+ if( gd->my_rank > 0 )
+ MPI_Wait( &request, &status );
+
+/* Confirm that neighbor's greatest key value
+ is not greater than my least key value */
+ j = 0;
+ if( gd->my_rank > 0 && gd->total_local_keys > 0 )
+ if( k > gd->key_array[0] )
+ j++;
+
+
+/* Confirm keys correctly sorted: count incorrectly sorted keys, if any */
+ for( i=1; i<gd->total_local_keys; i++ )
+ if( gd->key_array[i-1] > gd->key_array[i] )
+ j++;
+
+
+ if( j != 0 )
+ {
+ printf( "Processor %d: Full_verify: number of keys out of sort: %d\n",
+ gd->my_rank, j );
+ }
+ else
+ gd->passed_verification++;
+
+
+}
+
+
+
+
+/*****************************************************************/
+/************* R A N K ****************/
+/*****************************************************************/
+
+
+void rank( global_data* gd, int iteration )
+{
+
+ INT_TYPE i, k;
+
+ INT_TYPE shift = MAX_KEY_LOG_2 - NUM_BUCKETS_LOG_2;
+ INT_TYPE key;
+ INT_TYPE2 bucket_sum_accumulator, j, m;
+ INT_TYPE local_bucket_sum_accumulator;
+ INT_TYPE min_key_val, max_key_val;
+ INT_TYPE *key_buff_ptr;
+
+
+
+
+/* Iteration alteration of keys */
+ if(gd->my_rank == 0 )
+ {
+ gd->key_array[iteration] = iteration;
+ gd->key_array[iteration+MAX_ITERATIONS] = MAX_KEY - iteration;
+ }
+
+
+/* Initialize */
+ for( i=0; i<NUM_BUCKETS+TEST_ARRAY_SIZE; i++ )
+ {
+ gd->bucket_size[i] = 0;
+ gd->bucket_size_totals[i] = 0;
+ gd->process_bucket_distrib_ptr1[i] = 0;
+ gd->process_bucket_distrib_ptr2[i] = 0;
+ }
+
+
+/* Determine where the partial verify test keys are, load into */
+/* top of array bucket_size */
+ for( i=0; i<TEST_ARRAY_SIZE; i++ )
+ if( (gd->test_index_array[i]/NUM_KEYS) == gd->my_rank )
+ gd->bucket_size[NUM_BUCKETS+i] =
+ gd->key_array[gd->test_index_array[i] % NUM_KEYS];
+
+
+/* Determine the number of keys in each bucket */
+ for( i=0; i<NUM_KEYS; i++ )
+ gd->bucket_size[gd->key_array[i] >> shift]++;
+
+
+/* Accumulative bucket sizes are the bucket pointers */
+ gd->bucket_ptrs[0] = 0;
+ for( i=1; i< NUM_BUCKETS; i++ )
+ gd->bucket_ptrs[i] = gd->bucket_ptrs[i-1] + gd->bucket_size[i-1];
+
+
+/* Sort into appropriate bucket */
+ for( i=0; i<NUM_KEYS; i++ )
+ {
+ key = gd->key_array[i];
+ gd->key_buff1[gd->bucket_ptrs[key >> shift]++] = key;
+ }
+
+#ifdef TIMING_ENABLED
+ timer_stop(gd, 2 );
+ timer_start(gd, 3 );
+#endif
+
+/* Get the bucket size totals for the entire problem. These
+ will be used to determine the redistribution of keys */
+ MPI_Allreduce( gd->bucket_size,
+ gd->bucket_size_totals,
+ NUM_BUCKETS+TEST_ARRAY_SIZE,
+ MP_KEY_TYPE,
+ MPI_SUM,
+ MPI_COMM_WORLD );
+
+#ifdef TIMING_ENABLED
+ timer_stop(gd, 3 );
+ timer_start(gd, 2 );
+#endif
+
+/* Determine Redistibution of keys: accumulate the bucket size totals
+ till this number surpasses NUM_KEYS (which the average number of keys
+ per processor). Then all keys in these buckets go to processor 0.
+ Continue accumulating again until supassing 2*NUM_KEYS. All keys
+ in these buckets go to processor 1, etc. This algorithm guarantees
+ that all processors have work ranking; no processors are left idle.
+ The optimum number of buckets, however, does not result in as high
+ a degree of load balancing (as even a distribution of keys as is
+ possible) as is obtained from increasing the number of buckets, but
+ more buckets results in more computation per processor so that the
+ optimum number of buckets turns out to be 1024 for machines tested.
+ Note that process_bucket_distrib_ptr1 and ..._ptr2 hold the bucket
+ number of first and last bucket which each processor will have after
+ the redistribution is done. */
+
+ bucket_sum_accumulator = 0;
+ local_bucket_sum_accumulator = 0;
+ gd->send_displ[0] = 0;
+ gd->process_bucket_distrib_ptr1[0] = 0;
+ for( i=0, j=0; i<NUM_BUCKETS; i++ )
+ {
+ bucket_sum_accumulator += gd->bucket_size_totals[i];
+ local_bucket_sum_accumulator += gd->bucket_size[i];
+ if( bucket_sum_accumulator >= (j+1)*NUM_KEYS )
+ {
+ gd->send_count[j] = local_bucket_sum_accumulator;
+ if( j != 0 )
+ {
+ gd->send_displ[j] = gd->send_displ[j-1] + gd->send_count[j-1];
+ gd->process_bucket_distrib_ptr1[j] =
+ gd->process_bucket_distrib_ptr2[j-1]+1;
+ }
+ gd->process_bucket_distrib_ptr2[j++] = i;
+ local_bucket_sum_accumulator = 0;
+ }
+ }
+
+/* When NUM_PROCS approaching NUM_BUCKETS, it is highly possible
+ that the last few processors don't get any buckets. So, we
+ need to set counts properly in this case to avoid any fallouts. */
+ while( j < gd->comm_size )
+ {
+ gd->send_count[j] = 0;
+ gd->process_bucket_distrib_ptr1[j] = 1;
+ j++;
+ }
+
+#ifdef TIMING_ENABLED
+ timer_stop(gd, 2 );
+ timer_start(gd, 3 );
+#endif
+
+/* This is the redistribution section: first find out how many keys
+ each processor will send to every other processor: */
+ MPI_Alltoall( gd->send_count,
+ 1,
+ MPI_INT,
+ gd->recv_count,
+ 1,
+ MPI_INT,
+ MPI_COMM_WORLD );
+ MPI_Wtime();
+
+/* Determine the receive array displacements for the buckets */
+ gd->recv_displ[0] = 0;
+ for( i=1; i<gd->comm_size; i++ )
+ gd->recv_displ[i] = gd->recv_displ[i-1] + gd->recv_count[i-1];
+
+
+ MPI_Wtime();
+/* Now send the keys to respective processors */
+ MPI_Alltoallv( gd->key_buff1,
+ gd->send_count,
+ gd->send_displ,
+ MP_KEY_TYPE,
+ gd->key_buff2,
+ gd->recv_count,
+ gd->recv_displ,
+ MP_KEY_TYPE,
+ MPI_COMM_WORLD );
+
+#ifdef TIMING_ENABLED
+ timer_stop(gd, 3 );
+ timer_start(gd, 2 );
+#endif
+
+/* The starting and ending bucket numbers on each processor are
+ multiplied by the interval size of the buckets to obtain the
+ smallest possible min and greatest possible max value of any
+ key on each processor */
+ min_key_val = gd->process_bucket_distrib_ptr1[gd->my_rank] << shift;
+ max_key_val = ((gd->process_bucket_distrib_ptr2[gd->my_rank] + 1) << shift)-1;
+
+/* Clear the work array */
+ for( i=0; i<max_key_val-min_key_val+1; i++ )
+ gd->key_buff1[i] = 0;
+
+/* Determine the total number of keys on all other
+ processors holding keys of lesser value */
+ m = 0;
+ for( k=0; k<gd->my_rank; k++ )
+ for( i= gd->process_bucket_distrib_ptr1[k];
+ i<=gd->process_bucket_distrib_ptr2[k];
+ i++ )
+ m += gd->bucket_size_totals[i]; /* m has total # of lesser keys */
+
+/* Determine total number of keys on this processor */
+ j = 0;
+ for( i= gd->process_bucket_distrib_ptr1[gd->my_rank];
+ i<=gd->process_bucket_distrib_ptr2[gd->my_rank];
+ i++ )
+ j += gd->bucket_size_totals[i]; /* j has total # of local keys */
+
+
+/* Ranking of all keys occurs in this section: */
+/* shift it backwards so no subtractions are necessary in loop */
+ key_buff_ptr = gd->key_buff1 - min_key_val;
+
+/* In this section, the keys themselves are used as their
+ own indexes to determine how many of each there are: their
+ individual population */
+ for( i=0; i<j; i++ )
+ key_buff_ptr[gd->key_buff2[i]]++; /* Now they have individual key */
+ /* population */
+
+/* To obtain ranks of each key, successively add the individual key
+ population, not forgetting the total of lesser keys, m.
+ NOTE: Since the total of lesser keys would be subtracted later
+ in verification, it is no longer added to the first key population
+ here, but still needed during the partial verify test. This is to
+ ensure that 32-bit key_buff can still be used for class D. */
+/* key_buff_ptr[min_key_val] += m; */
+ for( i=min_key_val; i<max_key_val; i++ )
+ key_buff_ptr[i+1] += key_buff_ptr[i];
+
+
+/* This is the partial verify test section */
+/* Observe that test_rank_array vals are */
+/* shifted differently for different cases */
+ for( i=0; i<TEST_ARRAY_SIZE; i++ )
+ {
+ k = gd->bucket_size_totals[i+NUM_BUCKETS]; /* Keys were hidden here */
+ if( min_key_val <= k && k <= max_key_val )
+ {
+ /* Add the total of lesser keys, m, here */
+ INT_TYPE2 key_rank = key_buff_ptr[k-1] + m;
+ int failed = 0;
+
+ switch( CLASS )
+ {
+ case 'S':
+ if( i <= 2 )
+ {
+ if( key_rank != gd->test_rank_array[i]+iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ case 'W':
+ if( i < 2 )
+ {
+ if( key_rank != gd->test_rank_array[i]+(iteration-2) )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ case 'A':
+ if( i <= 2 )
+ {
+ if( key_rank != gd->test_rank_array[i]+(iteration-1) )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-(iteration-1) )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ case 'B':
+ if( i == 1 || i == 2 || i == 4 )
+ {
+ if( key_rank != gd->test_rank_array[i]+iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ case 'C':
+ if( i <= 2 )
+ {
+ if( key_rank != gd->test_rank_array[i]+iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ case 'D':
+ if( i < 2 )
+ {
+ if( key_rank != gd->test_rank_array[i]+iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ else
+ {
+ if( key_rank != gd->test_rank_array[i]-iteration )
+ failed = 1;
+ else
+ gd->passed_verification++;
+ }
+ break;
+ }
+ if( failed == 1 )
+ printf( "Failed partial verification: "
+ "iteration %d, processor %d, test key %d\n",
+ iteration, gd->my_rank, (int)i );
+ }
+ }
+
+
+
+
+/* Make copies of rank info for use by full_verify: these variables
+ in rank are local; making them global slows down the code, probably
+ since they cannot be made register by compiler */
+
+ if( iteration == MAX_ITERATIONS )
+ {
+ gd->key_buff_ptr_global = key_buff_ptr;
+ gd->total_local_keys = j;
+ gd->total_lesser_keys = 0; /* no longer set to 'm', see note above */
+ }
+
+}
+
+
+/*****************************************************************/
+/************* M A I N ****************/
+/*****************************************************************/
+
+int main( int argc, char **argv )
+{
+
+ int i, iteration, itemp;
+
+ double timecounter, maxtime;
+
+ global_data* gd = malloc(sizeof(global_data));
+/* Initialize MPI */
+ MPI_Init( &argc, &argv );
+ MPI_Comm_rank( MPI_COMM_WORLD, &gd->my_rank );
+ MPI_Comm_size( MPI_COMM_WORLD, &gd->comm_size );
+
+/* Initialize the verification arrays if a valid class */
+ for( i=0; i<TEST_ARRAY_SIZE; i++ )
+ switch( CLASS )
+ {
+ case 'S':
+ gd->test_index_array[i] = S_test_index_array[i];
+ gd->test_rank_array[i] = S_test_rank_array[i];
+ break;
+ case 'A':
+ gd->test_index_array[i] = A_test_index_array[i];
+ gd->test_rank_array[i] = A_test_rank_array[i];
+ break;
+ case 'W':
+ gd->test_index_array[i] = W_test_index_array[i];
+ gd->test_rank_array[i] = W_test_rank_array[i];
+ break;
+ case 'B':
+ gd->test_index_array[i] = B_test_index_array[i];
+ gd->test_rank_array[i] = B_test_rank_array[i];
+ break;
+ case 'C':
+ gd->test_index_array[i] = C_test_index_array[i];
+ gd->test_rank_array[i] = C_test_rank_array[i];
+ break;
+ case 'D':
+ gd->test_index_array[i] = D_test_index_array[i];
+ gd->test_rank_array[i] = D_test_rank_array[i];
+ break;
+ };
+
+
+
+/* Printout initial NPB info */
+ if( gd->my_rank == 0 )
+ {
+ printf( "\n\n NAS Parallel Benchmarks 3.3 -- IS Benchmark\n\n" );
+ printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS*MIN_PROCS, CLASS );
+ printf( " Iterations: %d\n", MAX_ITERATIONS );
+ printf( " Number of processes: %d\n",gd->comm_size );
+ }
+
+/* Check that actual and compiled number of processors agree */
+ if( gd->comm_size != NUM_PROCS )
+ {
+ if( gd->my_rank == 0 )
+ printf( "\n ERROR: compiled for %d processes\n"
+ " Number of active processes: %d\n"
+ " Exiting program!\n\n", NUM_PROCS, gd->comm_size );
+ MPI_Finalize();
+ exit( 1 );
+ }
+
+/* Check to see whether total number of processes is within bounds.
+ This could in principle be checked in setparams.c, but it is more
+ convenient to do it here */
+ if( gd->comm_size < MIN_PROCS || gd->comm_size > MAX_PROCS)
+ {
+ if( gd->my_rank == 0 )
+ printf( "\n ERROR: number of processes %d not within range %d-%d"
+ "\n Exiting program!\n\n", gd->comm_size, MIN_PROCS, MAX_PROCS);
+ MPI_Finalize();
+ exit( 1 );
+ }
+
+
+/* Generate random number sequence and subsequent keys on all procs */
+ create_seq(gd, find_my_seed( gd->my_rank,
+ gd->comm_size,
+ 4*(long)TOTAL_KEYS*MIN_PROCS,
+ 314159265.00, /* Random number gen seed */
+ 1220703125.00 ), /* Random number gen mult */
+ 1220703125.00 ); /* Random number gen mult */
+
+/* Do one interation for free (i.e., untimed) to guarantee initialization of
+ all data and code pages and respective tables */
+ rank(gd, 1 );
+
+/* Start verification counter */
+ gd->passed_verification = 0;
+
+ if( gd->my_rank == 0 && CLASS != 'S' ) printf( "\n iteration\n" );
+
+/* Initialize timer */
+ timer_clear(gd, 0 );
+
+/* Initialize separate communication, computation timing */
+#ifdef TIMING_ENABLED
+ for( i=1; i<=3; i++ ) timer_clear(gd, i );
+#endif
+
+/* Start timer */
+ timer_start(gd, 0 );
+
+#ifdef TIMING_ENABLED
+ timer_start(gd, 1 );
+ timer_start(gd, 2 );
+#endif
+
+/* This is the main iteration */
+ for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ )
+ {
+ if( gd->my_rank == 0 && CLASS != 'S' ) printf( " %d\n", iteration );
+ rank(gd, iteration );
+ }
+
+
+#ifdef TIMING_ENABLED
+ timer_stop(gd, 2 );
+ timer_stop(gd, 1 );
+#endif
+
+/* Stop timer, obtain time for processors */
+ timer_stop(gd, 0 );
+
+ timecounter = timer_read(gd, 0 );
+
+/* End of timing, obtain maximum time of all processors */
+ MPI_Reduce( &timecounter,
+ &maxtime,
+ 1,
+ MPI_DOUBLE,
+ MPI_MAX,
+ 0,
+ MPI_COMM_WORLD );
+
+#ifdef TIMING_ENABLED
+ {
+ double tmin, tsum, tmax;
+
+ if( my_rank == 0 )
+ {
+ printf( "\ntimer 1/2/3 = total/computation/communication time\n");
+ printf( " min avg max\n" );
+ }
+ for( i=1; i<=3; i++ )
+ {
+ timecounter = timer_read(gd, i );
+ MPI_Reduce( &timecounter,
+ &tmin,
+ 1,
+ MPI_DOUBLE,
+ MPI_MIN,
+ 0,
+ MPI_COMM_WORLD );
+ MPI_Reduce( &timecounter,
+ &tsum,
+ 1,
+ MPI_DOUBLE,
+ MPI_SUM,
+ 0,
+ MPI_COMM_WORLD );
+ MPI_Reduce( &timecounter,
+ &tmax,
+ 1,
+ MPI_DOUBLE,
+ MPI_MAX,
+ 0,
+ MPI_COMM_WORLD );
+ if( my_rank == 0 )
+ printf( "timer %d: %f %f %f\n",
+ i, tmin, tsum/((double) comm_size), tmax );
+ }
+ if( my_rank == 0 )
+ printf( "\n" );
+ }
+#endif
+
+/* This tests that keys are in sequence: sorting of last ranked key seq
+ occurs here, but is an untimed operation */
+ full_verify(gd);
+
+
+/* Obtain verification counter sum */
+ itemp =gd->passed_verification;
+ MPI_Reduce( &itemp,
+ &gd->passed_verification,
+ 1,
+ MPI_INT,
+ MPI_SUM,
+ 0,
+ MPI_COMM_WORLD );
+
+
+
+/* The final printout */
+ if( gd->my_rank == 0 )
+ {
+ if( gd->passed_verification != 5*MAX_ITERATIONS + gd->comm_size )
+ gd->passed_verification = 0;
+ c_print_results( "IS",
+ CLASS,
+ (int)(TOTAL_KEYS),
+ MIN_PROCS,
+ 0,
+ MAX_ITERATIONS,
+ NUM_PROCS,
+ gd->comm_size,
+ maxtime,
+ ((double) (MAX_ITERATIONS)*TOTAL_KEYS*MIN_PROCS)
+ /maxtime/1000000.,
+ "keys ranked",
+ gd->passed_verification,
+ NPBVERSION,
+ COMPILETIME,
+ MPICC,
+ CLINK,
+ CMPI_LIB,
+ CMPI_INC,
+ CFLAGS,
+ CLINKFLAGS );
+ }
+
+ MPI_Finalize();
+ free(gd);
+
+ return 0;
+ /**************************/
+} /* E N D P R O G R A M */
+ /**************************/
--- /dev/null
+SHELL=/bin/sh
+BENCHMARK=lu
+BENCHMARKU=LU
+VEC=
+
+include ../config/make.def
+
+OBJS = lu.o init_comm.o read_input.o bcast_inputs.o proc_grid.o neighbors.o \
+ nodedim.o subdomain.o setcoeff.o sethyper.o setbv.o exact.o setiv.o \
+ erhs.o ssor.o exchange_1.o exchange_3.o exchange_4.o exchange_5.o \
+ exchange_6.o rhs.o l2norm.o jacld.o blts$(VEC).o jacu.o buts$(VEC).o \
+ error.o pintgr.o verify.o ${COMMON}/print_results.o ${COMMON}/timers.o
+
+include ../sys/make.common
+
+
+# npbparams.h is included by applu.incl
+# The following rule should do the trick but many make programs (not gmake)
+# will do the wrong thing and rebuild the world every time (because the
+# mod time on header.h is not changed. One solution would be to
+# touch header.h but this might cause confusion if someone has
+# accidentally deleted it. Instead, make the dependency on npbparams.h
+# explicit in all the lines below (even though dependence is indirect).
+
+# applu.incl: npbparams.h
+
+${PROGRAM}: config
+ @if [ x$(VERSION) = xvec ] ; then \
+ ${MAKE} VEC=_vec exec; \
+ elif [ x$(VERSION) = xVEC ] ; then \
+ ${MAKE} VEC=_vec exec; \
+ else \
+ ${MAKE} exec; \
+ fi
+
+exec: $(OBJS)
+ ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+
+.f.o :
+ ${FCOMPILE} $<
+
+lu.o: lu.f applu.incl npbparams.h
+bcast_inputs.o: bcast_inputs.f applu.incl npbparams.h mpinpb.h
+blts$(VEC).o: blts$(VEC).f
+buts$(VEC).o: buts$(VEC).f
+erhs.o: erhs.f applu.incl npbparams.h
+error.o: error.f applu.incl npbparams.h mpinpb.h
+exact.o: exact.f applu.incl npbparams.h
+exchange_1.o: exchange_1.f applu.incl npbparams.h mpinpb.h
+exchange_3.o: exchange_3.f applu.incl npbparams.h mpinpb.h
+exchange_4.o: exchange_4.f applu.incl npbparams.h mpinpb.h
+exchange_5.o: exchange_5.f applu.incl npbparams.h mpinpb.h
+exchange_6.o: exchange_6.f applu.incl npbparams.h mpinpb.h
+init_comm.o: init_comm.f applu.incl npbparams.h mpinpb.h
+jacld.o: jacld.f applu.incl npbparams.h
+jacu.o: jacu.f applu.incl npbparams.h
+l2norm.o: l2norm.f mpinpb.h
+neighbors.o: neighbors.f applu.incl npbparams.h
+nodedim.o: nodedim.f
+pintgr.o: pintgr.f applu.incl npbparams.h mpinpb.h
+proc_grid.o: proc_grid.f applu.incl npbparams.h
+read_input.o: read_input.f applu.incl npbparams.h mpinpb.h
+rhs.o: rhs.f applu.incl npbparams.h
+setbv.o: setbv.f applu.incl npbparams.h
+setiv.o: setiv.f applu.incl npbparams.h
+setcoeff.o: setcoeff.f applu.incl npbparams.h
+sethyper.o: sethyper.f applu.incl npbparams.h
+ssor.o: ssor.f applu.incl npbparams.h mpinpb.h
+subdomain.o: subdomain.f applu.incl npbparams.h mpinpb.h
+verify.o: verify.f applu.incl npbparams.h
+
+clean:
+ - /bin/rm -f npbparams.h
+ - /bin/rm -f *.o *~
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+c--- applu.incl
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c npbparams.h defines parameters that depend on the class and
+c number of nodes
+c---------------------------------------------------------------------
+
+ include 'npbparams.h'
+
+c---------------------------------------------------------------------
+c parameters which can be overridden in runtime config file
+c (in addition to size of problem - isiz01,02,03 give the maximum size)
+c ipr = 1 to print out verbose information
+c omega = 2.0 is correct for all classes
+c tolrsd is tolerance levels for steady state residuals
+c---------------------------------------------------------------------
+ integer ipr_default
+ parameter (ipr_default = 1)
+ double precision omega_default
+ parameter (omega_default = 1.2d0)
+ double precision tolrsd1_def, tolrsd2_def, tolrsd3_def,
+ > tolrsd4_def, tolrsd5_def
+ parameter (tolrsd1_def=1.0e-08,
+ > tolrsd2_def=1.0e-08, tolrsd3_def=1.0e-08,
+ > tolrsd4_def=1.0e-08, tolrsd5_def=1.0e-08)
+
+ double precision c1, c2, c3, c4, c5
+ parameter( c1 = 1.40d+00, c2 = 0.40d+00,
+ > c3 = 1.00d-01, c4 = 1.00d+00,
+ > c5 = 1.40d+00 )
+
+c---------------------------------------------------------------------
+c grid
+c---------------------------------------------------------------------
+ integer nx, ny, nz
+ integer nx0, ny0, nz0
+ integer ipt, ist, iend
+ integer jpt, jst, jend
+ integer ii1, ii2
+ integer ji1, ji2
+ integer ki1, ki2
+ double precision dxi, deta, dzeta
+ double precision tx1, tx2, tx3
+ double precision ty1, ty2, ty3
+ double precision tz1, tz2, tz3
+
+ common/cgcon/ dxi, deta, dzeta,
+ > tx1, tx2, tx3,
+ > ty1, ty2, ty3,
+ > tz1, tz2, tz3,
+ > nx, ny, nz,
+ > nx0, ny0, nz0,
+ > ipt, ist, iend,
+ > jpt, jst, jend,
+ > ii1, ii2,
+ > ji1, ji2,
+ > ki1, ki2
+
+c---------------------------------------------------------------------
+c dissipation
+c---------------------------------------------------------------------
+ double precision dx1, dx2, dx3, dx4, dx5
+ double precision dy1, dy2, dy3, dy4, dy5
+ double precision dz1, dz2, dz3, dz4, dz5
+ double precision dssp
+
+ common/disp/ dx1,dx2,dx3,dx4,dx5,
+ > dy1,dy2,dy3,dy4,dy5,
+ > dz1,dz2,dz3,dz4,dz5,
+ > dssp
+
+c---------------------------------------------------------------------
+c field variables and residuals
+c---------------------------------------------------------------------
+ double precision u(5,-1:isiz1+2,-1:isiz2+2,isiz3),
+ > rsd(5,-1:isiz1+2,-1:isiz2+2,isiz3),
+ > frct(5,-1:isiz1+2,-1:isiz2+2,isiz3),
+ > flux(5,0:isiz1+1,0:isiz2+1,isiz3)
+
+ common/cvar/ u,
+ > rsd,
+ > frct,
+ > flux
+
+
+c---------------------------------------------------------------------
+c output control parameters
+c---------------------------------------------------------------------
+ integer ipr, inorm
+
+ common/cprcon/ ipr, inorm
+
+c---------------------------------------------------------------------
+c newton-raphson iteration control parameters
+c---------------------------------------------------------------------
+ integer itmax, invert
+ double precision dt, omega, tolrsd(5),
+ > rsdnm(5), errnm(5), frc, ttotal
+
+ common/ctscon/ dt, omega, tolrsd,
+ > rsdnm, errnm, frc, ttotal,
+ > itmax, invert
+
+ double precision a(5,5,isiz1,isiz2),
+ > b(5,5,isiz1,isiz2),
+ > c(5,5,isiz1,isiz2),
+ > d(5,5,isiz1,isiz2)
+
+ common/cjac/ a, b, c, d
+
+c---------------------------------------------------------------------
+c coefficients of the exact solution
+c---------------------------------------------------------------------
+ double precision ce(5,13)
+
+ common/cexact/ ce
+
+c---------------------------------------------------------------------
+c multi-processor common blocks
+c---------------------------------------------------------------------
+ integer id, ndim, num, xdim, ydim, row, col
+ common/dim/ id,ndim,num,xdim,ydim,row,col
+
+ integer north,south,east,west
+ common/neigh/ north,south,east, west
+
+ integer from_s,from_n,from_e,from_w
+ parameter (from_s=1,from_n=2,from_e=3,from_w=4)
+
+ integer npmax
+ parameter (npmax=isiz01+isiz02)
+
+ logical icommn(npmax+1),icomms(npmax+1),
+ > icomme(npmax+1),icommw(npmax+1)
+ double precision buf(5,2*isiz2*isiz3),
+ > buf1(5,2*isiz2*isiz3)
+
+ common/comm/ buf, buf1,
+ > icommn,icomms,
+ > icomme,icommw
+
+ double precision maxtime
+ common/timer/maxtime
+
+
+c---------------------------------------------------------------------
+c end of include file
+c---------------------------------------------------------------------
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine bcast_inputs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer ierr
+
+c---------------------------------------------------------------------
+c root broadcasts the data
+c The data isn't contiguous or of the same type, so it's not
+c clear how to send it in the "MPI" way.
+c We could pack the info into a buffer or we could create
+c an obscene datatype to handle it all at once. Since we only
+c broadcast the data once, just use a separate broadcast for
+c each piece.
+c---------------------------------------------------------------------
+ call MPI_BCAST(ipr, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+ call MPI_BCAST(inorm, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+ call MPI_BCAST(itmax, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+ call MPI_BCAST(dt, 1, dp_type, root, MPI_COMM_WORLD, ierr)
+ call MPI_BCAST(omega, 1, dp_type, root, MPI_COMM_WORLD, ierr)
+ call MPI_BCAST(tolrsd, 5, dp_type, root, MPI_COMM_WORLD, ierr)
+ call MPI_BCAST(nx0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+ call MPI_BCAST(ny0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+ call MPI_BCAST(nz0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
+
+ return
+ end
+
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine blts ( ldmx, ldmy, ldmz,
+ > nx, ny, nz, k,
+ > omega,
+ > v,
+ > ldz, ldy, ldx, d,
+ > ist, iend, jst, jend,
+ > nx0, ny0, ipt, jpt)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c compute the regular-sparse, block lower triangular solution:
+c
+c v <-- ( L-inv ) * v
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ integer ldmx, ldmy, ldmz
+ integer nx, ny, nz
+ integer k
+ double precision omega
+ double precision v( 5, -1:ldmx+2, -1:ldmy+2, *),
+ > ldz( 5, 5, ldmx, ldmy),
+ > ldy( 5, 5, ldmx, ldmy),
+ > ldx( 5, 5, ldmx, ldmy),
+ > d( 5, 5, ldmx, ldmy)
+ integer ist, iend
+ integer jst, jend
+ integer nx0, ny0
+ integer ipt, jpt
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, m
+ integer iex
+ double precision tmp, tmp1
+ double precision tmat(5,5)
+
+
+c---------------------------------------------------------------------
+c receive data from north and west
+c---------------------------------------------------------------------
+ iex = 0
+ call exchange_1( v,k,iex )
+
+
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+
+ v( m, i, j, k ) = v( m, i, j, k )
+ > - omega * ( ldz( m, 1, i, j ) * v( 1, i, j, k-1 )
+ > + ldz( m, 2, i, j ) * v( 2, i, j, k-1 )
+ > + ldz( m, 3, i, j ) * v( 3, i, j, k-1 )
+ > + ldz( m, 4, i, j ) * v( 4, i, j, k-1 )
+ > + ldz( m, 5, i, j ) * v( 5, i, j, k-1 ) )
+
+ end do
+ end do
+ end do
+
+
+ do j=jst,jend
+ do i = ist, iend
+
+ do m = 1, 5
+
+ v( m, i, j, k ) = v( m, i, j, k )
+ > - omega * ( ldy( m, 1, i, j ) * v( 1, i, j-1, k )
+ > + ldx( m, 1, i, j ) * v( 1, i-1, j, k )
+ > + ldy( m, 2, i, j ) * v( 2, i, j-1, k )
+ > + ldx( m, 2, i, j ) * v( 2, i-1, j, k )
+ > + ldy( m, 3, i, j ) * v( 3, i, j-1, k )
+ > + ldx( m, 3, i, j ) * v( 3, i-1, j, k )
+ > + ldy( m, 4, i, j ) * v( 4, i, j-1, k )
+ > + ldx( m, 4, i, j ) * v( 4, i-1, j, k )
+ > + ldy( m, 5, i, j ) * v( 5, i, j-1, k )
+ > + ldx( m, 5, i, j ) * v( 5, i-1, j, k ) )
+
+ end do
+
+c---------------------------------------------------------------------
+c diagonal block inversion
+c
+c forward elimination
+c---------------------------------------------------------------------
+ do m = 1, 5
+ tmat( m, 1 ) = d( m, 1, i, j )
+ tmat( m, 2 ) = d( m, 2, i, j )
+ tmat( m, 3 ) = d( m, 3, i, j )
+ tmat( m, 4 ) = d( m, 4, i, j )
+ tmat( m, 5 ) = d( m, 5, i, j )
+ end do
+
+ tmp1 = 1.0d+00 / tmat( 1, 1 )
+ tmp = tmp1 * tmat( 2, 1 )
+ tmat( 2, 2 ) = tmat( 2, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 2, 3 ) = tmat( 2, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 2, 4 ) = tmat( 2, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 2, 5 ) = tmat( 2, 5 )
+ > - tmp * tmat( 1, 5 )
+ v( 2, i, j, k ) = v( 2, i, j, k )
+ > - v( 1, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 3, 1 )
+ tmat( 3, 2 ) = tmat( 3, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 3, 3 ) = tmat( 3, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 3, 4 ) = tmat( 3, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 3, 5 ) = tmat( 3, 5 )
+ > - tmp * tmat( 1, 5 )
+ v( 3, i, j, k ) = v( 3, i, j, k )
+ > - v( 1, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 4, 1 )
+ tmat( 4, 2 ) = tmat( 4, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 4, 3 ) = tmat( 4, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 1, 5 )
+ v( 4, i, j, k ) = v( 4, i, j, k )
+ > - v( 1, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 5, 1 )
+ tmat( 5, 2 ) = tmat( 5, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 5, 3 ) = tmat( 5, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 1, 5 )
+ v( 5, i, j, k ) = v( 5, i, j, k )
+ > - v( 1, i, j, k ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 2, 2 )
+ tmp = tmp1 * tmat( 3, 2 )
+ tmat( 3, 3 ) = tmat( 3, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 3, 4 ) = tmat( 3, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 3, 5 ) = tmat( 3, 5 )
+ > - tmp * tmat( 2, 5 )
+ v( 3, i, j, k ) = v( 3, i, j, k )
+ > - v( 2, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 4, 2 )
+ tmat( 4, 3 ) = tmat( 4, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 2, 5 )
+ v( 4, i, j, k ) = v( 4, i, j, k )
+ > - v( 2, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 5, 2 )
+ tmat( 5, 3 ) = tmat( 5, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 2, 5 )
+ v( 5, i, j, k ) = v( 5, i, j, k )
+ > - v( 2, i, j, k ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 3, 3 )
+ tmp = tmp1 * tmat( 4, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 3, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 3, 5 )
+ v( 4, i, j, k ) = v( 4, i, j, k )
+ > - v( 3, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 5, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 3, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 3, 5 )
+ v( 5, i, j, k ) = v( 5, i, j, k )
+ > - v( 3, i, j, k ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 4, 4 )
+ tmp = tmp1 * tmat( 5, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 4, 5 )
+ v( 5, i, j, k ) = v( 5, i, j, k )
+ > - v( 4, i, j, k ) * tmp
+
+c---------------------------------------------------------------------
+c back substitution
+c---------------------------------------------------------------------
+ v( 5, i, j, k ) = v( 5, i, j, k )
+ > / tmat( 5, 5 )
+
+ v( 4, i, j, k ) = v( 4, i, j, k )
+ > - tmat( 4, 5 ) * v( 5, i, j, k )
+ v( 4, i, j, k ) = v( 4, i, j, k )
+ > / tmat( 4, 4 )
+
+ v( 3, i, j, k ) = v( 3, i, j, k )
+ > - tmat( 3, 4 ) * v( 4, i, j, k )
+ > - tmat( 3, 5 ) * v( 5, i, j, k )
+ v( 3, i, j, k ) = v( 3, i, j, k )
+ > / tmat( 3, 3 )
+
+ v( 2, i, j, k ) = v( 2, i, j, k )
+ > - tmat( 2, 3 ) * v( 3, i, j, k )
+ > - tmat( 2, 4 ) * v( 4, i, j, k )
+ > - tmat( 2, 5 ) * v( 5, i, j, k )
+ v( 2, i, j, k ) = v( 2, i, j, k )
+ > / tmat( 2, 2 )
+
+ v( 1, i, j, k ) = v( 1, i, j, k )
+ > - tmat( 1, 2 ) * v( 2, i, j, k )
+ > - tmat( 1, 3 ) * v( 3, i, j, k )
+ > - tmat( 1, 4 ) * v( 4, i, j, k )
+ > - tmat( 1, 5 ) * v( 5, i, j, k )
+ v( 1, i, j, k ) = v( 1, i, j, k )
+ > / tmat( 1, 1 )
+
+
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c send data to east and south
+c---------------------------------------------------------------------
+ iex = 2
+ call exchange_1( v,k,iex )
+
+ return
+ end
+
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine blts ( ldmx, ldmy, ldmz,
+ > nx, ny, nz, k,
+ > omega,
+ > v,
+ > ldz, ldy, ldx, d,
+ > ist, iend, jst, jend,
+ > nx0, ny0, ipt, jpt)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c compute the regular-sparse, block lower triangular solution:
+c
+c v <-- ( L-inv ) * v
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ integer ldmx, ldmy, ldmz
+ integer nx, ny, nz
+ integer k
+ double precision omega
+ double precision v( 5, -1:ldmx+2, -1:ldmy+2, *),
+ > ldz( 5, 5, ldmx, ldmy),
+ > ldy( 5, 5, ldmx, ldmy),
+ > ldx( 5, 5, ldmx, ldmy),
+ > d( 5, 5, ldmx, ldmy)
+ integer ist, iend
+ integer jst, jend
+ integer nx0, ny0
+ integer ipt, jpt
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, m, l, istp, iendp
+ integer iex
+ double precision tmp, tmp1
+ double precision tmat(5,5)
+
+
+c---------------------------------------------------------------------
+c receive data from north and west
+c---------------------------------------------------------------------
+ iex = 0
+ call exchange_1( v,k,iex )
+
+
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+
+ v( m, i, j, k ) = v( m, i, j, k )
+ > - omega * ( ldz( m, 1, i, j ) * v( 1, i, j, k-1 )
+ > + ldz( m, 2, i, j ) * v( 2, i, j, k-1 )
+ > + ldz( m, 3, i, j ) * v( 3, i, j, k-1 )
+ > + ldz( m, 4, i, j ) * v( 4, i, j, k-1 )
+ > + ldz( m, 5, i, j ) * v( 5, i, j, k-1 ) )
+
+ end do
+ end do
+ end do
+
+
+ do l = ist+jst, iend+jend
+ istp = max(l - jend, ist)
+ iendp = min(l - jst, iend)
+
+!dir$ ivdep
+ do i = istp, iendp
+ j = l - i
+
+!!dir$ unroll 5
+! manually unroll the loop
+! do m = 1, 5
+
+ v( 1, i, j, k ) = v( 1, i, j, k )
+ > - omega * ( ldy( 1, 1, i, j ) * v( 1, i, j-1, k )
+ > + ldx( 1, 1, i, j ) * v( 1, i-1, j, k )
+ > + ldy( 1, 2, i, j ) * v( 2, i, j-1, k )
+ > + ldx( 1, 2, i, j ) * v( 2, i-1, j, k )
+ > + ldy( 1, 3, i, j ) * v( 3, i, j-1, k )
+ > + ldx( 1, 3, i, j ) * v( 3, i-1, j, k )
+ > + ldy( 1, 4, i, j ) * v( 4, i, j-1, k )
+ > + ldx( 1, 4, i, j ) * v( 4, i-1, j, k )
+ > + ldy( 1, 5, i, j ) * v( 5, i, j-1, k )
+ > + ldx( 1, 5, i, j ) * v( 5, i-1, j, k ) )
+ v( 2, i, j, k ) = v( 2, i, j, k )
+ > - omega * ( ldy( 2, 1, i, j ) * v( 1, i, j-1, k )
+ > + ldx( 2, 1, i, j ) * v( 1, i-1, j, k )
+ > + ldy( 2, 2, i, j ) * v( 2, i, j-1, k )
+ > + ldx( 2, 2, i, j ) * v( 2, i-1, j, k )
+ > + ldy( 2, 3, i, j ) * v( 3, i, j-1, k )
+ > + ldx( 2, 3, i, j ) * v( 3, i-1, j, k )
+ > + ldy( 2, 4, i, j ) * v( 4, i, j-1, k )
+ > + ldx( 2, 4, i, j ) * v( 4, i-1, j, k )
+ > + ldy( 2, 5, i, j ) * v( 5, i, j-1, k )
+ > + ldx( 2, 5, i, j ) * v( 5, i-1, j, k ) )
+ v( 3, i, j, k ) = v( 3, i, j, k )
+ > - omega * ( ldy( 3, 1, i, j ) * v( 1, i, j-1, k )
+ > + ldx( 3, 1, i, j ) * v( 1, i-1, j, k )
+ > + ldy( 3, 2, i, j ) * v( 2, i, j-1, k )
+ > + ldx( 3, 2, i, j ) * v( 2, i-1, j, k )
+ > + ldy( 3, 3, i, j ) * v( 3, i, j-1, k )
+ > + ldx( 3, 3, i, j ) * v( 3, i-1, j, k )
+ > + ldy( 3, 4, i, j ) * v( 4, i, j-1, k )
+ > + ldx( 3, 4, i, j ) * v( 4, i-1, j, k )
+ > + ldy( 3, 5, i, j ) * v( 5, i, j-1, k )
+ > + ldx( 3, 5, i, j ) * v( 5, i-1, j, k ) )
+ v( 4, i, j, k ) = v( 4, i, j, k )
+ > - omega * ( ldy( 4, 1, i, j ) * v( 1, i, j-1, k )
+ > + ldx( 4, 1, i, j ) * v( 1, i-1, j, k )
+ > + ldy( 4, 2, i, j ) * v( 2, i, j-1, k )
+ > + ldx( 4, 2, i, j ) * v( 2, i-1, j, k )
+ > + ldy( 4, 3, i, j ) * v( 3, i, j-1, k )
+ > + ldx( 4, 3, i, j ) * v( 3, i-1, j, k )
+ > + ldy( 4, 4, i, j ) * v( 4, i, j-1, k )
+ > + ldx( 4, 4, i, j ) * v( 4, i-1, j, k )
+ > + ldy( 4, 5, i, j ) * v( 5, i, j-1, k )
+ > + ldx( 4, 5, i, j ) * v( 5, i-1, j, k ) )
+ v( 5, i, j, k ) = v( 5, i, j, k )
+ > - omega * ( ldy( 5, 1, i, j ) * v( 1, i, j-1, k )
+ > + ldx( 5, 1, i, j ) * v( 1, i-1, j, k )
+ > + ldy( 5, 2, i, j ) * v( 2, i, j-1, k )
+ > + ldx( 5, 2, i, j ) * v( 2, i-1, j, k )
+ > + ldy( 5, 3, i, j ) * v( 3, i, j-1, k )
+ > + ldx( 5, 3, i, j ) * v( 3, i-1, j, k )
+ > + ldy( 5, 4, i, j ) * v( 4, i, j-1, k )
+ > + ldx( 5, 4, i, j ) * v( 4, i-1, j, k )
+ > + ldy( 5, 5, i, j ) * v( 5, i, j-1, k )
+ > + ldx( 5, 5, i, j ) * v( 5, i-1, j, k ) )
+
+! end do
+
+c---------------------------------------------------------------------
+c diagonal block inversion
+c
+c forward elimination
+c---------------------------------------------------------------------
+!!dir$ unroll 5
+! manually unroll the loop
+! do m = 1, 5
+ tmat( 1, 1 ) = d( 1, 1, i, j )
+ tmat( 1, 2 ) = d( 1, 2, i, j )
+ tmat( 1, 3 ) = d( 1, 3, i, j )
+ tmat( 1, 4 ) = d( 1, 4, i, j )
+ tmat( 1, 5 ) = d( 1, 5, i, j )
+ tmat( 2, 1 ) = d( 2, 1, i, j )
+ tmat( 2, 2 ) = d( 2, 2, i, j )
+ tmat( 2, 3 ) = d( 2, 3, i, j )
+ tmat( 2, 4 ) = d( 2, 4, i, j )
+ tmat( 2, 5 ) = d( 2, 5, i, j )
+ tmat( 3, 1 ) = d( 3, 1, i, j )
+ tmat( 3, 2 ) = d( 3, 2, i, j )
+ tmat( 3, 3 ) = d( 3, 3, i, j )
+ tmat( 3, 4 ) = d( 3, 4, i, j )
+ tmat( 3, 5 ) = d( 3, 5, i, j )
+ tmat( 4, 1 ) = d( 4, 1, i, j )
+ tmat( 4, 2 ) = d( 4, 2, i, j )
+ tmat( 4, 3 ) = d( 4, 3, i, j )
+ tmat( 4, 4 ) = d( 4, 4, i, j )
+ tmat( 4, 5 ) = d( 4, 5, i, j )
+ tmat( 5, 1 ) = d( 5, 1, i, j )
+ tmat( 5, 2 ) = d( 5, 2, i, j )
+ tmat( 5, 3 ) = d( 5, 3, i, j )
+ tmat( 5, 4 ) = d( 5, 4, i, j )
+ tmat( 5, 5 ) = d( 5, 5, i, j )
+! end do
+
+ tmp1 = 1.0d+00 / tmat( 1, 1 )
+ tmp = tmp1 * tmat( 2, 1 )
+ tmat( 2, 2 ) = tmat( 2, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 2, 3 ) = tmat( 2, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 2, 4 ) = tmat( 2, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 2, 5 ) = tmat( 2, 5 )
+ > - tmp * tmat( 1, 5 )
+ v( 2, i, j, k ) = v( 2, i, j, k )
+ > - v( 1, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 3, 1 )
+ tmat( 3, 2 ) = tmat( 3, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 3, 3 ) = tmat( 3, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 3, 4 ) = tmat( 3, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 3, 5 ) = tmat( 3, 5 )
+ > - tmp * tmat( 1, 5 )
+ v( 3, i, j, k ) = v( 3, i, j, k )
+ > - v( 1, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 4, 1 )
+ tmat( 4, 2 ) = tmat( 4, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 4, 3 ) = tmat( 4, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 1, 5 )
+ v( 4, i, j, k ) = v( 4, i, j, k )
+ > - v( 1, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 5, 1 )
+ tmat( 5, 2 ) = tmat( 5, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 5, 3 ) = tmat( 5, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 1, 5 )
+ v( 5, i, j, k ) = v( 5, i, j, k )
+ > - v( 1, i, j, k ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 2, 2 )
+ tmp = tmp1 * tmat( 3, 2 )
+ tmat( 3, 3 ) = tmat( 3, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 3, 4 ) = tmat( 3, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 3, 5 ) = tmat( 3, 5 )
+ > - tmp * tmat( 2, 5 )
+ v( 3, i, j, k ) = v( 3, i, j, k )
+ > - v( 2, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 4, 2 )
+ tmat( 4, 3 ) = tmat( 4, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 2, 5 )
+ v( 4, i, j, k ) = v( 4, i, j, k )
+ > - v( 2, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 5, 2 )
+ tmat( 5, 3 ) = tmat( 5, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 2, 5 )
+ v( 5, i, j, k ) = v( 5, i, j, k )
+ > - v( 2, i, j, k ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 3, 3 )
+ tmp = tmp1 * tmat( 4, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 3, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 3, 5 )
+ v( 4, i, j, k ) = v( 4, i, j, k )
+ > - v( 3, i, j, k ) * tmp
+
+ tmp = tmp1 * tmat( 5, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 3, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 3, 5 )
+ v( 5, i, j, k ) = v( 5, i, j, k )
+ > - v( 3, i, j, k ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 4, 4 )
+ tmp = tmp1 * tmat( 5, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 4, 5 )
+ v( 5, i, j, k ) = v( 5, i, j, k )
+ > - v( 4, i, j, k ) * tmp
+
+c---------------------------------------------------------------------
+c back substitution
+c---------------------------------------------------------------------
+ v( 5, i, j, k ) = v( 5, i, j, k )
+ > / tmat( 5, 5 )
+
+ v( 4, i, j, k ) = v( 4, i, j, k )
+ > - tmat( 4, 5 ) * v( 5, i, j, k )
+ v( 4, i, j, k ) = v( 4, i, j, k )
+ > / tmat( 4, 4 )
+
+ v( 3, i, j, k ) = v( 3, i, j, k )
+ > - tmat( 3, 4 ) * v( 4, i, j, k )
+ > - tmat( 3, 5 ) * v( 5, i, j, k )
+ v( 3, i, j, k ) = v( 3, i, j, k )
+ > / tmat( 3, 3 )
+
+ v( 2, i, j, k ) = v( 2, i, j, k )
+ > - tmat( 2, 3 ) * v( 3, i, j, k )
+ > - tmat( 2, 4 ) * v( 4, i, j, k )
+ > - tmat( 2, 5 ) * v( 5, i, j, k )
+ v( 2, i, j, k ) = v( 2, i, j, k )
+ > / tmat( 2, 2 )
+
+ v( 1, i, j, k ) = v( 1, i, j, k )
+ > - tmat( 1, 2 ) * v( 2, i, j, k )
+ > - tmat( 1, 3 ) * v( 3, i, j, k )
+ > - tmat( 1, 4 ) * v( 4, i, j, k )
+ > - tmat( 1, 5 ) * v( 5, i, j, k )
+ v( 1, i, j, k ) = v( 1, i, j, k )
+ > / tmat( 1, 1 )
+
+
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c send data to east and south
+c---------------------------------------------------------------------
+ iex = 2
+ call exchange_1( v,k,iex )
+
+ return
+ end
+
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine buts( ldmx, ldmy, ldmz,
+ > nx, ny, nz, k,
+ > omega,
+ > v, tv,
+ > d, udx, udy, udz,
+ > ist, iend, jst, jend,
+ > nx0, ny0, ipt, jpt )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c compute the regular-sparse, block upper triangular solution:
+c
+c v <-- ( U-inv ) * v
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ integer ldmx, ldmy, ldmz
+ integer nx, ny, nz
+ integer k
+ double precision omega
+ double precision v( 5, -1:ldmx+2, -1:ldmy+2, *),
+ > tv(5, ldmx, ldmy),
+ > d( 5, 5, ldmx, ldmy),
+ > udx( 5, 5, ldmx, ldmy),
+ > udy( 5, 5, ldmx, ldmy),
+ > udz( 5, 5, ldmx, ldmy )
+ integer ist, iend
+ integer jst, jend
+ integer nx0, ny0
+ integer ipt, jpt
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, m
+ integer iex
+ double precision tmp, tmp1
+ double precision tmat(5,5)
+
+
+c---------------------------------------------------------------------
+c receive data from south and east
+c---------------------------------------------------------------------
+ iex = 1
+ call exchange_1( v,k,iex )
+
+ do j = jend, jst, -1
+ do i = iend, ist, -1
+ do m = 1, 5
+ tv( m, i, j ) =
+ > omega * ( udz( m, 1, i, j ) * v( 1, i, j, k+1 )
+ > + udz( m, 2, i, j ) * v( 2, i, j, k+1 )
+ > + udz( m, 3, i, j ) * v( 3, i, j, k+1 )
+ > + udz( m, 4, i, j ) * v( 4, i, j, k+1 )
+ > + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) )
+ end do
+ end do
+ end do
+
+
+ do j = jend,jst,-1
+ do i = iend,ist,-1
+
+ do m = 1, 5
+ tv( m, i, j ) = tv( m, i, j )
+ > + omega * ( udy( m, 1, i, j ) * v( 1, i, j+1, k )
+ > + udx( m, 1, i, j ) * v( 1, i+1, j, k )
+ > + udy( m, 2, i, j ) * v( 2, i, j+1, k )
+ > + udx( m, 2, i, j ) * v( 2, i+1, j, k )
+ > + udy( m, 3, i, j ) * v( 3, i, j+1, k )
+ > + udx( m, 3, i, j ) * v( 3, i+1, j, k )
+ > + udy( m, 4, i, j ) * v( 4, i, j+1, k )
+ > + udx( m, 4, i, j ) * v( 4, i+1, j, k )
+ > + udy( m, 5, i, j ) * v( 5, i, j+1, k )
+ > + udx( m, 5, i, j ) * v( 5, i+1, j, k ) )
+ end do
+
+c---------------------------------------------------------------------
+c diagonal block inversion
+c---------------------------------------------------------------------
+ do m = 1, 5
+ tmat( m, 1 ) = d( m, 1, i, j )
+ tmat( m, 2 ) = d( m, 2, i, j )
+ tmat( m, 3 ) = d( m, 3, i, j )
+ tmat( m, 4 ) = d( m, 4, i, j )
+ tmat( m, 5 ) = d( m, 5, i, j )
+ end do
+
+ tmp1 = 1.0d+00 / tmat( 1, 1 )
+ tmp = tmp1 * tmat( 2, 1 )
+ tmat( 2, 2 ) = tmat( 2, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 2, 3 ) = tmat( 2, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 2, 4 ) = tmat( 2, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 2, 5 ) = tmat( 2, 5 )
+ > - tmp * tmat( 1, 5 )
+ tv( 2, i, j ) = tv( 2, i, j )
+ > - tv( 1, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 3, 1 )
+ tmat( 3, 2 ) = tmat( 3, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 3, 3 ) = tmat( 3, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 3, 4 ) = tmat( 3, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 3, 5 ) = tmat( 3, 5 )
+ > - tmp * tmat( 1, 5 )
+ tv( 3, i, j ) = tv( 3, i, j )
+ > - tv( 1, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 4, 1 )
+ tmat( 4, 2 ) = tmat( 4, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 4, 3 ) = tmat( 4, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 1, 5 )
+ tv( 4, i, j ) = tv( 4, i, j )
+ > - tv( 1, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 5, 1 )
+ tmat( 5, 2 ) = tmat( 5, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 5, 3 ) = tmat( 5, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 1, 5 )
+ tv( 5, i, j ) = tv( 5, i, j )
+ > - tv( 1, i, j ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 2, 2 )
+ tmp = tmp1 * tmat( 3, 2 )
+ tmat( 3, 3 ) = tmat( 3, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 3, 4 ) = tmat( 3, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 3, 5 ) = tmat( 3, 5 )
+ > - tmp * tmat( 2, 5 )
+ tv( 3, i, j ) = tv( 3, i, j )
+ > - tv( 2, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 4, 2 )
+ tmat( 4, 3 ) = tmat( 4, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 2, 5 )
+ tv( 4, i, j ) = tv( 4, i, j )
+ > - tv( 2, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 5, 2 )
+ tmat( 5, 3 ) = tmat( 5, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 2, 5 )
+ tv( 5, i, j ) = tv( 5, i, j )
+ > - tv( 2, i, j ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 3, 3 )
+ tmp = tmp1 * tmat( 4, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 3, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 3, 5 )
+ tv( 4, i, j ) = tv( 4, i, j )
+ > - tv( 3, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 5, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 3, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 3, 5 )
+ tv( 5, i, j ) = tv( 5, i, j )
+ > - tv( 3, i, j ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 4, 4 )
+ tmp = tmp1 * tmat( 5, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 4, 5 )
+ tv( 5, i, j ) = tv( 5, i, j )
+ > - tv( 4, i, j ) * tmp
+
+c---------------------------------------------------------------------
+c back substitution
+c---------------------------------------------------------------------
+ tv( 5, i, j ) = tv( 5, i, j )
+ > / tmat( 5, 5 )
+
+ tv( 4, i, j ) = tv( 4, i, j )
+ > - tmat( 4, 5 ) * tv( 5, i, j )
+ tv( 4, i, j ) = tv( 4, i, j )
+ > / tmat( 4, 4 )
+
+ tv( 3, i, j ) = tv( 3, i, j )
+ > - tmat( 3, 4 ) * tv( 4, i, j )
+ > - tmat( 3, 5 ) * tv( 5, i, j )
+ tv( 3, i, j ) = tv( 3, i, j )
+ > / tmat( 3, 3 )
+
+ tv( 2, i, j ) = tv( 2, i, j )
+ > - tmat( 2, 3 ) * tv( 3, i, j )
+ > - tmat( 2, 4 ) * tv( 4, i, j )
+ > - tmat( 2, 5 ) * tv( 5, i, j )
+ tv( 2, i, j ) = tv( 2, i, j )
+ > / tmat( 2, 2 )
+
+ tv( 1, i, j ) = tv( 1, i, j )
+ > - tmat( 1, 2 ) * tv( 2, i, j )
+ > - tmat( 1, 3 ) * tv( 3, i, j )
+ > - tmat( 1, 4 ) * tv( 4, i, j )
+ > - tmat( 1, 5 ) * tv( 5, i, j )
+ tv( 1, i, j ) = tv( 1, i, j )
+ > / tmat( 1, 1 )
+
+ v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j )
+ v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j )
+ v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j )
+ v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j )
+ v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j )
+
+
+ enddo
+ end do
+
+c---------------------------------------------------------------------
+c send data to north and west
+c---------------------------------------------------------------------
+ iex = 3
+ call exchange_1( v,k,iex )
+
+ return
+ end
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine buts( ldmx, ldmy, ldmz,
+ > nx, ny, nz, k,
+ > omega,
+ > v, tv,
+ > d, udx, udy, udz,
+ > ist, iend, jst, jend,
+ > nx0, ny0, ipt, jpt )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c compute the regular-sparse, block upper triangular solution:
+c
+c v <-- ( U-inv ) * v
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ integer ldmx, ldmy, ldmz
+ integer nx, ny, nz
+ integer k
+ double precision omega
+ double precision v( 5, -1:ldmx+2, -1:ldmy+2, *),
+ > tv(5, ldmx, ldmy),
+ > d( 5, 5, ldmx, ldmy),
+ > udx( 5, 5, ldmx, ldmy),
+ > udy( 5, 5, ldmx, ldmy),
+ > udz( 5, 5, ldmx, ldmy )
+ integer ist, iend
+ integer jst, jend
+ integer nx0, ny0
+ integer ipt, jpt
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, m, l, istp, iendp
+ integer iex
+ double precision tmp, tmp1
+ double precision tmat(5,5)
+
+
+c---------------------------------------------------------------------
+c receive data from south and east
+c---------------------------------------------------------------------
+ iex = 1
+ call exchange_1( v,k,iex )
+
+ do j = jend, jst, -1
+ do i = iend, ist, -1
+ do m = 1, 5
+ tv( m, i, j ) =
+ > omega * ( udz( m, 1, i, j ) * v( 1, i, j, k+1 )
+ > + udz( m, 2, i, j ) * v( 2, i, j, k+1 )
+ > + udz( m, 3, i, j ) * v( 3, i, j, k+1 )
+ > + udz( m, 4, i, j ) * v( 4, i, j, k+1 )
+ > + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) )
+ end do
+ end do
+ end do
+
+
+ do l = iend+jend, ist+jst, -1
+ istp = max(l - jend, ist)
+ iendp = min(l - jst, iend)
+
+!dir$ ivdep
+ do i = istp, iendp
+ j = l - i
+
+!!dir$ unroll 5
+! manually unroll the loop
+! do m = 1, 5
+ tv( 1, i, j ) = tv( 1, i, j )
+ > + omega * ( udy( 1, 1, i, j ) * v( 1, i, j+1, k )
+ > + udx( 1, 1, i, j ) * v( 1, i+1, j, k )
+ > + udy( 1, 2, i, j ) * v( 2, i, j+1, k )
+ > + udx( 1, 2, i, j ) * v( 2, i+1, j, k )
+ > + udy( 1, 3, i, j ) * v( 3, i, j+1, k )
+ > + udx( 1, 3, i, j ) * v( 3, i+1, j, k )
+ > + udy( 1, 4, i, j ) * v( 4, i, j+1, k )
+ > + udx( 1, 4, i, j ) * v( 4, i+1, j, k )
+ > + udy( 1, 5, i, j ) * v( 5, i, j+1, k )
+ > + udx( 1, 5, i, j ) * v( 5, i+1, j, k ) )
+ tv( 2, i, j ) = tv( 2, i, j )
+ > + omega * ( udy( 2, 1, i, j ) * v( 1, i, j+1, k )
+ > + udx( 2, 1, i, j ) * v( 1, i+1, j, k )
+ > + udy( 2, 2, i, j ) * v( 2, i, j+1, k )
+ > + udx( 2, 2, i, j ) * v( 2, i+1, j, k )
+ > + udy( 2, 3, i, j ) * v( 3, i, j+1, k )
+ > + udx( 2, 3, i, j ) * v( 3, i+1, j, k )
+ > + udy( 2, 4, i, j ) * v( 4, i, j+1, k )
+ > + udx( 2, 4, i, j ) * v( 4, i+1, j, k )
+ > + udy( 2, 5, i, j ) * v( 5, i, j+1, k )
+ > + udx( 2, 5, i, j ) * v( 5, i+1, j, k ) )
+ tv( 3, i, j ) = tv( 3, i, j )
+ > + omega * ( udy( 3, 1, i, j ) * v( 1, i, j+1, k )
+ > + udx( 3, 1, i, j ) * v( 1, i+1, j, k )
+ > + udy( 3, 2, i, j ) * v( 2, i, j+1, k )
+ > + udx( 3, 2, i, j ) * v( 2, i+1, j, k )
+ > + udy( 3, 3, i, j ) * v( 3, i, j+1, k )
+ > + udx( 3, 3, i, j ) * v( 3, i+1, j, k )
+ > + udy( 3, 4, i, j ) * v( 4, i, j+1, k )
+ > + udx( 3, 4, i, j ) * v( 4, i+1, j, k )
+ > + udy( 3, 5, i, j ) * v( 5, i, j+1, k )
+ > + udx( 3, 5, i, j ) * v( 5, i+1, j, k ) )
+ tv( 4, i, j ) = tv( 4, i, j )
+ > + omega * ( udy( 4, 1, i, j ) * v( 1, i, j+1, k )
+ > + udx( 4, 1, i, j ) * v( 1, i+1, j, k )
+ > + udy( 4, 2, i, j ) * v( 2, i, j+1, k )
+ > + udx( 4, 2, i, j ) * v( 2, i+1, j, k )
+ > + udy( 4, 3, i, j ) * v( 3, i, j+1, k )
+ > + udx( 4, 3, i, j ) * v( 3, i+1, j, k )
+ > + udy( 4, 4, i, j ) * v( 4, i, j+1, k )
+ > + udx( 4, 4, i, j ) * v( 4, i+1, j, k )
+ > + udy( 4, 5, i, j ) * v( 5, i, j+1, k )
+ > + udx( 4, 5, i, j ) * v( 5, i+1, j, k ) )
+ tv( 5, i, j ) = tv( 5, i, j )
+ > + omega * ( udy( 5, 1, i, j ) * v( 1, i, j+1, k )
+ > + udx( 5, 1, i, j ) * v( 1, i+1, j, k )
+ > + udy( 5, 2, i, j ) * v( 2, i, j+1, k )
+ > + udx( 5, 2, i, j ) * v( 2, i+1, j, k )
+ > + udy( 5, 3, i, j ) * v( 3, i, j+1, k )
+ > + udx( 5, 3, i, j ) * v( 3, i+1, j, k )
+ > + udy( 5, 4, i, j ) * v( 4, i, j+1, k )
+ > + udx( 5, 4, i, j ) * v( 4, i+1, j, k )
+ > + udy( 5, 5, i, j ) * v( 5, i, j+1, k )
+ > + udx( 5, 5, i, j ) * v( 5, i+1, j, k ) )
+! end do
+
+c---------------------------------------------------------------------
+c diagonal block inversion
+c---------------------------------------------------------------------
+!!dir$ unroll 5
+! manually unroll the loop
+! do m = 1, 5
+ tmat( 1, 1 ) = d( 1, 1, i, j )
+ tmat( 1, 2 ) = d( 1, 2, i, j )
+ tmat( 1, 3 ) = d( 1, 3, i, j )
+ tmat( 1, 4 ) = d( 1, 4, i, j )
+ tmat( 1, 5 ) = d( 1, 5, i, j )
+ tmat( 2, 1 ) = d( 2, 1, i, j )
+ tmat( 2, 2 ) = d( 2, 2, i, j )
+ tmat( 2, 3 ) = d( 2, 3, i, j )
+ tmat( 2, 4 ) = d( 2, 4, i, j )
+ tmat( 2, 5 ) = d( 2, 5, i, j )
+ tmat( 3, 1 ) = d( 3, 1, i, j )
+ tmat( 3, 2 ) = d( 3, 2, i, j )
+ tmat( 3, 3 ) = d( 3, 3, i, j )
+ tmat( 3, 4 ) = d( 3, 4, i, j )
+ tmat( 3, 5 ) = d( 3, 5, i, j )
+ tmat( 4, 1 ) = d( 4, 1, i, j )
+ tmat( 4, 2 ) = d( 4, 2, i, j )
+ tmat( 4, 3 ) = d( 4, 3, i, j )
+ tmat( 4, 4 ) = d( 4, 4, i, j )
+ tmat( 4, 5 ) = d( 4, 5, i, j )
+ tmat( 5, 1 ) = d( 5, 1, i, j )
+ tmat( 5, 2 ) = d( 5, 2, i, j )
+ tmat( 5, 3 ) = d( 5, 3, i, j )
+ tmat( 5, 4 ) = d( 5, 4, i, j )
+ tmat( 5, 5 ) = d( 5, 5, i, j )
+! end do
+
+ tmp1 = 1.0d+00 / tmat( 1, 1 )
+ tmp = tmp1 * tmat( 2, 1 )
+ tmat( 2, 2 ) = tmat( 2, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 2, 3 ) = tmat( 2, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 2, 4 ) = tmat( 2, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 2, 5 ) = tmat( 2, 5 )
+ > - tmp * tmat( 1, 5 )
+ tv( 2, i, j ) = tv( 2, i, j )
+ > - tv( 1, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 3, 1 )
+ tmat( 3, 2 ) = tmat( 3, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 3, 3 ) = tmat( 3, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 3, 4 ) = tmat( 3, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 3, 5 ) = tmat( 3, 5 )
+ > - tmp * tmat( 1, 5 )
+ tv( 3, i, j ) = tv( 3, i, j )
+ > - tv( 1, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 4, 1 )
+ tmat( 4, 2 ) = tmat( 4, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 4, 3 ) = tmat( 4, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 1, 5 )
+ tv( 4, i, j ) = tv( 4, i, j )
+ > - tv( 1, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 5, 1 )
+ tmat( 5, 2 ) = tmat( 5, 2 )
+ > - tmp * tmat( 1, 2 )
+ tmat( 5, 3 ) = tmat( 5, 3 )
+ > - tmp * tmat( 1, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 1, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 1, 5 )
+ tv( 5, i, j ) = tv( 5, i, j )
+ > - tv( 1, i, j ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 2, 2 )
+ tmp = tmp1 * tmat( 3, 2 )
+ tmat( 3, 3 ) = tmat( 3, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 3, 4 ) = tmat( 3, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 3, 5 ) = tmat( 3, 5 )
+ > - tmp * tmat( 2, 5 )
+ tv( 3, i, j ) = tv( 3, i, j )
+ > - tv( 2, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 4, 2 )
+ tmat( 4, 3 ) = tmat( 4, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 2, 5 )
+ tv( 4, i, j ) = tv( 4, i, j )
+ > - tv( 2, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 5, 2 )
+ tmat( 5, 3 ) = tmat( 5, 3 )
+ > - tmp * tmat( 2, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 2, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 2, 5 )
+ tv( 5, i, j ) = tv( 5, i, j )
+ > - tv( 2, i, j ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 3, 3 )
+ tmp = tmp1 * tmat( 4, 3 )
+ tmat( 4, 4 ) = tmat( 4, 4 )
+ > - tmp * tmat( 3, 4 )
+ tmat( 4, 5 ) = tmat( 4, 5 )
+ > - tmp * tmat( 3, 5 )
+ tv( 4, i, j ) = tv( 4, i, j )
+ > - tv( 3, i, j ) * tmp
+
+ tmp = tmp1 * tmat( 5, 3 )
+ tmat( 5, 4 ) = tmat( 5, 4 )
+ > - tmp * tmat( 3, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 3, 5 )
+ tv( 5, i, j ) = tv( 5, i, j )
+ > - tv( 3, i, j ) * tmp
+
+
+
+ tmp1 = 1.0d+00 / tmat( 4, 4 )
+ tmp = tmp1 * tmat( 5, 4 )
+ tmat( 5, 5 ) = tmat( 5, 5 )
+ > - tmp * tmat( 4, 5 )
+ tv( 5, i, j ) = tv( 5, i, j )
+ > - tv( 4, i, j ) * tmp
+
+c---------------------------------------------------------------------
+c back substitution
+c---------------------------------------------------------------------
+ tv( 5, i, j ) = tv( 5, i, j )
+ > / tmat( 5, 5 )
+
+ tv( 4, i, j ) = tv( 4, i, j )
+ > - tmat( 4, 5 ) * tv( 5, i, j )
+ tv( 4, i, j ) = tv( 4, i, j )
+ > / tmat( 4, 4 )
+
+ tv( 3, i, j ) = tv( 3, i, j )
+ > - tmat( 3, 4 ) * tv( 4, i, j )
+ > - tmat( 3, 5 ) * tv( 5, i, j )
+ tv( 3, i, j ) = tv( 3, i, j )
+ > / tmat( 3, 3 )
+
+ tv( 2, i, j ) = tv( 2, i, j )
+ > - tmat( 2, 3 ) * tv( 3, i, j )
+ > - tmat( 2, 4 ) * tv( 4, i, j )
+ > - tmat( 2, 5 ) * tv( 5, i, j )
+ tv( 2, i, j ) = tv( 2, i, j )
+ > / tmat( 2, 2 )
+
+ tv( 1, i, j ) = tv( 1, i, j )
+ > - tmat( 1, 2 ) * tv( 2, i, j )
+ > - tmat( 1, 3 ) * tv( 3, i, j )
+ > - tmat( 1, 4 ) * tv( 4, i, j )
+ > - tmat( 1, 5 ) * tv( 5, i, j )
+ tv( 1, i, j ) = tv( 1, i, j )
+ > / tmat( 1, 1 )
+
+ v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j )
+ v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j )
+ v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j )
+ v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j )
+ v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j )
+
+
+ enddo
+ end do
+
+c---------------------------------------------------------------------
+c send data to north and west
+c---------------------------------------------------------------------
+ iex = 3
+ call exchange_1( v,k,iex )
+
+ return
+ end
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine erhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c compute the right hand side based on exact solution
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, k, m
+ integer iglob, jglob
+ integer iex
+ integer L1, L2
+ integer ist1, iend1
+ integer jst1, jend1
+ double precision dsspm
+ double precision xi, eta, zeta
+ double precision q
+ double precision u21, u31, u41
+ double precision tmp
+ double precision u21i, u31i, u41i, u51i
+ double precision u21j, u31j, u41j, u51j
+ double precision u21k, u31k, u41k, u51k
+ double precision u21im1, u31im1, u41im1, u51im1
+ double precision u21jm1, u31jm1, u41jm1, u51jm1
+ double precision u21km1, u31km1, u41km1, u51km1
+
+ dsspm = dssp
+
+
+ do k = 1, nz
+ do j = 1, ny
+ do i = 1, nx
+ do m = 1, 5
+ frct( m, i, j, k ) = 0.0d+00
+ end do
+ end do
+ end do
+ end do
+
+ do k = 1, nz
+ zeta = ( dble(k-1) ) / ( nz - 1 )
+ do j = 1, ny
+ jglob = jpt + j
+ eta = ( dble(jglob-1) ) / ( ny0 - 1 )
+ do i = 1, nx
+ iglob = ipt + i
+ xi = ( dble(iglob-1) ) / ( nx0 - 1 )
+ do m = 1, 5
+ rsd(m,i,j,k) = ce(m,1)
+ > + ce(m,2) * xi
+ > + ce(m,3) * eta
+ > + ce(m,4) * zeta
+ > + ce(m,5) * xi * xi
+ > + ce(m,6) * eta * eta
+ > + ce(m,7) * zeta * zeta
+ > + ce(m,8) * xi * xi * xi
+ > + ce(m,9) * eta * eta * eta
+ > + ce(m,10) * zeta * zeta * zeta
+ > + ce(m,11) * xi * xi * xi * xi
+ > + ce(m,12) * eta * eta * eta * eta
+ > + ce(m,13) * zeta * zeta * zeta * zeta
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c xi-direction flux differences
+c---------------------------------------------------------------------
+c
+c iex = flag : iex = 0 north/south communication
+c : iex = 1 east/west communication
+c
+c---------------------------------------------------------------------
+ iex = 0
+
+c---------------------------------------------------------------------
+c communicate and receive/send two rows of data
+c---------------------------------------------------------------------
+ call exchange_3 (rsd,iex)
+
+ L1 = 0
+ if (north.eq.-1) L1 = 1
+ L2 = nx + 1
+ if (south.eq.-1) L2 = nx
+
+ ist1 = 1
+ iend1 = nx
+ if (north.eq.-1) ist1 = 4
+ if (south.eq.-1) iend1 = nx - 3
+
+ do k = 2, nz - 1
+ do j = jst, jend
+ do i = L1, L2
+ flux(1,i,j,k) = rsd(2,i,j,k)
+ u21 = rsd(2,i,j,k) / rsd(1,i,j,k)
+ q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k)
+ > + rsd(3,i,j,k) * rsd(3,i,j,k)
+ > + rsd(4,i,j,k) * rsd(4,i,j,k) )
+ > / rsd(1,i,j,k)
+ flux(2,i,j,k) = rsd(2,i,j,k) * u21 + c2 *
+ > ( rsd(5,i,j,k) - q )
+ flux(3,i,j,k) = rsd(3,i,j,k) * u21
+ flux(4,i,j,k) = rsd(4,i,j,k) * u21
+ flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u21
+ end do
+ end do
+ end do
+
+ do k = 2, nz - 1
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ frct(m,i,j,k) = frct(m,i,j,k)
+ > - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) )
+ end do
+ end do
+ do i = ist, L2
+ tmp = 1.0d+00 / rsd(1,i,j,k)
+
+ u21i = tmp * rsd(2,i,j,k)
+ u31i = tmp * rsd(3,i,j,k)
+ u41i = tmp * rsd(4,i,j,k)
+ u51i = tmp * rsd(5,i,j,k)
+
+ tmp = 1.0d+00 / rsd(1,i-1,j,k)
+
+ u21im1 = tmp * rsd(2,i-1,j,k)
+ u31im1 = tmp * rsd(3,i-1,j,k)
+ u41im1 = tmp * rsd(4,i-1,j,k)
+ u51im1 = tmp * rsd(5,i-1,j,k)
+
+ flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 *
+ > ( u21i - u21im1 )
+ flux(3,i,j,k) = tx3 * ( u31i - u31im1 )
+ flux(4,i,j,k) = tx3 * ( u41i - u41im1 )
+ flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+ > * tx3 * ( ( u21i **2 + u31i **2 + u41i **2 )
+ > - ( u21im1**2 + u31im1**2 + u41im1**2 ) )
+ > + (1.0d+00/6.0d+00)
+ > * tx3 * ( u21i**2 - u21im1**2 )
+ > + c1 * c5 * tx3 * ( u51i - u51im1 )
+ end do
+
+ do i = ist, iend
+ frct(1,i,j,k) = frct(1,i,j,k)
+ > + dx1 * tx1 * ( rsd(1,i-1,j,k)
+ > - 2.0d+00 * rsd(1,i,j,k)
+ > + rsd(1,i+1,j,k) )
+ frct(2,i,j,k) = frct(2,i,j,k)
+ > + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) )
+ > + dx2 * tx1 * ( rsd(2,i-1,j,k)
+ > - 2.0d+00 * rsd(2,i,j,k)
+ > + rsd(2,i+1,j,k) )
+ frct(3,i,j,k) = frct(3,i,j,k)
+ > + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) )
+ > + dx3 * tx1 * ( rsd(3,i-1,j,k)
+ > - 2.0d+00 * rsd(3,i,j,k)
+ > + rsd(3,i+1,j,k) )
+ frct(4,i,j,k) = frct(4,i,j,k)
+ > + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) )
+ > + dx4 * tx1 * ( rsd(4,i-1,j,k)
+ > - 2.0d+00 * rsd(4,i,j,k)
+ > + rsd(4,i+1,j,k) )
+ frct(5,i,j,k) = frct(5,i,j,k)
+ > + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) )
+ > + dx5 * tx1 * ( rsd(5,i-1,j,k)
+ > - 2.0d+00 * rsd(5,i,j,k)
+ > + rsd(5,i+1,j,k) )
+ end do
+
+c---------------------------------------------------------------------
+c Fourth-order dissipation
+c---------------------------------------------------------------------
+ IF (north.eq.-1) then
+ do m = 1, 5
+ frct(m,2,j,k) = frct(m,2,j,k)
+ > - dsspm * ( + 5.0d+00 * rsd(m,2,j,k)
+ > - 4.0d+00 * rsd(m,3,j,k)
+ > + rsd(m,4,j,k) )
+ frct(m,3,j,k) = frct(m,3,j,k)
+ > - dsspm * ( - 4.0d+00 * rsd(m,2,j,k)
+ > + 6.0d+00 * rsd(m,3,j,k)
+ > - 4.0d+00 * rsd(m,4,j,k)
+ > + rsd(m,5,j,k) )
+ end do
+ END IF
+
+ do i = ist1,iend1
+ do m = 1, 5
+ frct(m,i,j,k) = frct(m,i,j,k)
+ > - dsspm * ( rsd(m,i-2,j,k)
+ > - 4.0d+00 * rsd(m,i-1,j,k)
+ > + 6.0d+00 * rsd(m,i,j,k)
+ > - 4.0d+00 * rsd(m,i+1,j,k)
+ > + rsd(m,i+2,j,k) )
+ end do
+ end do
+
+ IF (south.eq.-1) then
+ do m = 1, 5
+ frct(m,nx-2,j,k) = frct(m,nx-2,j,k)
+ > - dsspm * ( rsd(m,nx-4,j,k)
+ > - 4.0d+00 * rsd(m,nx-3,j,k)
+ > + 6.0d+00 * rsd(m,nx-2,j,k)
+ > - 4.0d+00 * rsd(m,nx-1,j,k) )
+ frct(m,nx-1,j,k) = frct(m,nx-1,j,k)
+ > - dsspm * ( rsd(m,nx-3,j,k)
+ > - 4.0d+00 * rsd(m,nx-2,j,k)
+ > + 5.0d+00 * rsd(m,nx-1,j,k) )
+ end do
+ END IF
+
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c eta-direction flux differences
+c---------------------------------------------------------------------
+c
+c iex = flag : iex = 0 north/south communication
+c : iex = 1 east/west communication
+c
+c---------------------------------------------------------------------
+ iex = 1
+
+c---------------------------------------------------------------------
+c communicate and receive/send two rows of data
+c---------------------------------------------------------------------
+ call exchange_3 (rsd,iex)
+
+ L1 = 0
+ if (west.eq.-1) L1 = 1
+ L2 = ny + 1
+ if (east.eq.-1) L2 = ny
+
+ jst1 = 1
+ jend1 = ny
+ if (west.eq.-1) jst1 = 4
+ if (east.eq.-1) jend1 = ny - 3
+
+ do k = 2, nz - 1
+ do j = L1, L2
+ do i = ist, iend
+ flux(1,i,j,k) = rsd(3,i,j,k)
+ u31 = rsd(3,i,j,k) / rsd(1,i,j,k)
+ q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k)
+ > + rsd(3,i,j,k) * rsd(3,i,j,k)
+ > + rsd(4,i,j,k) * rsd(4,i,j,k) )
+ > / rsd(1,i,j,k)
+ flux(2,i,j,k) = rsd(2,i,j,k) * u31
+ flux(3,i,j,k) = rsd(3,i,j,k) * u31 + c2 *
+ > ( rsd(5,i,j,k) - q )
+ flux(4,i,j,k) = rsd(4,i,j,k) * u31
+ flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u31
+ end do
+ end do
+ end do
+
+ do k = 2, nz - 1
+ do i = ist, iend
+ do j = jst, jend
+ do m = 1, 5
+ frct(m,i,j,k) = frct(m,i,j,k)
+ > - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) )
+ end do
+ end do
+ end do
+
+ do j = jst, L2
+ do i = ist, iend
+ tmp = 1.0d+00 / rsd(1,i,j,k)
+
+ u21j = tmp * rsd(2,i,j,k)
+ u31j = tmp * rsd(3,i,j,k)
+ u41j = tmp * rsd(4,i,j,k)
+ u51j = tmp * rsd(5,i,j,k)
+
+ tmp = 1.0d+00 / rsd(1,i,j-1,k)
+
+ u21jm1 = tmp * rsd(2,i,j-1,k)
+ u31jm1 = tmp * rsd(3,i,j-1,k)
+ u41jm1 = tmp * rsd(4,i,j-1,k)
+ u51jm1 = tmp * rsd(5,i,j-1,k)
+
+ flux(2,i,j,k) = ty3 * ( u21j - u21jm1 )
+ flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 *
+ > ( u31j - u31jm1 )
+ flux(4,i,j,k) = ty3 * ( u41j - u41jm1 )
+ flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+ > * ty3 * ( ( u21j **2 + u31j **2 + u41j **2 )
+ > - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) )
+ > + (1.0d+00/6.0d+00)
+ > * ty3 * ( u31j**2 - u31jm1**2 )
+ > + c1 * c5 * ty3 * ( u51j - u51jm1 )
+ end do
+ end do
+
+ do j = jst, jend
+ do i = ist, iend
+ frct(1,i,j,k) = frct(1,i,j,k)
+ > + dy1 * ty1 * ( rsd(1,i,j-1,k)
+ > - 2.0d+00 * rsd(1,i,j,k)
+ > + rsd(1,i,j+1,k) )
+ frct(2,i,j,k) = frct(2,i,j,k)
+ > + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) )
+ > + dy2 * ty1 * ( rsd(2,i,j-1,k)
+ > - 2.0d+00 * rsd(2,i,j,k)
+ > + rsd(2,i,j+1,k) )
+ frct(3,i,j,k) = frct(3,i,j,k)
+ > + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) )
+ > + dy3 * ty1 * ( rsd(3,i,j-1,k)
+ > - 2.0d+00 * rsd(3,i,j,k)
+ > + rsd(3,i,j+1,k) )
+ frct(4,i,j,k) = frct(4,i,j,k)
+ > + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) )
+ > + dy4 * ty1 * ( rsd(4,i,j-1,k)
+ > - 2.0d+00 * rsd(4,i,j,k)
+ > + rsd(4,i,j+1,k) )
+ frct(5,i,j,k) = frct(5,i,j,k)
+ > + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) )
+ > + dy5 * ty1 * ( rsd(5,i,j-1,k)
+ > - 2.0d+00 * rsd(5,i,j,k)
+ > + rsd(5,i,j+1,k) )
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c fourth-order dissipation
+c---------------------------------------------------------------------
+ IF (west.eq.-1) then
+ do i = ist, iend
+ do m = 1, 5
+ frct(m,i,2,k) = frct(m,i,2,k)
+ > - dsspm * ( + 5.0d+00 * rsd(m,i,2,k)
+ > - 4.0d+00 * rsd(m,i,3,k)
+ > + rsd(m,i,4,k) )
+ frct(m,i,3,k) = frct(m,i,3,k)
+ > - dsspm * ( - 4.0d+00 * rsd(m,i,2,k)
+ > + 6.0d+00 * rsd(m,i,3,k)
+ > - 4.0d+00 * rsd(m,i,4,k)
+ > + rsd(m,i,5,k) )
+ end do
+ end do
+ END IF
+
+ do j = jst1, jend1
+ do i = ist, iend
+ do m = 1, 5
+ frct(m,i,j,k) = frct(m,i,j,k)
+ > - dsspm * ( rsd(m,i,j-2,k)
+ > - 4.0d+00 * rsd(m,i,j-1,k)
+ > + 6.0d+00 * rsd(m,i,j,k)
+ > - 4.0d+00 * rsd(m,i,j+1,k)
+ > + rsd(m,i,j+2,k) )
+ end do
+ end do
+ end do
+
+ IF (east.eq.-1) then
+ do i = ist, iend
+ do m = 1, 5
+ frct(m,i,ny-2,k) = frct(m,i,ny-2,k)
+ > - dsspm * ( rsd(m,i,ny-4,k)
+ > - 4.0d+00 * rsd(m,i,ny-3,k)
+ > + 6.0d+00 * rsd(m,i,ny-2,k)
+ > - 4.0d+00 * rsd(m,i,ny-1,k) )
+ frct(m,i,ny-1,k) = frct(m,i,ny-1,k)
+ > - dsspm * ( rsd(m,i,ny-3,k)
+ > - 4.0d+00 * rsd(m,i,ny-2,k)
+ > + 5.0d+00 * rsd(m,i,ny-1,k) )
+ end do
+ end do
+ END IF
+
+ end do
+
+c---------------------------------------------------------------------
+c zeta-direction flux differences
+c---------------------------------------------------------------------
+ do k = 1, nz
+ do j = jst, jend
+ do i = ist, iend
+ flux(1,i,j,k) = rsd(4,i,j,k)
+ u41 = rsd(4,i,j,k) / rsd(1,i,j,k)
+ q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k)
+ > + rsd(3,i,j,k) * rsd(3,i,j,k)
+ > + rsd(4,i,j,k) * rsd(4,i,j,k) )
+ > / rsd(1,i,j,k)
+ flux(2,i,j,k) = rsd(2,i,j,k) * u41
+ flux(3,i,j,k) = rsd(3,i,j,k) * u41
+ flux(4,i,j,k) = rsd(4,i,j,k) * u41 + c2 *
+ > ( rsd(5,i,j,k) - q )
+ flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u41
+ end do
+ end do
+ end do
+
+ do k = 2, nz - 1
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ frct(m,i,j,k) = frct(m,i,j,k)
+ > - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) )
+ end do
+ end do
+ end do
+ end do
+
+ do k = 2, nz
+ do j = jst, jend
+ do i = ist, iend
+ tmp = 1.0d+00 / rsd(1,i,j,k)
+
+ u21k = tmp * rsd(2,i,j,k)
+ u31k = tmp * rsd(3,i,j,k)
+ u41k = tmp * rsd(4,i,j,k)
+ u51k = tmp * rsd(5,i,j,k)
+
+ tmp = 1.0d+00 / rsd(1,i,j,k-1)
+
+ u21km1 = tmp * rsd(2,i,j,k-1)
+ u31km1 = tmp * rsd(3,i,j,k-1)
+ u41km1 = tmp * rsd(4,i,j,k-1)
+ u51km1 = tmp * rsd(5,i,j,k-1)
+
+ flux(2,i,j,k) = tz3 * ( u21k - u21km1 )
+ flux(3,i,j,k) = tz3 * ( u31k - u31km1 )
+ flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * ( u41k
+ > - u41km1 )
+ flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+ > * tz3 * ( ( u21k **2 + u31k **2 + u41k **2 )
+ > - ( u21km1**2 + u31km1**2 + u41km1**2 ) )
+ > + (1.0d+00/6.0d+00)
+ > * tz3 * ( u41k**2 - u41km1**2 )
+ > + c1 * c5 * tz3 * ( u51k - u51km1 )
+ end do
+ end do
+ end do
+
+ do k = 2, nz - 1
+ do j = jst, jend
+ do i = ist, iend
+ frct(1,i,j,k) = frct(1,i,j,k)
+ > + dz1 * tz1 * ( rsd(1,i,j,k+1)
+ > - 2.0d+00 * rsd(1,i,j,k)
+ > + rsd(1,i,j,k-1) )
+ frct(2,i,j,k) = frct(2,i,j,k)
+ > + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) )
+ > + dz2 * tz1 * ( rsd(2,i,j,k+1)
+ > - 2.0d+00 * rsd(2,i,j,k)
+ > + rsd(2,i,j,k-1) )
+ frct(3,i,j,k) = frct(3,i,j,k)
+ > + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) )
+ > + dz3 * tz1 * ( rsd(3,i,j,k+1)
+ > - 2.0d+00 * rsd(3,i,j,k)
+ > + rsd(3,i,j,k-1) )
+ frct(4,i,j,k) = frct(4,i,j,k)
+ > + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) )
+ > + dz4 * tz1 * ( rsd(4,i,j,k+1)
+ > - 2.0d+00 * rsd(4,i,j,k)
+ > + rsd(4,i,j,k-1) )
+ frct(5,i,j,k) = frct(5,i,j,k)
+ > + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) )
+ > + dz5 * tz1 * ( rsd(5,i,j,k+1)
+ > - 2.0d+00 * rsd(5,i,j,k)
+ > + rsd(5,i,j,k-1) )
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c fourth-order dissipation
+c---------------------------------------------------------------------
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ frct(m,i,j,2) = frct(m,i,j,2)
+ > - dsspm * ( + 5.0d+00 * rsd(m,i,j,2)
+ > - 4.0d+00 * rsd(m,i,j,3)
+ > + rsd(m,i,j,4) )
+ frct(m,i,j,3) = frct(m,i,j,3)
+ > - dsspm * (- 4.0d+00 * rsd(m,i,j,2)
+ > + 6.0d+00 * rsd(m,i,j,3)
+ > - 4.0d+00 * rsd(m,i,j,4)
+ > + rsd(m,i,j,5) )
+ end do
+ end do
+ end do
+
+ do k = 4, nz - 3
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ frct(m,i,j,k) = frct(m,i,j,k)
+ > - dsspm * ( rsd(m,i,j,k-2)
+ > - 4.0d+00 * rsd(m,i,j,k-1)
+ > + 6.0d+00 * rsd(m,i,j,k)
+ > - 4.0d+00 * rsd(m,i,j,k+1)
+ > + rsd(m,i,j,k+2) )
+ end do
+ end do
+ end do
+ end do
+
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ frct(m,i,j,nz-2) = frct(m,i,j,nz-2)
+ > - dsspm * ( rsd(m,i,j,nz-4)
+ > - 4.0d+00 * rsd(m,i,j,nz-3)
+ > + 6.0d+00 * rsd(m,i,j,nz-2)
+ > - 4.0d+00 * rsd(m,i,j,nz-1) )
+ frct(m,i,j,nz-1) = frct(m,i,j,nz-1)
+ > - dsspm * ( rsd(m,i,j,nz-3)
+ > - 4.0d+00 * rsd(m,i,j,nz-2)
+ > + 5.0d+00 * rsd(m,i,j,nz-1) )
+ end do
+ end do
+ end do
+
+ return
+ end
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine error
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c compute the solution error
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, k, m
+ integer iglob, jglob
+ double precision tmp
+ double precision u000ijk(5), dummy(5)
+
+ integer IERROR
+
+
+ do m = 1, 5
+ errnm(m) = 0.0d+00
+ dummy(m) = 0.0d+00
+ end do
+
+ do k = 2, nz-1
+ do j = jst, jend
+ jglob = jpt + j
+ do i = ist, iend
+ iglob = ipt + i
+ call exact( iglob, jglob, k, u000ijk )
+ do m = 1, 5
+ tmp = ( u000ijk(m) - u(m,i,j,k) )
+ dummy(m) = dummy(m) + tmp ** 2
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c compute the global sum of individual contributions to dot product.
+c---------------------------------------------------------------------
+ call MPI_ALLREDUCE( dummy,
+ > errnm,
+ > 5,
+ > dp_type,
+ > MPI_SUM,
+ > MPI_COMM_WORLD,
+ > IERROR )
+
+ do m = 1, 5
+ errnm(m) = sqrt ( errnm(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) )
+ end do
+
+c if (id.eq.0) then
+c write (*,1002) ( errnm(m), m = 1, 5 )
+c end if
+
+ 1002 format (1x/1x,'RMS-norm of error in soln. to ',
+ > 'first pde = ',1pe12.5/,
+ > 1x,'RMS-norm of error in soln. to ',
+ > 'second pde = ',1pe12.5/,
+ > 1x,'RMS-norm of error in soln. to ',
+ > 'third pde = ',1pe12.5/,
+ > 1x,'RMS-norm of error in soln. to ',
+ > 'fourth pde = ',1pe12.5/,
+ > 1x,'RMS-norm of error in soln. to ',
+ > 'fifth pde = ',1pe12.5)
+
+ return
+ end
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine exact( i, j, k, u000ijk )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c compute the exact solution at (i,j,k)
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ integer i, j, k
+ double precision u000ijk(*)
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer m
+ double precision xi, eta, zeta
+
+ xi = ( dble ( i - 1 ) ) / ( nx0 - 1 )
+ eta = ( dble ( j - 1 ) ) / ( ny0 - 1 )
+ zeta = ( dble ( k - 1 ) ) / ( nz - 1 )
+
+
+ do m = 1, 5
+ u000ijk(m) = ce(m,1)
+ > + ce(m,2) * xi
+ > + ce(m,3) * eta
+ > + ce(m,4) * zeta
+ > + ce(m,5) * xi * xi
+ > + ce(m,6) * eta * eta
+ > + ce(m,7) * zeta * zeta
+ > + ce(m,8) * xi * xi * xi
+ > + ce(m,9) * eta * eta * eta
+ > + ce(m,10) * zeta * zeta * zeta
+ > + ce(m,11) * xi * xi * xi * xi
+ > + ce(m,12) * eta * eta * eta * eta
+ > + ce(m,13) * zeta * zeta * zeta * zeta
+ end do
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine exchange_1( g,k,iex )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+ double precision g(5,-1:isiz1+2,-1:isiz2+2,isiz3)
+ integer k
+ integer iex
+ integer i, j
+ double precision dum(5,isiz1+isiz2), dum1(5,isiz1+isiz2)
+
+ integer STATUS(MPI_STATUS_SIZE)
+ integer IERROR
+
+
+
+ if( iex .eq. 0 ) then
+
+ if( north .ne. -1 ) then
+ call MPI_RECV( dum1(1,jst),
+ > 5*(jend-jst+1),
+ > dp_type,
+ > north,
+ > from_n,
+ > MPI_COMM_WORLD,
+ > status,
+ > IERROR )
+ do j=jst,jend
+ g(1,0,j,k) = dum1(1,j)
+ g(2,0,j,k) = dum1(2,j)
+ g(3,0,j,k) = dum1(3,j)
+ g(4,0,j,k) = dum1(4,j)
+ g(5,0,j,k) = dum1(5,j)
+ enddo
+ endif
+
+ if( west .ne. -1 ) then
+ call MPI_RECV( dum1(1,ist),
+ > 5*(iend-ist+1),
+ > dp_type,
+ > west,
+ > from_w,
+ > MPI_COMM_WORLD,
+ > status,
+ > IERROR )
+ do i=ist,iend
+ g(1,i,0,k) = dum1(1,i)
+ g(2,i,0,k) = dum1(2,i)
+ g(3,i,0,k) = dum1(3,i)
+ g(4,i,0,k) = dum1(4,i)
+ g(5,i,0,k) = dum1(5,i)
+ enddo
+ endif
+
+ else if( iex .eq. 1 ) then
+
+ if( south .ne. -1 ) then
+ call MPI_RECV( dum1(1,jst),
+ > 5*(jend-jst+1),
+ > dp_type,
+ > south,
+ > from_s,
+ > MPI_COMM_WORLD,
+ > status,
+ > IERROR )
+ do j=jst,jend
+ g(1,nx+1,j,k) = dum1(1,j)
+ g(2,nx+1,j,k) = dum1(2,j)
+ g(3,nx+1,j,k) = dum1(3,j)
+ g(4,nx+1,j,k) = dum1(4,j)
+ g(5,nx+1,j,k) = dum1(5,j)
+ enddo
+ endif
+
+ if( east .ne. -1 ) then
+ call MPI_RECV( dum1(1,ist),
+ > 5*(iend-ist+1),
+ > dp_type,
+ > east,
+ > from_e,
+ > MPI_COMM_WORLD,
+ > status,
+ > IERROR )
+ do i=ist,iend
+ g(1,i,ny+1,k) = dum1(1,i)
+ g(2,i,ny+1,k) = dum1(2,i)
+ g(3,i,ny+1,k) = dum1(3,i)
+ g(4,i,ny+1,k) = dum1(4,i)
+ g(5,i,ny+1,k) = dum1(5,i)
+ enddo
+ endif
+
+ else if( iex .eq. 2 ) then
+
+ if( south .ne. -1 ) then
+ do j=jst,jend
+ dum(1,j) = g(1,nx,j,k)
+ dum(2,j) = g(2,nx,j,k)
+ dum(3,j) = g(3,nx,j,k)
+ dum(4,j) = g(4,nx,j,k)
+ dum(5,j) = g(5,nx,j,k)
+ enddo
+ call MPI_SEND( dum(1,jst),
+ > 5*(jend-jst+1),
+ > dp_type,
+ > south,
+ > from_n,
+ > MPI_COMM_WORLD,
+ > IERROR )
+ endif
+
+ if( east .ne. -1 ) then
+ do i=ist,iend
+ dum(1,i) = g(1,i,ny,k)
+ dum(2,i) = g(2,i,ny,k)
+ dum(3,i) = g(3,i,ny,k)
+ dum(4,i) = g(4,i,ny,k)
+ dum(5,i) = g(5,i,ny,k)
+ enddo
+ call MPI_SEND( dum(1,ist),
+ > 5*(iend-ist+1),
+ > dp_type,
+ > east,
+ > from_w,
+ > MPI_COMM_WORLD,
+ > IERROR )
+ endif
+
+ else
+
+ if( north .ne. -1 ) then
+ do j=jst,jend
+ dum(1,j) = g(1,1,j,k)
+ dum(2,j) = g(2,1,j,k)
+ dum(3,j) = g(3,1,j,k)
+ dum(4,j) = g(4,1,j,k)
+ dum(5,j) = g(5,1,j,k)
+ enddo
+ call MPI_SEND( dum(1,jst),
+ > 5*(jend-jst+1),
+ > dp_type,
+ > north,
+ > from_s,
+ > MPI_COMM_WORLD,
+ > IERROR )
+ endif
+
+ if( west .ne. -1 ) then
+ do i=ist,iend
+ dum(1,i) = g(1,i,1,k)
+ dum(2,i) = g(2,i,1,k)
+ dum(3,i) = g(3,i,1,k)
+ dum(4,i) = g(4,i,1,k)
+ dum(5,i) = g(5,i,1,k)
+ enddo
+ call MPI_SEND( dum(1,ist),
+ > 5*(iend-ist+1),
+ > dp_type,
+ > west,
+ > from_e,
+ > MPI_COMM_WORLD,
+ > IERROR )
+ endif
+
+ endif
+
+ end
+
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine exchange_3(g,iex)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ double precision g(5,-1:isiz1+2,-1:isiz2+2,isiz3)
+ integer iex
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, k
+ integer ipos1, ipos2
+
+ integer mid
+ integer STATUS(MPI_STATUS_SIZE)
+ integer IERROR
+
+
+
+ if (iex.eq.0) then
+c---------------------------------------------------------------------
+c communicate in the south and north directions
+c---------------------------------------------------------------------
+ if (north.ne.-1) then
+ call MPI_IRECV( buf1,
+ > 10*ny*nz,
+ > dp_type,
+ > MPI_ANY_SOURCE,
+ > from_n,
+ > MPI_COMM_WORLD,
+ > mid,
+ > IERROR )
+ end if
+
+c---------------------------------------------------------------------
+c send south
+c---------------------------------------------------------------------
+ if (south.ne.-1) then
+ do k = 1,nz
+ do j = 1,ny
+ ipos1 = (k-1)*ny + j
+ ipos2 = ipos1 + ny*nz
+ buf(1,ipos1) = g(1,nx-1,j,k)
+ buf(2,ipos1) = g(2,nx-1,j,k)
+ buf(3,ipos1) = g(3,nx-1,j,k)
+ buf(4,ipos1) = g(4,nx-1,j,k)
+ buf(5,ipos1) = g(5,nx-1,j,k)
+ buf(1,ipos2) = g(1,nx,j,k)
+ buf(2,ipos2) = g(2,nx,j,k)
+ buf(3,ipos2) = g(3,nx,j,k)
+ buf(4,ipos2) = g(4,nx,j,k)
+ buf(5,ipos2) = g(5,nx,j,k)
+ end do
+ end do
+
+ call MPI_SEND( buf,
+ > 10*ny*nz,
+ > dp_type,
+ > south,
+ > from_n,
+ > MPI_COMM_WORLD,
+ > IERROR )
+ end if
+
+c---------------------------------------------------------------------
+c receive from north
+c---------------------------------------------------------------------
+ if (north.ne.-1) then
+ call MPI_WAIT( mid, STATUS, IERROR )
+
+ do k = 1,nz
+ do j = 1,ny
+ ipos1 = (k-1)*ny + j
+ ipos2 = ipos1 + ny*nz
+ g(1,-1,j,k) = buf1(1,ipos1)
+ g(2,-1,j,k) = buf1(2,ipos1)
+ g(3,-1,j,k) = buf1(3,ipos1)
+ g(4,-1,j,k) = buf1(4,ipos1)
+ g(5,-1,j,k) = buf1(5,ipos1)
+ g(1,0,j,k) = buf1(1,ipos2)
+ g(2,0,j,k) = buf1(2,ipos2)
+ g(3,0,j,k) = buf1(3,ipos2)
+ g(4,0,j,k) = buf1(4,ipos2)
+ g(5,0,j,k) = buf1(5,ipos2)
+ end do
+ end do
+
+ end if
+
+ if (south.ne.-1) then
+ call MPI_IRECV( buf1,
+ > 10*ny*nz,
+ > dp_type,
+ > MPI_ANY_SOURCE,
+ > from_s,
+ > MPI_COMM_WORLD,
+ > mid,
+ > IERROR )
+ end if
+
+c---------------------------------------------------------------------
+c send north
+c---------------------------------------------------------------------
+ if (north.ne.-1) then
+ do k = 1,nz
+ do j = 1,ny
+ ipos1 = (k-1)*ny + j
+ ipos2 = ipos1 + ny*nz
+ buf(1,ipos1) = g(1,2,j,k)
+ buf(2,ipos1) = g(2,2,j,k)
+ buf(3,ipos1) = g(3,2,j,k)
+ buf(4,ipos1) = g(4,2,j,k)
+ buf(5,ipos1) = g(5,2,j,k)
+ buf(1,ipos2) = g(1,1,j,k)
+ buf(2,ipos2) = g(2,1,j,k)
+ buf(3,ipos2) = g(3,1,j,k)
+ buf(4,ipos2) = g(4,1,j,k)
+ buf(5,ipos2) = g(5,1,j,k)
+ end do
+ end do
+
+ call MPI_SEND( buf,
+ > 10*ny*nz,
+ > dp_type,
+ > north,
+ > from_s,
+ > MPI_COMM_WORLD,
+ > IERROR )
+ end if
+
+c---------------------------------------------------------------------
+c receive from south
+c---------------------------------------------------------------------
+ if (south.ne.-1) then
+ call MPI_WAIT( mid, STATUS, IERROR )
+
+ do k = 1,nz
+ do j = 1,ny
+ ipos1 = (k-1)*ny + j
+ ipos2 = ipos1 + ny*nz
+ g(1,nx+2,j,k) = buf1(1,ipos1)
+ g(2,nx+2,j,k) = buf1(2,ipos1)
+ g(3,nx+2,j,k) = buf1(3,ipos1)
+ g(4,nx+2,j,k) = buf1(4,ipos1)
+ g(5,nx+2,j,k) = buf1(5,ipos1)
+ g(1,nx+1,j,k) = buf1(1,ipos2)
+ g(2,nx+1,j,k) = buf1(2,ipos2)
+ g(3,nx+1,j,k) = buf1(3,ipos2)
+ g(4,nx+1,j,k) = buf1(4,ipos2)
+ g(5,nx+1,j,k) = buf1(5,ipos2)
+ end do
+ end do
+ end if
+
+ else
+
+c---------------------------------------------------------------------
+c communicate in the east and west directions
+c---------------------------------------------------------------------
+ if (west.ne.-1) then
+ call MPI_IRECV( buf1,
+ > 10*nx*nz,
+ > dp_type,
+ > MPI_ANY_SOURCE,
+ > from_w,
+ > MPI_COMM_WORLD,
+ > mid,
+ > IERROR )
+ end if
+
+c---------------------------------------------------------------------
+c send east
+c---------------------------------------------------------------------
+ if (east.ne.-1) then
+ do k = 1,nz
+ do i = 1,nx
+ ipos1 = (k-1)*nx + i
+ ipos2 = ipos1 + nx*nz
+ buf(1,ipos1) = g(1,i,ny-1,k)
+ buf(2,ipos1) = g(2,i,ny-1,k)
+ buf(3,ipos1) = g(3,i,ny-1,k)
+ buf(4,ipos1) = g(4,i,ny-1,k)
+ buf(5,ipos1) = g(5,i,ny-1,k)
+ buf(1,ipos2) = g(1,i,ny,k)
+ buf(2,ipos2) = g(2,i,ny,k)
+ buf(3,ipos2) = g(3,i,ny,k)
+ buf(4,ipos2) = g(4,i,ny,k)
+ buf(5,ipos2) = g(5,i,ny,k)
+ end do
+ end do
+
+ call MPI_SEND( buf,
+ > 10*nx*nz,
+ > dp_type,
+ > east,
+ > from_w,
+ > MPI_COMM_WORLD,
+ > IERROR )
+ end if
+
+c---------------------------------------------------------------------
+c receive from west
+c---------------------------------------------------------------------
+ if (west.ne.-1) then
+ call MPI_WAIT( mid, STATUS, IERROR )
+
+ do k = 1,nz
+ do i = 1,nx
+ ipos1 = (k-1)*nx + i
+ ipos2 = ipos1 + nx*nz
+ g(1,i,-1,k) = buf1(1,ipos1)
+ g(2,i,-1,k) = buf1(2,ipos1)
+ g(3,i,-1,k) = buf1(3,ipos1)
+ g(4,i,-1,k) = buf1(4,ipos1)
+ g(5,i,-1,k) = buf1(5,ipos1)
+ g(1,i,0,k) = buf1(1,ipos2)
+ g(2,i,0,k) = buf1(2,ipos2)
+ g(3,i,0,k) = buf1(3,ipos2)
+ g(4,i,0,k) = buf1(4,ipos2)
+ g(5,i,0,k) = buf1(5,ipos2)
+ end do
+ end do
+
+ end if
+
+ if (east.ne.-1) then
+ call MPI_IRECV( buf1,
+ > 10*nx*nz,
+ > dp_type,
+ > MPI_ANY_SOURCE,
+ > from_e,
+ > MPI_COMM_WORLD,
+ > mid,
+ > IERROR )
+ end if
+
+c---------------------------------------------------------------------
+c send west
+c---------------------------------------------------------------------
+ if (west.ne.-1) then
+ do k = 1,nz
+ do i = 1,nx
+ ipos1 = (k-1)*nx + i
+ ipos2 = ipos1 + nx*nz
+ buf(1,ipos1) = g(1,i,2,k)
+ buf(2,ipos1) = g(2,i,2,k)
+ buf(3,ipos1) = g(3,i,2,k)
+ buf(4,ipos1) = g(4,i,2,k)
+ buf(5,ipos1) = g(5,i,2,k)
+ buf(1,ipos2) = g(1,i,1,k)
+ buf(2,ipos2) = g(2,i,1,k)
+ buf(3,ipos2) = g(3,i,1,k)
+ buf(4,ipos2) = g(4,i,1,k)
+ buf(5,ipos2) = g(5,i,1,k)
+ end do
+ end do
+
+ call MPI_SEND( buf,
+ > 10*nx*nz,
+ > dp_type,
+ > west,
+ > from_e,
+ > MPI_COMM_WORLD,
+ > IERROR )
+ end if
+
+c---------------------------------------------------------------------
+c receive from east
+c---------------------------------------------------------------------
+ if (east.ne.-1) then
+ call MPI_WAIT( mid, STATUS, IERROR )
+
+ do k = 1,nz
+ do i = 1,nx
+ ipos1 = (k-1)*nx + i
+ ipos2 = ipos1 + nx*nz
+ g(1,i,ny+2,k) = buf1(1,ipos1)
+ g(2,i,ny+2,k) = buf1(2,ipos1)
+ g(3,i,ny+2,k) = buf1(3,ipos1)
+ g(4,i,ny+2,k) = buf1(4,ipos1)
+ g(5,i,ny+2,k) = buf1(5,ipos1)
+ g(1,i,ny+1,k) = buf1(1,ipos2)
+ g(2,i,ny+1,k) = buf1(2,ipos2)
+ g(3,i,ny+1,k) = buf1(3,ipos2)
+ g(4,i,ny+1,k) = buf1(4,ipos2)
+ g(5,i,ny+1,k) = buf1(5,ipos2)
+ end do
+ end do
+
+ end if
+
+ end if
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine exchange_4(g,h,ibeg,ifin1,jbeg,jfin1)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ double precision g(0:isiz2+1,0:isiz3+1),
+ > h(0:isiz2+1,0:isiz3+1)
+ integer ibeg, ifin1
+ integer jbeg, jfin1
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j
+ integer ny2
+ double precision dum(1024)
+
+ integer msgid1, msgid3
+ integer STATUS(MPI_STATUS_SIZE)
+ integer IERROR
+
+
+
+ ny2 = ny + 2
+
+c---------------------------------------------------------------------
+c communicate in the east and west directions
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c receive from east
+c---------------------------------------------------------------------
+ if (jfin1.eq.ny) then
+ call MPI_IRECV( dum,
+ > 2*nx,
+ > dp_type,
+ > MPI_ANY_SOURCE,
+ > from_e,
+ > MPI_COMM_WORLD,
+ > msgid3,
+ > IERROR )
+
+ call MPI_WAIT( msgid3, STATUS, IERROR )
+
+ do i = 1,nx
+ g(i,ny+1) = dum(i)
+ h(i,ny+1) = dum(i+nx)
+ end do
+
+ end if
+
+c---------------------------------------------------------------------
+c send west
+c---------------------------------------------------------------------
+ if (jbeg.eq.1) then
+ do i = 1,nx
+ dum(i) = g(i,1)
+ dum(i+nx) = h(i,1)
+ end do
+
+ call MPI_SEND( dum,
+ > 2*nx,
+ > dp_type,
+ > west,
+ > from_e,
+ > MPI_COMM_WORLD,
+ > IERROR )
+
+ end if
+
+c---------------------------------------------------------------------
+c communicate in the south and north directions
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c receive from south
+c---------------------------------------------------------------------
+ if (ifin1.eq.nx) then
+ call MPI_IRECV( dum,
+ > 2*ny2,
+ > dp_type,
+ > MPI_ANY_SOURCE,
+ > from_s,
+ > MPI_COMM_WORLD,
+ > msgid1,
+ > IERROR )
+
+ call MPI_WAIT( msgid1, STATUS, IERROR )
+
+ do j = 0,ny+1
+ g(nx+1,j) = dum(j+1)
+ h(nx+1,j) = dum(j+ny2+1)
+ end do
+
+ end if
+
+c---------------------------------------------------------------------
+c send north
+c---------------------------------------------------------------------
+ if (ibeg.eq.1) then
+ do j = 0,ny+1
+ dum(j+1) = g(1,j)
+ dum(j+ny2+1) = h(1,j)
+ end do
+
+ call MPI_SEND( dum,
+ > 2*ny2,
+ > dp_type,
+ > north,
+ > from_s,
+ > MPI_COMM_WORLD,
+ > IERROR )
+
+ end if
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine exchange_5(g,ibeg,ifin1)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ double precision g(0:isiz2+1,0:isiz3+1)
+ integer ibeg, ifin1
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer k
+ double precision dum(1024)
+
+ integer msgid1
+ integer STATUS(MPI_STATUS_SIZE)
+ integer IERROR
+
+
+
+c---------------------------------------------------------------------
+c communicate in the south and north directions
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c receive from south
+c---------------------------------------------------------------------
+ if (ifin1.eq.nx) then
+ call MPI_IRECV( dum,
+ > nz,
+ > dp_type,
+ > MPI_ANY_SOURCE,
+ > from_s,
+ > MPI_COMM_WORLD,
+ > msgid1,
+ > IERROR )
+
+ call MPI_WAIT( msgid1, STATUS, IERROR )
+
+ do k = 1,nz
+ g(nx+1,k) = dum(k)
+ end do
+
+ end if
+
+c---------------------------------------------------------------------
+c send north
+c---------------------------------------------------------------------
+ if (ibeg.eq.1) then
+ do k = 1,nz
+ dum(k) = g(1,k)
+ end do
+
+ call MPI_SEND( dum,
+ > nz,
+ > dp_type,
+ > north,
+ > from_s,
+ > MPI_COMM_WORLD,
+ > IERROR )
+
+ end if
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine exchange_6(g,jbeg,jfin1)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ double precision g(0:isiz2+1,0:isiz3+1)
+ integer jbeg, jfin1
+
+c---------------------------------------------------------------------
+c local parameters
+c---------------------------------------------------------------------
+ integer k
+ double precision dum(1024)
+
+ integer msgid3
+ integer STATUS(MPI_STATUS_SIZE)
+ integer IERROR
+
+
+
+c---------------------------------------------------------------------
+c communicate in the east and west directions
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c receive from east
+c---------------------------------------------------------------------
+ if (jfin1.eq.ny) then
+ call MPI_IRECV( dum,
+ > nz,
+ > dp_type,
+ > MPI_ANY_SOURCE,
+ > from_e,
+ > MPI_COMM_WORLD,
+ > msgid3,
+ > IERROR )
+
+ call MPI_WAIT( msgid3, STATUS, IERROR )
+
+ do k = 1,nz
+ g(ny+1,k) = dum(k)
+ end do
+
+ end if
+
+c---------------------------------------------------------------------
+c send west
+c---------------------------------------------------------------------
+ if (jbeg.eq.1) then
+ do k = 1,nz
+ dum(k) = g(1,k)
+ end do
+
+ call MPI_SEND( dum,
+ > nz,
+ > dp_type,
+ > west,
+ > from_e,
+ > MPI_COMM_WORLD,
+ > IERROR )
+
+ end if
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine init_comm
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c initialize MPI and establish rank and size
+c
+c This is a module in the MPI implementation of LUSSOR
+c pseudo application from the NAS Parallel Benchmarks.
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+ integer nodedim
+ integer IERROR
+
+
+c---------------------------------------------------------------------
+c initialize MPI communication
+c---------------------------------------------------------------------
+ call MPI_INIT( IERROR )
+
+c---------------------------------------------------------------------
+c establish the global rank of this process
+c---------------------------------------------------------------------
+ call MPI_COMM_RANK( MPI_COMM_WORLD,
+ > id,
+ > IERROR )
+
+c---------------------------------------------------------------------
+c establish the size of the global group
+c---------------------------------------------------------------------
+ call MPI_COMM_SIZE( MPI_COMM_WORLD,
+ > num,
+ > IERROR )
+
+ ndim = nodedim(num)
+
+ if (.not. convertdouble) then
+ dp_type = MPI_DOUBLE_PRECISION
+ else
+ dp_type = MPI_REAL
+ endif
+
+
+ return
+ end
--- /dev/null
+c
+c***controls printing of the progress of iterations: ipr inorm
+ 1 250
+c
+c***the maximum no. of pseudo-time steps to be performed: nitmax
+ 250
+c
+c***magnitude of the time step: dt
+ 2.0e+00
+c
+c***relaxation factor for SSOR iterations: omega
+ 1.2
+c
+c***tolerance levels for steady-state residuals: tolnwt(m),m=1,5
+ 1.0e-08 1.0e-08 1.0e-08 1.0e-08 1.0e-08
+c
+c***number of grid points in xi and eta and zeta directions: nx ny nz
+ 64 64 64
+c
+
+
+
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine jacld(k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c compute the lower triangular part of the jacobian matrix
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ integer k
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j
+ double precision r43
+ double precision c1345
+ double precision c34
+ double precision tmp1, tmp2, tmp3
+
+
+
+ r43 = ( 4.0d+00 / 3.0d+00 )
+ c1345 = c1 * c3 * c4 * c5
+ c34 = c3 * c4
+
+ do j = jst, jend
+ do i = ist, iend
+
+c---------------------------------------------------------------------
+c form the block daigonal
+c---------------------------------------------------------------------
+ tmp1 = 1.0d+00 / u(1,i,j,k)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ d(1,1,i,j) = 1.0d+00
+ > + dt * 2.0d+00 * ( tx1 * dx1
+ > + ty1 * dy1
+ > + tz1 * dz1 )
+ d(1,2,i,j) = 0.0d+00
+ d(1,3,i,j) = 0.0d+00
+ d(1,4,i,j) = 0.0d+00
+ d(1,5,i,j) = 0.0d+00
+
+ d(2,1,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) )
+ > + ty1 * ( - c34 * tmp2 * u(2,i,j,k) )
+ > + tz1 * ( - c34 * tmp2 * u(2,i,j,k) ) )
+ d(2,2,i,j) = 1.0d+00
+ > + dt * 2.0d+00
+ > * ( tx1 * r43 * c34 * tmp1
+ > + ty1 * c34 * tmp1
+ > + tz1 * c34 * tmp1 )
+ > + dt * 2.0d+00 * ( tx1 * dx2
+ > + ty1 * dy2
+ > + tz1 * dz2 )
+ d(2,3,i,j) = 0.0d+00
+ d(2,4,i,j) = 0.0d+00
+ d(2,5,i,j) = 0.0d+00
+
+ d(3,1,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( - c34 * tmp2 * u(3,i,j,k) )
+ > + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) )
+ > + tz1 * ( - c34 * tmp2 * u(3,i,j,k) ) )
+ d(3,2,i,j) = 0.0d+00
+ d(3,3,i,j) = 1.0d+00
+ > + dt * 2.0d+00
+ > * ( tx1 * c34 * tmp1
+ > + ty1 * r43 * c34 * tmp1
+ > + tz1 * c34 * tmp1 )
+ > + dt * 2.0d+00 * ( tx1 * dx3
+ > + ty1 * dy3
+ > + tz1 * dz3 )
+ d(3,4,i,j) = 0.0d+00
+ d(3,5,i,j) = 0.0d+00
+
+ d(4,1,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( - c34 * tmp2 * u(4,i,j,k) )
+ > + ty1 * ( - c34 * tmp2 * u(4,i,j,k) )
+ > + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) )
+ d(4,2,i,j) = 0.0d+00
+ d(4,3,i,j) = 0.0d+00
+ d(4,4,i,j) = 1.0d+00
+ > + dt * 2.0d+00
+ > * ( tx1 * c34 * tmp1
+ > + ty1 * c34 * tmp1
+ > + tz1 * r43 * c34 * tmp1 )
+ > + dt * 2.0d+00 * ( tx1 * dx4
+ > + ty1 * dy4
+ > + tz1 * dz4 )
+ d(4,5,i,j) = 0.0d+00
+
+ d(5,1,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+ > - ( c1345 ) * tmp2 * u(5,i,j,k) )
+ > + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+ > - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+ > - ( c1345 ) * tmp2 * u(5,i,j,k) )
+ > + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+ > - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+ > - ( c1345 ) * tmp2 * u(5,i,j,k) ) )
+ d(5,2,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k)
+ > + ty1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k)
+ > + tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) )
+ d(5,3,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k)
+ > + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k)
+ > + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) )
+ d(5,4,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
+ > + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
+ > + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) )
+ d(5,5,i,j) = 1.0d+00
+ > + dt * 2.0d+00 * ( tx1 * c1345 * tmp1
+ > + ty1 * c1345 * tmp1
+ > + tz1 * c1345 * tmp1 )
+ > + dt * 2.0d+00 * ( tx1 * dx5
+ > + ty1 * dy5
+ > + tz1 * dz5 )
+
+c---------------------------------------------------------------------
+c form the first block sub-diagonal
+c---------------------------------------------------------------------
+ tmp1 = 1.0d+00 / u(1,i,j,k-1)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ a(1,1,i,j) = - dt * tz1 * dz1
+ a(1,2,i,j) = 0.0d+00
+ a(1,3,i,j) = 0.0d+00
+ a(1,4,i,j) = - dt * tz2
+ a(1,5,i,j) = 0.0d+00
+
+ a(2,1,i,j) = - dt * tz2
+ > * ( - ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
+ > - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k-1) )
+ a(2,2,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 )
+ > - dt * tz1 * c34 * tmp1
+ > - dt * tz1 * dz2
+ a(2,3,i,j) = 0.0d+00
+ a(2,4,i,j) = - dt * tz2 * ( u(2,i,j,k-1) * tmp1 )
+ a(2,5,i,j) = 0.0d+00
+
+ a(3,1,i,j) = - dt * tz2
+ > * ( - ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
+ > - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k-1) )
+ a(3,2,i,j) = 0.0d+00
+ a(3,3,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 )
+ > - dt * tz1 * ( c34 * tmp1 )
+ > - dt * tz1 * dz3
+ a(3,4,i,j) = - dt * tz2 * ( u(3,i,j,k-1) * tmp1 )
+ a(3,5,i,j) = 0.0d+00
+
+ a(4,1,i,j) = - dt * tz2
+ > * ( - ( u(4,i,j,k-1) * tmp1 ) ** 2
+ > + 0.50d+00 * c2
+ > * ( ( u(2,i,j,k-1) * u(2,i,j,k-1)
+ > + u(3,i,j,k-1) * u(3,i,j,k-1)
+ > + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2 ) )
+ > - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k-1) )
+ a(4,2,i,j) = - dt * tz2
+ > * ( - c2 * ( u(2,i,j,k-1) * tmp1 ) )
+ a(4,3,i,j) = - dt * tz2
+ > * ( - c2 * ( u(3,i,j,k-1) * tmp1 ) )
+ a(4,4,i,j) = - dt * tz2 * ( 2.0d+00 - c2 )
+ > * ( u(4,i,j,k-1) * tmp1 )
+ > - dt * tz1 * ( r43 * c34 * tmp1 )
+ > - dt * tz1 * dz4
+ a(4,5,i,j) = - dt * tz2 * c2
+
+ a(5,1,i,j) = - dt * tz2
+ > * ( ( c2 * ( u(2,i,j,k-1) * u(2,i,j,k-1)
+ > + u(3,i,j,k-1) * u(3,i,j,k-1)
+ > + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2
+ > - c1 * ( u(5,i,j,k-1) * tmp1 ) )
+ > * ( u(4,i,j,k-1) * tmp1 ) )
+ > - dt * tz1
+ > * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k-1)**2)
+ > - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k-1)**2)
+ > - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k-1)**2)
+ > - c1345 * tmp2 * u(5,i,j,k-1) )
+ a(5,2,i,j) = - dt * tz2
+ > * ( - c2 * ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
+ > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k-1)
+ a(5,3,i,j) = - dt * tz2
+ > * ( - c2 * ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
+ > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k-1)
+ a(5,4,i,j) = - dt * tz2
+ > * ( c1 * ( u(5,i,j,k-1) * tmp1 )
+ > - 0.50d+00 * c2
+ > * ( ( u(2,i,j,k-1)*u(2,i,j,k-1)
+ > + u(3,i,j,k-1)*u(3,i,j,k-1)
+ > + 3.0d+00*u(4,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) )
+ > - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k-1)
+ a(5,5,i,j) = - dt * tz2
+ > * ( c1 * ( u(4,i,j,k-1) * tmp1 ) )
+ > - dt * tz1 * c1345 * tmp1
+ > - dt * tz1 * dz5
+
+c---------------------------------------------------------------------
+c form the second block sub-diagonal
+c---------------------------------------------------------------------
+ tmp1 = 1.0d+00 / u(1,i,j-1,k)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ b(1,1,i,j) = - dt * ty1 * dy1
+ b(1,2,i,j) = 0.0d+00
+ b(1,3,i,j) = - dt * ty2
+ b(1,4,i,j) = 0.0d+00
+ b(1,5,i,j) = 0.0d+00
+
+ b(2,1,i,j) = - dt * ty2
+ > * ( - ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 )
+ > - dt * ty1 * ( - c34 * tmp2 * u(2,i,j-1,k) )
+ b(2,2,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 )
+ > - dt * ty1 * ( c34 * tmp1 )
+ > - dt * ty1 * dy2
+ b(2,3,i,j) = - dt * ty2 * ( u(2,i,j-1,k) * tmp1 )
+ b(2,4,i,j) = 0.0d+00
+ b(2,5,i,j) = 0.0d+00
+
+ b(3,1,i,j) = - dt * ty2
+ > * ( - ( u(3,i,j-1,k) * tmp1 ) ** 2
+ > + 0.50d+00 * c2 * ( ( u(2,i,j-1,k) * u(2,i,j-1,k)
+ > + u(3,i,j-1,k) * u(3,i,j-1,k)
+ > + u(4,i,j-1,k) * u(4,i,j-1,k) )
+ > * tmp2 ) )
+ > - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j-1,k) )
+ b(3,2,i,j) = - dt * ty2
+ > * ( - c2 * ( u(2,i,j-1,k) * tmp1 ) )
+ b(3,3,i,j) = - dt * ty2 * ( ( 2.0d+00 - c2 )
+ > * ( u(3,i,j-1,k) * tmp1 ) )
+ > - dt * ty1 * ( r43 * c34 * tmp1 )
+ > - dt * ty1 * dy3
+ b(3,4,i,j) = - dt * ty2
+ > * ( - c2 * ( u(4,i,j-1,k) * tmp1 ) )
+ b(3,5,i,j) = - dt * ty2 * c2
+
+ b(4,1,i,j) = - dt * ty2
+ > * ( - ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 )
+ > - dt * ty1 * ( - c34 * tmp2 * u(4,i,j-1,k) )
+ b(4,2,i,j) = 0.0d+00
+ b(4,3,i,j) = - dt * ty2 * ( u(4,i,j-1,k) * tmp1 )
+ b(4,4,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 )
+ > - dt * ty1 * ( c34 * tmp1 )
+ > - dt * ty1 * dy4
+ b(4,5,i,j) = 0.0d+00
+
+ b(5,1,i,j) = - dt * ty2
+ > * ( ( c2 * ( u(2,i,j-1,k) * u(2,i,j-1,k)
+ > + u(3,i,j-1,k) * u(3,i,j-1,k)
+ > + u(4,i,j-1,k) * u(4,i,j-1,k) ) * tmp2
+ > - c1 * ( u(5,i,j-1,k) * tmp1 ) )
+ > * ( u(3,i,j-1,k) * tmp1 ) )
+ > - dt * ty1
+ > * ( - ( c34 - c1345 )*tmp3*(u(2,i,j-1,k)**2)
+ > - ( r43*c34 - c1345 )*tmp3*(u(3,i,j-1,k)**2)
+ > - ( c34 - c1345 )*tmp3*(u(4,i,j-1,k)**2)
+ > - c1345*tmp2*u(5,i,j-1,k) )
+ b(5,2,i,j) = - dt * ty2
+ > * ( - c2 * ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 )
+ > - dt * ty1
+ > * ( c34 - c1345 ) * tmp2 * u(2,i,j-1,k)
+ b(5,3,i,j) = - dt * ty2
+ > * ( c1 * ( u(5,i,j-1,k) * tmp1 )
+ > - 0.50d+00 * c2
+ > * ( ( u(2,i,j-1,k)*u(2,i,j-1,k)
+ > + 3.0d+00 * u(3,i,j-1,k)*u(3,i,j-1,k)
+ > + u(4,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 ) )
+ > - dt * ty1
+ > * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j-1,k)
+ b(5,4,i,j) = - dt * ty2
+ > * ( - c2 * ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 )
+ > - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j-1,k)
+ b(5,5,i,j) = - dt * ty2
+ > * ( c1 * ( u(3,i,j-1,k) * tmp1 ) )
+ > - dt * ty1 * c1345 * tmp1
+ > - dt * ty1 * dy5
+
+c---------------------------------------------------------------------
+c form the third block sub-diagonal
+c---------------------------------------------------------------------
+ tmp1 = 1.0d+00 / u(1,i-1,j,k)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ c(1,1,i,j) = - dt * tx1 * dx1
+ c(1,2,i,j) = - dt * tx2
+ c(1,3,i,j) = 0.0d+00
+ c(1,4,i,j) = 0.0d+00
+ c(1,5,i,j) = 0.0d+00
+
+ c(2,1,i,j) = - dt * tx2
+ > * ( - ( u(2,i-1,j,k) * tmp1 ) ** 2
+ > + c2 * 0.50d+00 * ( u(2,i-1,j,k) * u(2,i-1,j,k)
+ > + u(3,i-1,j,k) * u(3,i-1,j,k)
+ > + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2 )
+ > - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i-1,j,k) )
+ c(2,2,i,j) = - dt * tx2
+ > * ( ( 2.0d+00 - c2 ) * ( u(2,i-1,j,k) * tmp1 ) )
+ > - dt * tx1 * ( r43 * c34 * tmp1 )
+ > - dt * tx1 * dx2
+ c(2,3,i,j) = - dt * tx2
+ > * ( - c2 * ( u(3,i-1,j,k) * tmp1 ) )
+ c(2,4,i,j) = - dt * tx2
+ > * ( - c2 * ( u(4,i-1,j,k) * tmp1 ) )
+ c(2,5,i,j) = - dt * tx2 * c2
+
+ c(3,1,i,j) = - dt * tx2
+ > * ( - ( u(2,i-1,j,k) * u(3,i-1,j,k) ) * tmp2 )
+ > - dt * tx1 * ( - c34 * tmp2 * u(3,i-1,j,k) )
+ c(3,2,i,j) = - dt * tx2 * ( u(3,i-1,j,k) * tmp1 )
+ c(3,3,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 )
+ > - dt * tx1 * ( c34 * tmp1 )
+ > - dt * tx1 * dx3
+ c(3,4,i,j) = 0.0d+00
+ c(3,5,i,j) = 0.0d+00
+
+ c(4,1,i,j) = - dt * tx2
+ > * ( - ( u(2,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 )
+ > - dt * tx1 * ( - c34 * tmp2 * u(4,i-1,j,k) )
+ c(4,2,i,j) = - dt * tx2 * ( u(4,i-1,j,k) * tmp1 )
+ c(4,3,i,j) = 0.0d+00
+ c(4,4,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 )
+ > - dt * tx1 * ( c34 * tmp1 )
+ > - dt * tx1 * dx4
+ c(4,5,i,j) = 0.0d+00
+
+ c(5,1,i,j) = - dt * tx2
+ > * ( ( c2 * ( u(2,i-1,j,k) * u(2,i-1,j,k)
+ > + u(3,i-1,j,k) * u(3,i-1,j,k)
+ > + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2
+ > - c1 * ( u(5,i-1,j,k) * tmp1 ) )
+ > * ( u(2,i-1,j,k) * tmp1 ) )
+ > - dt * tx1
+ > * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i-1,j,k)**2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(3,i-1,j,k)**2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(4,i-1,j,k)**2 )
+ > - c1345 * tmp2 * u(5,i-1,j,k) )
+ c(5,2,i,j) = - dt * tx2
+ > * ( c1 * ( u(5,i-1,j,k) * tmp1 )
+ > - 0.50d+00 * c2
+ > * ( ( 3.0d+00*u(2,i-1,j,k)*u(2,i-1,j,k)
+ > + u(3,i-1,j,k)*u(3,i-1,j,k)
+ > + u(4,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 ) )
+ > - dt * tx1
+ > * ( r43*c34 - c1345 ) * tmp2 * u(2,i-1,j,k)
+ c(5,3,i,j) = - dt * tx2
+ > * ( - c2 * ( u(3,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 )
+ > - dt * tx1
+ > * ( c34 - c1345 ) * tmp2 * u(3,i-1,j,k)
+ c(5,4,i,j) = - dt * tx2
+ > * ( - c2 * ( u(4,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 )
+ > - dt * tx1
+ > * ( c34 - c1345 ) * tmp2 * u(4,i-1,j,k)
+ c(5,5,i,j) = - dt * tx2
+ > * ( c1 * ( u(2,i-1,j,k) * tmp1 ) )
+ > - dt * tx1 * c1345 * tmp1
+ > - dt * tx1 * dx5
+
+ end do
+ end do
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine jacu(k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute the upper triangular part of the jacobian matrix
+c---------------------------------------------------------------------
+
+
+ implicit none
+
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ integer k
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j
+ double precision r43
+ double precision c1345
+ double precision c34
+ double precision tmp1, tmp2, tmp3
+
+
+
+ r43 = ( 4.0d+00 / 3.0d+00 )
+ c1345 = c1 * c3 * c4 * c5
+ c34 = c3 * c4
+
+ do j = jst, jend
+ do i = ist, iend
+
+c---------------------------------------------------------------------
+c form the block daigonal
+c---------------------------------------------------------------------
+ tmp1 = 1.0d+00 / u(1,i,j,k)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ d(1,1,i,j) = 1.0d+00
+ > + dt * 2.0d+00 * ( tx1 * dx1
+ > + ty1 * dy1
+ > + tz1 * dz1 )
+ d(1,2,i,j) = 0.0d+00
+ d(1,3,i,j) = 0.0d+00
+ d(1,4,i,j) = 0.0d+00
+ d(1,5,i,j) = 0.0d+00
+
+ d(2,1,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) )
+ > + ty1 * ( - c34 * tmp2 * u(2,i,j,k) )
+ > + tz1 * ( - c34 * tmp2 * u(2,i,j,k) ) )
+ d(2,2,i,j) = 1.0d+00
+ > + dt * 2.0d+00
+ > * ( tx1 * r43 * c34 * tmp1
+ > + ty1 * c34 * tmp1
+ > + tz1 * c34 * tmp1 )
+ > + dt * 2.0d+00 * ( tx1 * dx2
+ > + ty1 * dy2
+ > + tz1 * dz2 )
+ d(2,3,i,j) = 0.0d+00
+ d(2,4,i,j) = 0.0d+00
+ d(2,5,i,j) = 0.0d+00
+
+ d(3,1,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( - c34 * tmp2 * u(3,i,j,k) )
+ > + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) )
+ > + tz1 * ( - c34 * tmp2 * u(3,i,j,k) ) )
+ d(3,2,i,j) = 0.0d+00
+ d(3,3,i,j) = 1.0d+00
+ > + dt * 2.0d+00
+ > * ( tx1 * c34 * tmp1
+ > + ty1 * r43 * c34 * tmp1
+ > + tz1 * c34 * tmp1 )
+ > + dt * 2.0d+00 * ( tx1 * dx3
+ > + ty1 * dy3
+ > + tz1 * dz3 )
+ d(3,4,i,j) = 0.0d+00
+ d(3,5,i,j) = 0.0d+00
+
+ d(4,1,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( - c34 * tmp2 * u(4,i,j,k) )
+ > + ty1 * ( - c34 * tmp2 * u(4,i,j,k) )
+ > + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) )
+ d(4,2,i,j) = 0.0d+00
+ d(4,3,i,j) = 0.0d+00
+ d(4,4,i,j) = 1.0d+00
+ > + dt * 2.0d+00
+ > * ( tx1 * c34 * tmp1
+ > + ty1 * c34 * tmp1
+ > + tz1 * r43 * c34 * tmp1 )
+ > + dt * 2.0d+00 * ( tx1 * dx4
+ > + ty1 * dy4
+ > + tz1 * dz4 )
+ d(4,5,i,j) = 0.0d+00
+
+ d(5,1,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+ > - ( c1345 ) * tmp2 * u(5,i,j,k) )
+ > + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+ > - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+ > - ( c1345 ) * tmp2 * u(5,i,j,k) )
+ > + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
+ > - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
+ > - ( c1345 ) * tmp2 * u(5,i,j,k) ) )
+ d(5,2,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k)
+ > + ty1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k)
+ > + tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) )
+ d(5,3,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k)
+ > + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k)
+ > + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) )
+ d(5,4,i,j) = dt * 2.0d+00
+ > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
+ > + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
+ > + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) )
+ d(5,5,i,j) = 1.0d+00
+ > + dt * 2.0d+00 * ( tx1 * c1345 * tmp1
+ > + ty1 * c1345 * tmp1
+ > + tz1 * c1345 * tmp1 )
+ > + dt * 2.0d+00 * ( tx1 * dx5
+ > + ty1 * dy5
+ > + tz1 * dz5 )
+
+c---------------------------------------------------------------------
+c form the first block sub-diagonal
+c---------------------------------------------------------------------
+ tmp1 = 1.0d+00 / u(1,i+1,j,k)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ a(1,1,i,j) = - dt * tx1 * dx1
+ a(1,2,i,j) = dt * tx2
+ a(1,3,i,j) = 0.0d+00
+ a(1,4,i,j) = 0.0d+00
+ a(1,5,i,j) = 0.0d+00
+
+ a(2,1,i,j) = dt * tx2
+ > * ( - ( u(2,i+1,j,k) * tmp1 ) ** 2
+ > + c2 * 0.50d+00 * ( u(2,i+1,j,k) * u(2,i+1,j,k)
+ > + u(3,i+1,j,k) * u(3,i+1,j,k)
+ > + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2 )
+ > - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i+1,j,k) )
+ a(2,2,i,j) = dt * tx2
+ > * ( ( 2.0d+00 - c2 ) * ( u(2,i+1,j,k) * tmp1 ) )
+ > - dt * tx1 * ( r43 * c34 * tmp1 )
+ > - dt * tx1 * dx2
+ a(2,3,i,j) = dt * tx2
+ > * ( - c2 * ( u(3,i+1,j,k) * tmp1 ) )
+ a(2,4,i,j) = dt * tx2
+ > * ( - c2 * ( u(4,i+1,j,k) * tmp1 ) )
+ a(2,5,i,j) = dt * tx2 * c2
+
+ a(3,1,i,j) = dt * tx2
+ > * ( - ( u(2,i+1,j,k) * u(3,i+1,j,k) ) * tmp2 )
+ > - dt * tx1 * ( - c34 * tmp2 * u(3,i+1,j,k) )
+ a(3,2,i,j) = dt * tx2 * ( u(3,i+1,j,k) * tmp1 )
+ a(3,3,i,j) = dt * tx2 * ( u(2,i+1,j,k) * tmp1 )
+ > - dt * tx1 * ( c34 * tmp1 )
+ > - dt * tx1 * dx3
+ a(3,4,i,j) = 0.0d+00
+ a(3,5,i,j) = 0.0d+00
+
+ a(4,1,i,j) = dt * tx2
+ > * ( - ( u(2,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 )
+ > - dt * tx1 * ( - c34 * tmp2 * u(4,i+1,j,k) )
+ a(4,2,i,j) = dt * tx2 * ( u(4,i+1,j,k) * tmp1 )
+ a(4,3,i,j) = 0.0d+00
+ a(4,4,i,j) = dt * tx2 * ( u(2,i+1,j,k) * tmp1 )
+ > - dt * tx1 * ( c34 * tmp1 )
+ > - dt * tx1 * dx4
+ a(4,5,i,j) = 0.0d+00
+
+ a(5,1,i,j) = dt * tx2
+ > * ( ( c2 * ( u(2,i+1,j,k) * u(2,i+1,j,k)
+ > + u(3,i+1,j,k) * u(3,i+1,j,k)
+ > + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2
+ > - c1 * ( u(5,i+1,j,k) * tmp1 ) )
+ > * ( u(2,i+1,j,k) * tmp1 ) )
+ > - dt * tx1
+ > * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i+1,j,k)**2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(3,i+1,j,k)**2 )
+ > - ( c34 - c1345 ) * tmp3 * ( u(4,i+1,j,k)**2 )
+ > - c1345 * tmp2 * u(5,i+1,j,k) )
+ a(5,2,i,j) = dt * tx2
+ > * ( c1 * ( u(5,i+1,j,k) * tmp1 )
+ > - 0.50d+00 * c2
+ > * ( ( 3.0d+00*u(2,i+1,j,k)*u(2,i+1,j,k)
+ > + u(3,i+1,j,k)*u(3,i+1,j,k)
+ > + u(4,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 ) )
+ > - dt * tx1
+ > * ( r43*c34 - c1345 ) * tmp2 * u(2,i+1,j,k)
+ a(5,3,i,j) = dt * tx2
+ > * ( - c2 * ( u(3,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 )
+ > - dt * tx1
+ > * ( c34 - c1345 ) * tmp2 * u(3,i+1,j,k)
+ a(5,4,i,j) = dt * tx2
+ > * ( - c2 * ( u(4,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 )
+ > - dt * tx1
+ > * ( c34 - c1345 ) * tmp2 * u(4,i+1,j,k)
+ a(5,5,i,j) = dt * tx2
+ > * ( c1 * ( u(2,i+1,j,k) * tmp1 ) )
+ > - dt * tx1 * c1345 * tmp1
+ > - dt * tx1 * dx5
+
+c---------------------------------------------------------------------
+c form the second block sub-diagonal
+c---------------------------------------------------------------------
+ tmp1 = 1.0d+00 / u(1,i,j+1,k)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ b(1,1,i,j) = - dt * ty1 * dy1
+ b(1,2,i,j) = 0.0d+00
+ b(1,3,i,j) = dt * ty2
+ b(1,4,i,j) = 0.0d+00
+ b(1,5,i,j) = 0.0d+00
+
+ b(2,1,i,j) = dt * ty2
+ > * ( - ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 )
+ > - dt * ty1 * ( - c34 * tmp2 * u(2,i,j+1,k) )
+ b(2,2,i,j) = dt * ty2 * ( u(3,i,j+1,k) * tmp1 )
+ > - dt * ty1 * ( c34 * tmp1 )
+ > - dt * ty1 * dy2
+ b(2,3,i,j) = dt * ty2 * ( u(2,i,j+1,k) * tmp1 )
+ b(2,4,i,j) = 0.0d+00
+ b(2,5,i,j) = 0.0d+00
+
+ b(3,1,i,j) = dt * ty2
+ > * ( - ( u(3,i,j+1,k) * tmp1 ) ** 2
+ > + 0.50d+00 * c2 * ( ( u(2,i,j+1,k) * u(2,i,j+1,k)
+ > + u(3,i,j+1,k) * u(3,i,j+1,k)
+ > + u(4,i,j+1,k) * u(4,i,j+1,k) )
+ > * tmp2 ) )
+ > - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j+1,k) )
+ b(3,2,i,j) = dt * ty2
+ > * ( - c2 * ( u(2,i,j+1,k) * tmp1 ) )
+ b(3,3,i,j) = dt * ty2 * ( ( 2.0d+00 - c2 )
+ > * ( u(3,i,j+1,k) * tmp1 ) )
+ > - dt * ty1 * ( r43 * c34 * tmp1 )
+ > - dt * ty1 * dy3
+ b(3,4,i,j) = dt * ty2
+ > * ( - c2 * ( u(4,i,j+1,k) * tmp1 ) )
+ b(3,5,i,j) = dt * ty2 * c2
+
+ b(4,1,i,j) = dt * ty2
+ > * ( - ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 )
+ > - dt * ty1 * ( - c34 * tmp2 * u(4,i,j+1,k) )
+ b(4,2,i,j) = 0.0d+00
+ b(4,3,i,j) = dt * ty2 * ( u(4,i,j+1,k) * tmp1 )
+ b(4,4,i,j) = dt * ty2 * ( u(3,i,j+1,k) * tmp1 )
+ > - dt * ty1 * ( c34 * tmp1 )
+ > - dt * ty1 * dy4
+ b(4,5,i,j) = 0.0d+00
+
+ b(5,1,i,j) = dt * ty2
+ > * ( ( c2 * ( u(2,i,j+1,k) * u(2,i,j+1,k)
+ > + u(3,i,j+1,k) * u(3,i,j+1,k)
+ > + u(4,i,j+1,k) * u(4,i,j+1,k) ) * tmp2
+ > - c1 * ( u(5,i,j+1,k) * tmp1 ) )
+ > * ( u(3,i,j+1,k) * tmp1 ) )
+ > - dt * ty1
+ > * ( - ( c34 - c1345 )*tmp3*(u(2,i,j+1,k)**2)
+ > - ( r43*c34 - c1345 )*tmp3*(u(3,i,j+1,k)**2)
+ > - ( c34 - c1345 )*tmp3*(u(4,i,j+1,k)**2)
+ > - c1345*tmp2*u(5,i,j+1,k) )
+ b(5,2,i,j) = dt * ty2
+ > * ( - c2 * ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 )
+ > - dt * ty1
+ > * ( c34 - c1345 ) * tmp2 * u(2,i,j+1,k)
+ b(5,3,i,j) = dt * ty2
+ > * ( c1 * ( u(5,i,j+1,k) * tmp1 )
+ > - 0.50d+00 * c2
+ > * ( ( u(2,i,j+1,k)*u(2,i,j+1,k)
+ > + 3.0d+00 * u(3,i,j+1,k)*u(3,i,j+1,k)
+ > + u(4,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 ) )
+ > - dt * ty1
+ > * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j+1,k)
+ b(5,4,i,j) = dt * ty2
+ > * ( - c2 * ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 )
+ > - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j+1,k)
+ b(5,5,i,j) = dt * ty2
+ > * ( c1 * ( u(3,i,j+1,k) * tmp1 ) )
+ > - dt * ty1 * c1345 * tmp1
+ > - dt * ty1 * dy5
+
+c---------------------------------------------------------------------
+c form the third block sub-diagonal
+c---------------------------------------------------------------------
+ tmp1 = 1.0d+00 / u(1,i,j,k+1)
+ tmp2 = tmp1 * tmp1
+ tmp3 = tmp1 * tmp2
+
+ c(1,1,i,j) = - dt * tz1 * dz1
+ c(1,2,i,j) = 0.0d+00
+ c(1,3,i,j) = 0.0d+00
+ c(1,4,i,j) = dt * tz2
+ c(1,5,i,j) = 0.0d+00
+
+ c(2,1,i,j) = dt * tz2
+ > * ( - ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
+ > - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k+1) )
+ c(2,2,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 )
+ > - dt * tz1 * c34 * tmp1
+ > - dt * tz1 * dz2
+ c(2,3,i,j) = 0.0d+00
+ c(2,4,i,j) = dt * tz2 * ( u(2,i,j,k+1) * tmp1 )
+ c(2,5,i,j) = 0.0d+00
+
+ c(3,1,i,j) = dt * tz2
+ > * ( - ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
+ > - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k+1) )
+ c(3,2,i,j) = 0.0d+00
+ c(3,3,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 )
+ > - dt * tz1 * ( c34 * tmp1 )
+ > - dt * tz1 * dz3
+ c(3,4,i,j) = dt * tz2 * ( u(3,i,j,k+1) * tmp1 )
+ c(3,5,i,j) = 0.0d+00
+
+ c(4,1,i,j) = dt * tz2
+ > * ( - ( u(4,i,j,k+1) * tmp1 ) ** 2
+ > + 0.50d+00 * c2
+ > * ( ( u(2,i,j,k+1) * u(2,i,j,k+1)
+ > + u(3,i,j,k+1) * u(3,i,j,k+1)
+ > + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2 ) )
+ > - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k+1) )
+ c(4,2,i,j) = dt * tz2
+ > * ( - c2 * ( u(2,i,j,k+1) * tmp1 ) )
+ c(4,3,i,j) = dt * tz2
+ > * ( - c2 * ( u(3,i,j,k+1) * tmp1 ) )
+ c(4,4,i,j) = dt * tz2 * ( 2.0d+00 - c2 )
+ > * ( u(4,i,j,k+1) * tmp1 )
+ > - dt * tz1 * ( r43 * c34 * tmp1 )
+ > - dt * tz1 * dz4
+ c(4,5,i,j) = dt * tz2 * c2
+
+ c(5,1,i,j) = dt * tz2
+ > * ( ( c2 * ( u(2,i,j,k+1) * u(2,i,j,k+1)
+ > + u(3,i,j,k+1) * u(3,i,j,k+1)
+ > + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2
+ > - c1 * ( u(5,i,j,k+1) * tmp1 ) )
+ > * ( u(4,i,j,k+1) * tmp1 ) )
+ > - dt * tz1
+ > * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k+1)**2)
+ > - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k+1)**2)
+ > - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k+1)**2)
+ > - c1345 * tmp2 * u(5,i,j,k+1) )
+ c(5,2,i,j) = dt * tz2
+ > * ( - c2 * ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
+ > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k+1)
+ c(5,3,i,j) = dt * tz2
+ > * ( - c2 * ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
+ > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k+1)
+ c(5,4,i,j) = dt * tz2
+ > * ( c1 * ( u(5,i,j,k+1) * tmp1 )
+ > - 0.50d+00 * c2
+ > * ( ( u(2,i,j,k+1)*u(2,i,j,k+1)
+ > + u(3,i,j,k+1)*u(3,i,j,k+1)
+ > + 3.0d+00*u(4,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) )
+ > - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k+1)
+ c(5,5,i,j) = dt * tz2
+ > * ( c1 * ( u(4,i,j,k+1) * tmp1 ) )
+ > - dt * tz1 * c1345 * tmp1
+ > - dt * tz1 * dz5
+
+ end do
+ end do
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ subroutine l2norm ( ldx, ldy, ldz,
+ > nx0, ny0, nz0,
+ > ist, iend,
+ > jst, jend,
+ > v, sum )
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c to compute the l2-norm of vector v.
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ integer ldx, ldy, ldz
+ integer nx0, ny0, nz0
+ integer ist, iend
+ integer jst, jend
+ double precision v(5,-1:ldx+2,-1:ldy+2,*), sum(5)
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, k, m
+ double precision dummy(5)
+
+ integer IERROR
+
+
+ do m = 1, 5
+ dummy(m) = 0.0d+00
+ end do
+
+ do k = 2, nz0-1
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ dummy(m) = dummy(m) + v(m,i,j,k) * v(m,i,j,k)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c compute the global sum of individual contributions to dot product.
+c---------------------------------------------------------------------
+ call MPI_ALLREDUCE( dummy,
+ > sum,
+ > 5,
+ > dp_type,
+ > MPI_SUM,
+ > MPI_COMM_WORLD,
+ > IERROR )
+
+ do m = 1, 5
+ sum(m) = sqrt ( sum(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) )
+ end do
+
+ return
+ end
--- /dev/null
+!-------------------------------------------------------------------------!
+! !
+! N A S P A R A L L E L B E N C H M A R K S 3.3 !
+! !
+! L U !
+! !
+!-------------------------------------------------------------------------!
+! !
+! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
+! It is described in NAS Technical Reports 95-020 and 02-007 !
+! !
+! Permission to use, copy, distribute and modify this software !
+! for any purpose with or without fee is hereby granted. We !
+! request, however, that all derived work reference the NAS !
+! Parallel Benchmarks 3.3. This software is provided "as is" !
+! without express or implied warranty. !
+! !
+! Information on NPB 3.3, including the technical report, the !
+! original specifications, source code, results and information !
+! on how to submit new results, is available at: !
+! !
+! http://www.nas.nasa.gov/Software/NPB/ !
+! !
+! Send comments or suggestions to npb@nas.nasa.gov !
+! !
+! NAS Parallel Benchmarks Group !
+! NASA Ames Research Center !
+! Mail Stop: T27A-1 !
+! Moffett Field, CA 94035-1000 !
+! !
+! E-mail: npb@nas.nasa.gov !
+! Fax: (650) 604-3957 !
+! !
+!-------------------------------------------------------------------------!
+
+c---------------------------------------------------------------------
+c
+c Authors: S. Weeratunga
+c V. Venkatakrishnan
+c E. Barszcz
+c M. Yarrow
+c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+ program applu
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c driver for the performance evaluation of the solver for
+c five coupled parabolic/elliptic partial differential equations.
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'applu.incl'
+ character class
+ logical verified
+ double precision mflops
+ integer ierr
+
+c---------------------------------------------------------------------
+c initialize communications
+c---------------------------------------------------------------------
+ call init_comm()
+
+c---------------------------------------------------------------------
+c read input data
+c---------------------------------------------------------------------
+ call read_input()
+
+c---------------------------------------------------------------------
+c set up processor grid
+c---------------------------------------------------------------------
+ call proc_grid()
+
+c---------------------------------------------------------------------
+c determine the neighbors
+c---------------------------------------------------------------------
+ call neighbors()
+
+c---------------------------------------------------------------------
+c set up sub-domain sizes
+c---------------------------------------------------------------------
+ call subdomain()
+
+c---------------------------------------------------------------------
+c set up coefficients
+c---------------------------------------------------------------------
+ call setcoeff()
+
+c---------------------------------------------------------------------
+c set the masks required for comm
+c---------------------------------------------------------------------
+ call sethyper()
+
+c---------------------------------------------------------------------
+c set the boundary values for dependent variables
+c---------------------------------------------------------------------
+ call setbv()
+
+c---------------------------------------------------------------------
+c set the initial values for dependent variables
+c---------------------------------------------------------------------
+ call setiv()
+
+c---------------------------------------------------------------------
+c compute the forcing term based on prescribed exact solution
+c---------------------------------------------------------------------
+ call erhs()
+
+c---------------------------------------------------------------------
+c perform one SSOR iteration to touch all data and program pages
+c---------------------------------------------------------------------
+ call ssor(1)
+
+c---------------------------------------------------------------------
+c reset the boundary and initial values
+c---------------------------------------------------------------------
+ call setbv()
+ call setiv()
+
+c---------------------------------------------------------------------
+c perform the SSOR iterations
+c---------------------------------------------------------------------
+ call ssor(itmax)
+
+c---------------------------------------------------------------------
+c compute the solution error
+c---------------------------------------------------------------------
+ call error()
+
+c---------------------------------------------------------------------
+c compute the surface integral
+c---------------------------------------------------------------------
+ call pintgr()
+
+c---------------------------------------------------------------------
+c verification test
+c---------------------------------------------------------------------
+ IF (id.eq.0) THEN
+ call verify ( rsdnm, errnm, frc, class, verified )
+ mflops = float(itmax)*(1984.77*float( nx0 )
+ > *float( ny0 )
+ > *float( nz0 )
+ > -10923.3*(float( nx0+ny0+nz0 )/3.)**2
+ > +27770.9* float( nx0+ny0+nz0 )/3.
+ > -144010.)
+ > / (maxtime*1000000.)
+
+ call print_results('LU', class, nx0,
+ > ny0, nz0, itmax, nnodes_compiled,
+ > num, maxtime, mflops, ' floating point', verified,
+ > npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6,
+ > '(none)')
+
+ END IF
+
+ call mpi_finalize(ierr)
+ end
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'mpif.h'
+
+ integer node, no_nodes, root, comm_setup,
+ > comm_solve, comm_rhs, dp_type
+ common /mpistuff/ node, no_nodes, root, comm_setup,
+ > comm_solve, comm_rhs, dp_type
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine neighbors ()
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c figure out the neighbors and their wrap numbers for each processor
+c---------------------------------------------------------------------
+
+ south = -1
+ east = -1
+ north = -1
+ west = -1
+
+ if (row.gt.1) then
+ north = id -1
+ else
+ north = -1
+ end if
+
+ if (row.lt.xdim) then
+ south = id + 1
+ else
+ south = -1
+ end if
+
+ if (col.gt.1) then
+ west = id- xdim
+ else
+ west = -1
+ end if
+
+ if (col.lt.ydim) then
+ east = id + xdim
+ else
+ east = -1
+ end if
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ integer function nodedim(num)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c compute the exponent where num = 2**nodedim
+c NOTE: assumes a power-of-two number of nodes
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+c---------------------------------------------------------------------
+c input parameters
+c---------------------------------------------------------------------
+ integer num
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ double precision fnum
+
+
+ fnum = dble(num)
+ nodedim = log(fnum)/log(2.0d+0) + 0.00001
+
+ return
+ end
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine pintgr
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, k
+ integer ibeg, ifin, ifin1
+ integer jbeg, jfin, jfin1
+ integer iglob, iglob1, iglob2
+ integer jglob, jglob1, jglob2
+ integer ind1, ind2
+ double precision phi1(0:isiz2+1,0:isiz3+1),
+ > phi2(0:isiz2+1,0:isiz3+1)
+ double precision frc1, frc2, frc3
+ double precision dummy
+
+ integer IERROR
+
+
+c---------------------------------------------------------------------
+c set up the sub-domains for integeration in each processor
+c---------------------------------------------------------------------
+ ibeg = nx + 1
+ ifin = 0
+ iglob1 = ipt + 1
+ iglob2 = ipt + nx
+ if (iglob1.ge.ii1.and.iglob2.lt.ii2+nx) ibeg = 1
+ if (iglob1.gt.ii1-nx.and.iglob2.le.ii2) ifin = nx
+ if (ii1.ge.iglob1.and.ii1.le.iglob2) ibeg = ii1 - ipt
+ if (ii2.ge.iglob1.and.ii2.le.iglob2) ifin = ii2 - ipt
+ jbeg = ny + 1
+ jfin = 0
+ jglob1 = jpt + 1
+ jglob2 = jpt + ny
+ if (jglob1.ge.ji1.and.jglob2.lt.ji2+ny) jbeg = 1
+ if (jglob1.gt.ji1-ny.and.jglob2.le.ji2) jfin = ny
+ if (ji1.ge.jglob1.and.ji1.le.jglob2) jbeg = ji1 - jpt
+ if (ji2.ge.jglob1.and.ji2.le.jglob2) jfin = ji2 - jpt
+ ifin1 = ifin
+ jfin1 = jfin
+ if (ipt + ifin1.eq.ii2) ifin1 = ifin -1
+ if (jpt + jfin1.eq.ji2) jfin1 = jfin -1
+
+c---------------------------------------------------------------------
+c initialize
+c---------------------------------------------------------------------
+ do i = 0,isiz2+1
+ do k = 0,isiz3+1
+ phi1(i,k) = 0.
+ phi2(i,k) = 0.
+ end do
+ end do
+
+ do j = jbeg,jfin
+ jglob = jpt + j
+ do i = ibeg,ifin
+ iglob = ipt + i
+
+ k = ki1
+
+ phi1(i,j) = c2*( u(5,i,j,k)
+ > - 0.50d+00 * ( u(2,i,j,k) ** 2
+ > + u(3,i,j,k) ** 2
+ > + u(4,i,j,k) ** 2 )
+ > / u(1,i,j,k) )
+
+ k = ki2
+
+ phi2(i,j) = c2*( u(5,i,j,k)
+ > - 0.50d+00 * ( u(2,i,j,k) ** 2
+ > + u(3,i,j,k) ** 2
+ > + u(4,i,j,k) ** 2 )
+ > / u(1,i,j,k) )
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c communicate in i and j directions
+c---------------------------------------------------------------------
+ call exchange_4(phi1,phi2,ibeg,ifin1,jbeg,jfin1)
+
+ frc1 = 0.0d+00
+
+ do j = jbeg,jfin1
+ do i = ibeg, ifin1
+ frc1 = frc1 + ( phi1(i,j)
+ > + phi1(i+1,j)
+ > + phi1(i,j+1)
+ > + phi1(i+1,j+1)
+ > + phi2(i,j)
+ > + phi2(i+1,j)
+ > + phi2(i,j+1)
+ > + phi2(i+1,j+1) )
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c compute the global sum of individual contributions to frc1
+c---------------------------------------------------------------------
+ dummy = frc1
+ call MPI_ALLREDUCE( dummy,
+ > frc1,
+ > 1,
+ > dp_type,
+ > MPI_SUM,
+ > MPI_COMM_WORLD,
+ > IERROR )
+
+ frc1 = dxi * deta * frc1
+
+c---------------------------------------------------------------------
+c initialize
+c---------------------------------------------------------------------
+ do i = 0,isiz2+1
+ do k = 0,isiz3+1
+ phi1(i,k) = 0.
+ phi2(i,k) = 0.
+ end do
+ end do
+ jglob = jpt + jbeg
+ ind1 = 0
+ if (jglob.eq.ji1) then
+ ind1 = 1
+ do k = ki1, ki2
+ do i = ibeg, ifin
+ iglob = ipt + i
+ phi1(i,k) = c2*( u(5,i,jbeg,k)
+ > - 0.50d+00 * ( u(2,i,jbeg,k) ** 2
+ > + u(3,i,jbeg,k) ** 2
+ > + u(4,i,jbeg,k) ** 2 )
+ > / u(1,i,jbeg,k) )
+ end do
+ end do
+ end if
+
+ jglob = jpt + jfin
+ ind2 = 0
+ if (jglob.eq.ji2) then
+ ind2 = 1
+ do k = ki1, ki2
+ do i = ibeg, ifin
+ iglob = ipt + i
+ phi2(i,k) = c2*( u(5,i,jfin,k)
+ > - 0.50d+00 * ( u(2,i,jfin,k) ** 2
+ > + u(3,i,jfin,k) ** 2
+ > + u(4,i,jfin,k) ** 2 )
+ > / u(1,i,jfin,k) )
+ end do
+ end do
+ end if
+
+c---------------------------------------------------------------------
+c communicate in i direction
+c---------------------------------------------------------------------
+ if (ind1.eq.1) then
+ call exchange_5(phi1,ibeg,ifin1)
+ end if
+ if (ind2.eq.1) then
+ call exchange_5 (phi2,ibeg,ifin1)
+ end if
+
+ frc2 = 0.0d+00
+ do k = ki1, ki2-1
+ do i = ibeg, ifin1
+ frc2 = frc2 + ( phi1(i,k)
+ > + phi1(i+1,k)
+ > + phi1(i,k+1)
+ > + phi1(i+1,k+1)
+ > + phi2(i,k)
+ > + phi2(i+1,k)
+ > + phi2(i,k+1)
+ > + phi2(i+1,k+1) )
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c compute the global sum of individual contributions to frc2
+c---------------------------------------------------------------------
+ dummy = frc2
+ call MPI_ALLREDUCE( dummy,
+ > frc2,
+ > 1,
+ > dp_type,
+ > MPI_SUM,
+ > MPI_COMM_WORLD,
+ > IERROR )
+
+ frc2 = dxi * dzeta * frc2
+
+c---------------------------------------------------------------------
+c initialize
+c---------------------------------------------------------------------
+ do i = 0,isiz2+1
+ do k = 0,isiz3+1
+ phi1(i,k) = 0.
+ phi2(i,k) = 0.
+ end do
+ end do
+ iglob = ipt + ibeg
+ ind1 = 0
+ if (iglob.eq.ii1) then
+ ind1 = 1
+ do k = ki1, ki2
+ do j = jbeg, jfin
+ jglob = jpt + j
+ phi1(j,k) = c2*( u(5,ibeg,j,k)
+ > - 0.50d+00 * ( u(2,ibeg,j,k) ** 2
+ > + u(3,ibeg,j,k) ** 2
+ > + u(4,ibeg,j,k) ** 2 )
+ > / u(1,ibeg,j,k) )
+ end do
+ end do
+ end if
+
+ iglob = ipt + ifin
+ ind2 = 0
+ if (iglob.eq.ii2) then
+ ind2 = 1
+ do k = ki1, ki2
+ do j = jbeg, jfin
+ jglob = jpt + j
+ phi2(j,k) = c2*( u(5,ifin,j,k)
+ > - 0.50d+00 * ( u(2,ifin,j,k) ** 2
+ > + u(3,ifin,j,k) ** 2
+ > + u(4,ifin,j,k) ** 2 )
+ > / u(1,ifin,j,k) )
+ end do
+ end do
+ end if
+
+c---------------------------------------------------------------------
+c communicate in j direction
+c---------------------------------------------------------------------
+ if (ind1.eq.1) then
+ call exchange_6(phi1,jbeg,jfin1)
+ end if
+ if (ind2.eq.1) then
+ call exchange_6(phi2,jbeg,jfin1)
+ end if
+
+ frc3 = 0.0d+00
+
+ do k = ki1, ki2-1
+ do j = jbeg, jfin1
+ frc3 = frc3 + ( phi1(j,k)
+ > + phi1(j+1,k)
+ > + phi1(j,k+1)
+ > + phi1(j+1,k+1)
+ > + phi2(j,k)
+ > + phi2(j+1,k)
+ > + phi2(j,k+1)
+ > + phi2(j+1,k+1) )
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c compute the global sum of individual contributions to frc3
+c---------------------------------------------------------------------
+ dummy = frc3
+ call MPI_ALLREDUCE( dummy,
+ > frc3,
+ > 1,
+ > dp_type,
+ > MPI_SUM,
+ > MPI_COMM_WORLD,
+ > IERROR )
+
+ frc3 = deta * dzeta * frc3
+ frc = 0.25d+00 * ( frc1 + frc2 + frc3 )
+c if (id.eq.0) write (*,1001) frc
+
+ return
+
+ 1001 format (//5x,'surface integral = ',1pe12.5//)
+
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine proc_grid
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c set up a two-d grid for processors: column-major ordering of unknowns
+c NOTE: assumes a power-of-two number of processors
+c
+c---------------------------------------------------------------------
+
+ xdim = 2**(ndim/2)
+ if (mod(ndim,2).eq.1) xdim = xdim + xdim
+ ydim = num/xdim
+
+ row = mod(id,xdim) + 1
+ col = id/xdim + 1
+
+
+ return
+ end
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine read_input
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+ integer IERROR, fstatus, nnodes
+
+
+c---------------------------------------------------------------------
+c only root reads the input file
+c if input file does not exist, it uses defaults
+c ipr = 1 for detailed progress output
+c inorm = how often the norm is printed (once every inorm iterations)
+c itmax = number of pseudo time steps
+c dt = time step
+c omega 1 over-relaxation factor for SSOR
+c tolrsd = steady state residual tolerance levels
+c nx, ny, nz = number of grid points in x, y, z directions
+c---------------------------------------------------------------------
+ ROOT = 0
+ if (id .eq. ROOT) then
+
+ write(*, 1000)
+
+ open (unit=3,file='inputlu.data',status='old',
+ > access='sequential',form='formatted', iostat=fstatus)
+ if (fstatus .eq. 0) then
+
+ write(*, *) 'Reading from input file inputlu.data'
+
+ read (3,*)
+ read (3,*)
+ read (3,*) ipr, inorm
+ read (3,*)
+ read (3,*)
+ read (3,*) itmax
+ read (3,*)
+ read (3,*)
+ read (3,*) dt
+ read (3,*)
+ read (3,*)
+ read (3,*) omega
+ read (3,*)
+ read (3,*)
+ read (3,*) tolrsd(1),tolrsd(2),tolrsd(3),tolrsd(4),tolrsd(5)
+ read (3,*)
+ read (3,*)
+ read (3,*) nx0, ny0, nz0
+ close(3)
+ else
+ ipr = ipr_default
+ inorm = inorm_default
+ itmax = itmax_default
+ dt = dt_default
+ omega = omega_default
+ tolrsd(1) = tolrsd1_def
+ tolrsd(2) = tolrsd2_def
+ tolrsd(3) = tolrsd3_def
+ tolrsd(4) = tolrsd4_def
+ tolrsd(5) = tolrsd5_def
+ nx0 = isiz01
+ ny0 = isiz02
+ nz0 = isiz03
+ endif
+
+c---------------------------------------------------------------------
+c check problem size
+c---------------------------------------------------------------------
+ call MPI_COMM_SIZE(MPI_COMM_WORLD, nnodes, ierror)
+ if (nnodes .ne. nnodes_compiled) then
+ write (*, 2000) nnodes, nnodes_compiled
+ 2000 format (5x,'Warning: program is running on',i3,' processors'
+ > /5x,'but was compiled for ', i3)
+ endif
+
+ if ( ( nx0 .lt. 4 ) .or.
+ > ( ny0 .lt. 4 ) .or.
+ > ( nz0 .lt. 4 ) ) then
+
+ write (*,2001)
+ 2001 format (5x,'PROBLEM SIZE IS TOO SMALL - ',
+ > /5x,'SET EACH OF NX, NY AND NZ AT LEAST EQUAL TO 5')
+ CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR )
+
+ end if
+
+ if ( ( nx0 .gt. isiz01 ) .or.
+ > ( ny0 .gt. isiz02 ) .or.
+ > ( nz0 .gt. isiz03 ) ) then
+
+ write (*,2002)
+ 2002 format (5x,'PROBLEM SIZE IS TOO LARGE - ',
+ > /5x,'NX, NY AND NZ SHOULD BE LESS THAN OR EQUAL TO ',
+ > /5x,'ISIZ01, ISIZ02 AND ISIZ03 RESPECTIVELY')
+ CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR )
+
+ end if
+
+
+ write(*, 1001) nx0, ny0, nz0
+ write(*, 1002) itmax
+ write(*, 1003) nnodes
+
+ 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- LU Benchmark',/)
+ 1001 format(' Size: ', i4, 'x', i4, 'x', i4)
+ 1002 format(' Iterations: ', i4)
+ 1003 format(' Number of processes: ', i5, /)
+
+
+
+ end if
+
+ call bcast_inputs
+
+ return
+ end
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute the right hand sides
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, k, m
+ integer iex
+ integer L1, L2
+ integer ist1, iend1
+ integer jst1, jend1
+ double precision q
+ double precision u21, u31, u41
+ double precision tmp
+ double precision u21i, u31i, u41i, u51i
+ double precision u21j, u31j, u41j, u51j
+ double precision u21k, u31k, u41k, u51k
+ double precision u21im1, u31im1, u41im1, u51im1
+ double precision u21jm1, u31jm1, u41jm1, u51jm1
+ double precision u21km1, u31km1, u41km1, u51km1
+
+
+
+ do k = 1, nz
+ do j = 1, ny
+ do i = 1, nx
+ do m = 1, 5
+ rsd(m,i,j,k) = - frct(m,i,j,k)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c xi-direction flux differences
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c iex = flag : iex = 0 north/south communication
+c : iex = 1 east/west communication
+c---------------------------------------------------------------------
+ iex = 0
+
+c---------------------------------------------------------------------
+c communicate and receive/send two rows of data
+c---------------------------------------------------------------------
+ call exchange_3(u,iex)
+
+ L1 = 0
+ if (north.eq.-1) L1 = 1
+ L2 = nx + 1
+ if (south.eq.-1) L2 = nx
+
+ ist1 = 1
+ iend1 = nx
+ if (north.eq.-1) ist1 = 4
+ if (south.eq.-1) iend1 = nx - 3
+
+ do k = 2, nz - 1
+ do j = jst, jend
+ do i = L1, L2
+ flux(1,i,j,k) = u(2,i,j,k)
+ u21 = u(2,i,j,k) / u(1,i,j,k)
+
+ q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k)
+ > + u(3,i,j,k) * u(3,i,j,k)
+ > + u(4,i,j,k) * u(4,i,j,k) )
+ > / u(1,i,j,k)
+
+ flux(2,i,j,k) = u(2,i,j,k) * u21 + c2 *
+ > ( u(5,i,j,k) - q )
+ flux(3,i,j,k) = u(3,i,j,k) * u21
+ flux(4,i,j,k) = u(4,i,j,k) * u21
+ flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u21
+ end do
+
+ do i = ist, iend
+ do m = 1, 5
+ rsd(m,i,j,k) = rsd(m,i,j,k)
+ > - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) )
+ end do
+ end do
+
+ do i = ist, L2
+ tmp = 1.0d+00 / u(1,i,j,k)
+
+ u21i = tmp * u(2,i,j,k)
+ u31i = tmp * u(3,i,j,k)
+ u41i = tmp * u(4,i,j,k)
+ u51i = tmp * u(5,i,j,k)
+
+ tmp = 1.0d+00 / u(1,i-1,j,k)
+
+ u21im1 = tmp * u(2,i-1,j,k)
+ u31im1 = tmp * u(3,i-1,j,k)
+ u41im1 = tmp * u(4,i-1,j,k)
+ u51im1 = tmp * u(5,i-1,j,k)
+
+ flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 * (u21i-u21im1)
+ flux(3,i,j,k) = tx3 * ( u31i - u31im1 )
+ flux(4,i,j,k) = tx3 * ( u41i - u41im1 )
+ flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+ > * tx3 * ( ( u21i **2 + u31i **2 + u41i **2 )
+ > - ( u21im1**2 + u31im1**2 + u41im1**2 ) )
+ > + (1.0d+00/6.0d+00)
+ > * tx3 * ( u21i**2 - u21im1**2 )
+ > + c1 * c5 * tx3 * ( u51i - u51im1 )
+ end do
+
+ do i = ist, iend
+ rsd(1,i,j,k) = rsd(1,i,j,k)
+ > + dx1 * tx1 * ( u(1,i-1,j,k)
+ > - 2.0d+00 * u(1,i,j,k)
+ > + u(1,i+1,j,k) )
+ rsd(2,i,j,k) = rsd(2,i,j,k)
+ > + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) )
+ > + dx2 * tx1 * ( u(2,i-1,j,k)
+ > - 2.0d+00 * u(2,i,j,k)
+ > + u(2,i+1,j,k) )
+ rsd(3,i,j,k) = rsd(3,i,j,k)
+ > + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) )
+ > + dx3 * tx1 * ( u(3,i-1,j,k)
+ > - 2.0d+00 * u(3,i,j,k)
+ > + u(3,i+1,j,k) )
+ rsd(4,i,j,k) = rsd(4,i,j,k)
+ > + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) )
+ > + dx4 * tx1 * ( u(4,i-1,j,k)
+ > - 2.0d+00 * u(4,i,j,k)
+ > + u(4,i+1,j,k) )
+ rsd(5,i,j,k) = rsd(5,i,j,k)
+ > + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) )
+ > + dx5 * tx1 * ( u(5,i-1,j,k)
+ > - 2.0d+00 * u(5,i,j,k)
+ > + u(5,i+1,j,k) )
+ end do
+
+c---------------------------------------------------------------------
+c Fourth-order dissipation
+c---------------------------------------------------------------------
+ IF (north.eq.-1) then
+ do m = 1, 5
+ rsd(m,2,j,k) = rsd(m,2,j,k)
+ > - dssp * ( + 5.0d+00 * u(m,2,j,k)
+ > - 4.0d+00 * u(m,3,j,k)
+ > + u(m,4,j,k) )
+ rsd(m,3,j,k) = rsd(m,3,j,k)
+ > - dssp * ( - 4.0d+00 * u(m,2,j,k)
+ > + 6.0d+00 * u(m,3,j,k)
+ > - 4.0d+00 * u(m,4,j,k)
+ > + u(m,5,j,k) )
+ end do
+ END IF
+
+ do i = ist1,iend1
+ do m = 1, 5
+ rsd(m,i,j,k) = rsd(m,i,j,k)
+ > - dssp * ( u(m,i-2,j,k)
+ > - 4.0d+00 * u(m,i-1,j,k)
+ > + 6.0d+00 * u(m,i,j,k)
+ > - 4.0d+00 * u(m,i+1,j,k)
+ > + u(m,i+2,j,k) )
+ end do
+ end do
+
+ IF (south.eq.-1) then
+ do m = 1, 5
+ rsd(m,nx-2,j,k) = rsd(m,nx-2,j,k)
+ > - dssp * ( u(m,nx-4,j,k)
+ > - 4.0d+00 * u(m,nx-3,j,k)
+ > + 6.0d+00 * u(m,nx-2,j,k)
+ > - 4.0d+00 * u(m,nx-1,j,k) )
+ rsd(m,nx-1,j,k) = rsd(m,nx-1,j,k)
+ > - dssp * ( u(m,nx-3,j,k)
+ > - 4.0d+00 * u(m,nx-2,j,k)
+ > + 5.0d+00 * u(m,nx-1,j,k) )
+ end do
+ END IF
+
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c eta-direction flux differences
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c iex = flag : iex = 0 north/south communication
+c---------------------------------------------------------------------
+ iex = 1
+
+c---------------------------------------------------------------------
+c communicate and receive/send two rows of data
+c---------------------------------------------------------------------
+ call exchange_3(u,iex)
+
+ L1 = 0
+ if (west.eq.-1) L1 = 1
+ L2 = ny + 1
+ if (east.eq.-1) L2 = ny
+
+ jst1 = 1
+ jend1 = ny
+ if (west.eq.-1) jst1 = 4
+ if (east.eq.-1) jend1 = ny - 3
+
+ do k = 2, nz - 1
+ do j = L1, L2
+ do i = ist, iend
+ flux(1,i,j,k) = u(3,i,j,k)
+ u31 = u(3,i,j,k) / u(1,i,j,k)
+
+ q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k)
+ > + u(3,i,j,k) * u(3,i,j,k)
+ > + u(4,i,j,k) * u(4,i,j,k) )
+ > / u(1,i,j,k)
+
+ flux(2,i,j,k) = u(2,i,j,k) * u31
+ flux(3,i,j,k) = u(3,i,j,k) * u31 + c2 * (u(5,i,j,k)-q)
+ flux(4,i,j,k) = u(4,i,j,k) * u31
+ flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u31
+ end do
+ end do
+
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ rsd(m,i,j,k) = rsd(m,i,j,k)
+ > - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) )
+ end do
+ end do
+ end do
+
+ do j = jst, L2
+ do i = ist, iend
+ tmp = 1.0d+00 / u(1,i,j,k)
+
+ u21j = tmp * u(2,i,j,k)
+ u31j = tmp * u(3,i,j,k)
+ u41j = tmp * u(4,i,j,k)
+ u51j = tmp * u(5,i,j,k)
+
+ tmp = 1.0d+00 / u(1,i,j-1,k)
+ u21jm1 = tmp * u(2,i,j-1,k)
+ u31jm1 = tmp * u(3,i,j-1,k)
+ u41jm1 = tmp * u(4,i,j-1,k)
+ u51jm1 = tmp * u(5,i,j-1,k)
+
+ flux(2,i,j,k) = ty3 * ( u21j - u21jm1 )
+ flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 * (u31j-u31jm1)
+ flux(4,i,j,k) = ty3 * ( u41j - u41jm1 )
+ flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+ > * ty3 * ( ( u21j **2 + u31j **2 + u41j **2 )
+ > - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) )
+ > + (1.0d+00/6.0d+00)
+ > * ty3 * ( u31j**2 - u31jm1**2 )
+ > + c1 * c5 * ty3 * ( u51j - u51jm1 )
+ end do
+ end do
+
+ do j = jst, jend
+ do i = ist, iend
+
+ rsd(1,i,j,k) = rsd(1,i,j,k)
+ > + dy1 * ty1 * ( u(1,i,j-1,k)
+ > - 2.0d+00 * u(1,i,j,k)
+ > + u(1,i,j+1,k) )
+
+ rsd(2,i,j,k) = rsd(2,i,j,k)
+ > + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) )
+ > + dy2 * ty1 * ( u(2,i,j-1,k)
+ > - 2.0d+00 * u(2,i,j,k)
+ > + u(2,i,j+1,k) )
+
+ rsd(3,i,j,k) = rsd(3,i,j,k)
+ > + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) )
+ > + dy3 * ty1 * ( u(3,i,j-1,k)
+ > - 2.0d+00 * u(3,i,j,k)
+ > + u(3,i,j+1,k) )
+
+ rsd(4,i,j,k) = rsd(4,i,j,k)
+ > + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) )
+ > + dy4 * ty1 * ( u(4,i,j-1,k)
+ > - 2.0d+00 * u(4,i,j,k)
+ > + u(4,i,j+1,k) )
+
+ rsd(5,i,j,k) = rsd(5,i,j,k)
+ > + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) )
+ > + dy5 * ty1 * ( u(5,i,j-1,k)
+ > - 2.0d+00 * u(5,i,j,k)
+ > + u(5,i,j+1,k) )
+
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c fourth-order dissipation
+c---------------------------------------------------------------------
+ IF (west.eq.-1) then
+ do i = ist, iend
+ do m = 1, 5
+ rsd(m,i,2,k) = rsd(m,i,2,k)
+ > - dssp * ( + 5.0d+00 * u(m,i,2,k)
+ > - 4.0d+00 * u(m,i,3,k)
+ > + u(m,i,4,k) )
+ rsd(m,i,3,k) = rsd(m,i,3,k)
+ > - dssp * ( - 4.0d+00 * u(m,i,2,k)
+ > + 6.0d+00 * u(m,i,3,k)
+ > - 4.0d+00 * u(m,i,4,k)
+ > + u(m,i,5,k) )
+ end do
+ end do
+ END IF
+
+ do j = jst1, jend1
+ do i = ist, iend
+ do m = 1, 5
+ rsd(m,i,j,k) = rsd(m,i,j,k)
+ > - dssp * ( u(m,i,j-2,k)
+ > - 4.0d+00 * u(m,i,j-1,k)
+ > + 6.0d+00 * u(m,i,j,k)
+ > - 4.0d+00 * u(m,i,j+1,k)
+ > + u(m,i,j+2,k) )
+ end do
+ end do
+ end do
+
+ IF (east.eq.-1) then
+ do i = ist, iend
+ do m = 1, 5
+ rsd(m,i,ny-2,k) = rsd(m,i,ny-2,k)
+ > - dssp * ( u(m,i,ny-4,k)
+ > - 4.0d+00 * u(m,i,ny-3,k)
+ > + 6.0d+00 * u(m,i,ny-2,k)
+ > - 4.0d+00 * u(m,i,ny-1,k) )
+ rsd(m,i,ny-1,k) = rsd(m,i,ny-1,k)
+ > - dssp * ( u(m,i,ny-3,k)
+ > - 4.0d+00 * u(m,i,ny-2,k)
+ > + 5.0d+00 * u(m,i,ny-1,k) )
+ end do
+ end do
+ END IF
+
+ end do
+
+c---------------------------------------------------------------------
+c zeta-direction flux differences
+c---------------------------------------------------------------------
+ do k = 1, nz
+ do j = jst, jend
+ do i = ist, iend
+ flux(1,i,j,k) = u(4,i,j,k)
+ u41 = u(4,i,j,k) / u(1,i,j,k)
+
+ q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k)
+ > + u(3,i,j,k) * u(3,i,j,k)
+ > + u(4,i,j,k) * u(4,i,j,k) )
+ > / u(1,i,j,k)
+
+ flux(2,i,j,k) = u(2,i,j,k) * u41
+ flux(3,i,j,k) = u(3,i,j,k) * u41
+ flux(4,i,j,k) = u(4,i,j,k) * u41 + c2 * (u(5,i,j,k)-q)
+ flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u41
+ end do
+ end do
+ end do
+
+ do k = 2, nz - 1
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ rsd(m,i,j,k) = rsd(m,i,j,k)
+ > - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) )
+ end do
+ end do
+ end do
+ end do
+
+ do k = 2, nz
+ do j = jst, jend
+ do i = ist, iend
+ tmp = 1.0d+00 / u(1,i,j,k)
+
+ u21k = tmp * u(2,i,j,k)
+ u31k = tmp * u(3,i,j,k)
+ u41k = tmp * u(4,i,j,k)
+ u51k = tmp * u(5,i,j,k)
+
+ tmp = 1.0d+00 / u(1,i,j,k-1)
+
+ u21km1 = tmp * u(2,i,j,k-1)
+ u31km1 = tmp * u(3,i,j,k-1)
+ u41km1 = tmp * u(4,i,j,k-1)
+ u51km1 = tmp * u(5,i,j,k-1)
+
+ flux(2,i,j,k) = tz3 * ( u21k - u21km1 )
+ flux(3,i,j,k) = tz3 * ( u31k - u31km1 )
+ flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * (u41k-u41km1)
+ flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
+ > * tz3 * ( ( u21k **2 + u31k **2 + u41k **2 )
+ > - ( u21km1**2 + u31km1**2 + u41km1**2 ) )
+ > + (1.0d+00/6.0d+00)
+ > * tz3 * ( u41k**2 - u41km1**2 )
+ > + c1 * c5 * tz3 * ( u51k - u51km1 )
+ end do
+ end do
+ end do
+
+ do k = 2, nz - 1
+ do j = jst, jend
+ do i = ist, iend
+ rsd(1,i,j,k) = rsd(1,i,j,k)
+ > + dz1 * tz1 * ( u(1,i,j,k-1)
+ > - 2.0d+00 * u(1,i,j,k)
+ > + u(1,i,j,k+1) )
+ rsd(2,i,j,k) = rsd(2,i,j,k)
+ > + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) )
+ > + dz2 * tz1 * ( u(2,i,j,k-1)
+ > - 2.0d+00 * u(2,i,j,k)
+ > + u(2,i,j,k+1) )
+ rsd(3,i,j,k) = rsd(3,i,j,k)
+ > + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) )
+ > + dz3 * tz1 * ( u(3,i,j,k-1)
+ > - 2.0d+00 * u(3,i,j,k)
+ > + u(3,i,j,k+1) )
+ rsd(4,i,j,k) = rsd(4,i,j,k)
+ > + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) )
+ > + dz4 * tz1 * ( u(4,i,j,k-1)
+ > - 2.0d+00 * u(4,i,j,k)
+ > + u(4,i,j,k+1) )
+ rsd(5,i,j,k) = rsd(5,i,j,k)
+ > + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) )
+ > + dz5 * tz1 * ( u(5,i,j,k-1)
+ > - 2.0d+00 * u(5,i,j,k)
+ > + u(5,i,j,k+1) )
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c fourth-order dissipation
+c---------------------------------------------------------------------
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ rsd(m,i,j,2) = rsd(m,i,j,2)
+ > - dssp * ( + 5.0d+00 * u(m,i,j,2)
+ > - 4.0d+00 * u(m,i,j,3)
+ > + u(m,i,j,4) )
+ rsd(m,i,j,3) = rsd(m,i,j,3)
+ > - dssp * ( - 4.0d+00 * u(m,i,j,2)
+ > + 6.0d+00 * u(m,i,j,3)
+ > - 4.0d+00 * u(m,i,j,4)
+ > + u(m,i,j,5) )
+ end do
+ end do
+ end do
+
+ do k = 4, nz - 3
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ rsd(m,i,j,k) = rsd(m,i,j,k)
+ > - dssp * ( u(m,i,j,k-2)
+ > - 4.0d+00 * u(m,i,j,k-1)
+ > + 6.0d+00 * u(m,i,j,k)
+ > - 4.0d+00 * u(m,i,j,k+1)
+ > + u(m,i,j,k+2) )
+ end do
+ end do
+ end do
+ end do
+
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ rsd(m,i,j,nz-2) = rsd(m,i,j,nz-2)
+ > - dssp * ( u(m,i,j,nz-4)
+ > - 4.0d+00 * u(m,i,j,nz-3)
+ > + 6.0d+00 * u(m,i,j,nz-2)
+ > - 4.0d+00 * u(m,i,j,nz-1) )
+ rsd(m,i,j,nz-1) = rsd(m,i,j,nz-1)
+ > - dssp * ( u(m,i,j,nz-3)
+ > - 4.0d+00 * u(m,i,j,nz-2)
+ > + 5.0d+00 * u(m,i,j,nz-1) )
+ end do
+ end do
+ end do
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine setbv
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c set the boundary values of dependent variables
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, k
+ integer iglob, jglob
+
+c---------------------------------------------------------------------
+c set the dependent variable values along the top and bottom faces
+c---------------------------------------------------------------------
+ do j = 1, ny
+ jglob = jpt + j
+ do i = 1, nx
+ iglob = ipt + i
+ call exact( iglob, jglob, 1, u( 1, i, j, 1 ) )
+ call exact( iglob, jglob, nz, u( 1, i, j, nz ) )
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c set the dependent variable values along north and south faces
+c---------------------------------------------------------------------
+ IF (west.eq.-1) then
+ do k = 1, nz
+ do i = 1, nx
+ iglob = ipt + i
+ call exact( iglob, 1, k, u( 1, i, 1, k ) )
+ end do
+ end do
+ END IF
+
+ IF (east.eq.-1) then
+ do k = 1, nz
+ do i = 1, nx
+ iglob = ipt + i
+ call exact( iglob, ny0, k, u( 1, i, ny, k ) )
+ end do
+ end do
+ END IF
+
+c---------------------------------------------------------------------
+c set the dependent variable values along east and west faces
+c---------------------------------------------------------------------
+ IF (north.eq.-1) then
+ do k = 1, nz
+ do j = 1, ny
+ jglob = jpt + j
+ call exact( 1, jglob, k, u( 1, 1, j, k ) )
+ end do
+ end do
+ END IF
+
+ IF (south.eq.-1) then
+ do k = 1, nz
+ do j = 1, ny
+ jglob = jpt + j
+ call exact( nx0, jglob, k, u( 1, nx, j, k ) )
+ end do
+ end do
+ END IF
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine setcoeff
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+c set up coefficients
+c---------------------------------------------------------------------
+ dxi = 1.0d+00 / ( nx0 - 1 )
+ deta = 1.0d+00 / ( ny0 - 1 )
+ dzeta = 1.0d+00 / ( nz0 - 1 )
+
+ tx1 = 1.0d+00 / ( dxi * dxi )
+ tx2 = 1.0d+00 / ( 2.0d+00 * dxi )
+ tx3 = 1.0d+00 / dxi
+
+ ty1 = 1.0d+00 / ( deta * deta )
+ ty2 = 1.0d+00 / ( 2.0d+00 * deta )
+ ty3 = 1.0d+00 / deta
+
+ tz1 = 1.0d+00 / ( dzeta * dzeta )
+ tz2 = 1.0d+00 / ( 2.0d+00 * dzeta )
+ tz3 = 1.0d+00 / dzeta
+
+ ii1 = 2
+ ii2 = nx0 - 1
+ ji1 = 2
+ ji2 = ny0 - 2
+ ki1 = 3
+ ki2 = nz0 - 1
+
+c---------------------------------------------------------------------
+c diffusion coefficients
+c---------------------------------------------------------------------
+ dx1 = 0.75d+00
+ dx2 = dx1
+ dx3 = dx1
+ dx4 = dx1
+ dx5 = dx1
+
+ dy1 = 0.75d+00
+ dy2 = dy1
+ dy3 = dy1
+ dy4 = dy1
+ dy5 = dy1
+
+ dz1 = 1.00d+00
+ dz2 = dz1
+ dz3 = dz1
+ dz4 = dz1
+ dz5 = dz1
+
+c---------------------------------------------------------------------
+c fourth difference dissipation
+c---------------------------------------------------------------------
+ dssp = ( max (dx1, dy1, dz1 ) ) / 4.0d+00
+
+c---------------------------------------------------------------------
+c coefficients of the exact solution to the first pde
+c---------------------------------------------------------------------
+ ce(1,1) = 2.0d+00
+ ce(1,2) = 0.0d+00
+ ce(1,3) = 0.0d+00
+ ce(1,4) = 4.0d+00
+ ce(1,5) = 5.0d+00
+ ce(1,6) = 3.0d+00
+ ce(1,7) = 5.0d-01
+ ce(1,8) = 2.0d-02
+ ce(1,9) = 1.0d-02
+ ce(1,10) = 3.0d-02
+ ce(1,11) = 5.0d-01
+ ce(1,12) = 4.0d-01
+ ce(1,13) = 3.0d-01
+
+c---------------------------------------------------------------------
+c coefficients of the exact solution to the second pde
+c---------------------------------------------------------------------
+ ce(2,1) = 1.0d+00
+ ce(2,2) = 0.0d+00
+ ce(2,3) = 0.0d+00
+ ce(2,4) = 0.0d+00
+ ce(2,5) = 1.0d+00
+ ce(2,6) = 2.0d+00
+ ce(2,7) = 3.0d+00
+ ce(2,8) = 1.0d-02
+ ce(2,9) = 3.0d-02
+ ce(2,10) = 2.0d-02
+ ce(2,11) = 4.0d-01
+ ce(2,12) = 3.0d-01
+ ce(2,13) = 5.0d-01
+
+c---------------------------------------------------------------------
+c coefficients of the exact solution to the third pde
+c---------------------------------------------------------------------
+ ce(3,1) = 2.0d+00
+ ce(3,2) = 2.0d+00
+ ce(3,3) = 0.0d+00
+ ce(3,4) = 0.0d+00
+ ce(3,5) = 0.0d+00
+ ce(3,6) = 2.0d+00
+ ce(3,7) = 3.0d+00
+ ce(3,8) = 4.0d-02
+ ce(3,9) = 3.0d-02
+ ce(3,10) = 5.0d-02
+ ce(3,11) = 3.0d-01
+ ce(3,12) = 5.0d-01
+ ce(3,13) = 4.0d-01
+
+c---------------------------------------------------------------------
+c coefficients of the exact solution to the fourth pde
+c---------------------------------------------------------------------
+ ce(4,1) = 2.0d+00
+ ce(4,2) = 2.0d+00
+ ce(4,3) = 0.0d+00
+ ce(4,4) = 0.0d+00
+ ce(4,5) = 0.0d+00
+ ce(4,6) = 2.0d+00
+ ce(4,7) = 3.0d+00
+ ce(4,8) = 3.0d-02
+ ce(4,9) = 5.0d-02
+ ce(4,10) = 4.0d-02
+ ce(4,11) = 2.0d-01
+ ce(4,12) = 1.0d-01
+ ce(4,13) = 3.0d-01
+
+c---------------------------------------------------------------------
+c coefficients of the exact solution to the fifth pde
+c---------------------------------------------------------------------
+ ce(5,1) = 5.0d+00
+ ce(5,2) = 4.0d+00
+ ce(5,3) = 3.0d+00
+ ce(5,4) = 2.0d+00
+ ce(5,5) = 1.0d-01
+ ce(5,6) = 4.0d-01
+ ce(5,7) = 3.0d-01
+ ce(5,8) = 5.0d-02
+ ce(5,9) = 4.0d-02
+ ce(5,10) = 3.0d-02
+ ce(5,11) = 1.0d-01
+ ce(5,12) = 3.0d-01
+ ce(5,13) = 2.0d-01
+
+ return
+ end
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine sethyper
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c for each column in a hyperplane, istart = first row,
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j
+ integer iglob, jglob
+ integer kp
+
+c---------------------------------------------------------------------
+c compute the pointers for hyperplanes
+c---------------------------------------------------------------------
+ do kp = 2,nx0+ny0
+ icomms(kp) = .false.
+ icommn(kp) = .false.
+ icomme(kp) = .false.
+ icommw(kp) = .false.
+
+c---------------------------------------------------------------------
+c check to see if comm. to south is required
+c---------------------------------------------------------------------
+ if (south.ne.-1) then
+ i = iend
+ iglob = ipt + i
+ jglob = kp - iglob
+ j = jglob - jpt
+ if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and.
+ > j.le.jend) icomms(kp) = .true.
+ end if
+
+c---------------------------------------------------------------------
+c check to see if comm. to north is required
+c---------------------------------------------------------------------
+ if (north.ne.-1) then
+ i = ist
+ iglob = ipt + i
+ jglob = kp - iglob
+ j = jglob - jpt
+ if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and.
+ > j.le.jend) icommn(kp) = .true.
+ end if
+
+c---------------------------------------------------------------------
+c check to see if comm. to east is required
+c---------------------------------------------------------------------
+ if (east.ne.-1) then
+ j = jend
+ jglob = jpt + j
+ iglob = kp - jglob
+ i = iglob - ipt
+ if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and.
+ > i.le.iend) icomme(kp) = .true.
+ end if
+
+c---------------------------------------------------------------------
+c check to see if comm. to west is required
+c---------------------------------------------------------------------
+ if (west.ne.-1) then
+ j = jst
+ jglob = jpt + j
+ iglob = kp - jglob
+ i = iglob - ipt
+ if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and.
+ > i.le.iend) icommw(kp) = .true.
+ end if
+
+ end do
+
+ icomms(1) = .false.
+ icommn(1) = .false.
+ icomme(1) = .false.
+ icommw(1) = .false.
+ icomms(nx0+ny0+1) = .false.
+ icommn(nx0+ny0+1) = .false.
+ icomme(nx0+ny0+1) = .false.
+ icommw(nx0+ny0+1) = .false.
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+ subroutine setiv
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c set the initial values of independent variables based on tri-linear
+c interpolation of boundary values in the computational space.
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, k, m
+ integer iglob, jglob
+ double precision xi, eta, zeta
+ double precision pxi, peta, pzeta
+ double precision ue_1jk(5),ue_nx0jk(5),ue_i1k(5),
+ > ue_iny0k(5),ue_ij1(5),ue_ijnz(5)
+
+
+ do k = 2, nz - 1
+ zeta = ( dble (k-1) ) / (nz-1)
+ do j = 1, ny
+ jglob = jpt + j
+ IF (jglob.ne.1.and.jglob.ne.ny0) then
+ eta = ( dble (jglob-1) ) / (ny0-1)
+ do i = 1, nx
+ iglob = ipt + i
+ IF (iglob.ne.1.and.iglob.ne.nx0) then
+ xi = ( dble (iglob-1) ) / (nx0-1)
+ call exact (1,jglob,k,ue_1jk)
+ call exact (nx0,jglob,k,ue_nx0jk)
+ call exact (iglob,1,k,ue_i1k)
+ call exact (iglob,ny0,k,ue_iny0k)
+ call exact (iglob,jglob,1,ue_ij1)
+ call exact (iglob,jglob,nz,ue_ijnz)
+ do m = 1, 5
+ pxi = ( 1.0d+00 - xi ) * ue_1jk(m)
+ > + xi * ue_nx0jk(m)
+ peta = ( 1.0d+00 - eta ) * ue_i1k(m)
+ > + eta * ue_iny0k(m)
+ pzeta = ( 1.0d+00 - zeta ) * ue_ij1(m)
+ > + zeta * ue_ijnz(m)
+
+ u( m, i, j, k ) = pxi + peta + pzeta
+ > - pxi * peta - peta * pzeta - pzeta * pxi
+ > + pxi * peta * pzeta
+
+ end do
+ END IF
+ end do
+ END IF
+ end do
+ end do
+
+ return
+ end
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine ssor(niter)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c to perform pseudo-time stepping SSOR iterations
+c for five nonlinear pde's.
+c---------------------------------------------------------------------
+
+ implicit none
+ integer niter
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer i, j, k, m
+ integer istep
+ double precision tmp
+ double precision delunm(5), tv(5,isiz1,isiz2)
+
+ external timer_read
+ double precision wtime, timer_read
+
+ integer IERROR
+
+
+ ROOT = 0
+
+c---------------------------------------------------------------------
+c begin pseudo-time stepping iterations
+c---------------------------------------------------------------------
+ tmp = 1.0d+00 / ( omega * ( 2.0d+00 - omega ) )
+
+c---------------------------------------------------------------------
+c initialize a,b,c,d to zero (guarantees that page tables have been
+c formed, if applicable on given architecture, before timestepping).
+c---------------------------------------------------------------------
+ do m=1,isiz2
+ do k=1,isiz1
+ do j=1,5
+ do i=1,5
+ a(i,j,k,m) = 0.d0
+ b(i,j,k,m) = 0.d0
+ c(i,j,k,m) = 0.d0
+ d(i,j,k,m) = 0.d0
+ enddo
+ enddo
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c compute the steady-state residuals
+c---------------------------------------------------------------------
+ call rhs
+
+c---------------------------------------------------------------------
+c compute the L2 norms of newton iteration residuals
+c---------------------------------------------------------------------
+ call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
+ > ist, iend, jst, jend,
+ > rsd, rsdnm )
+
+ call MPI_BARRIER( MPI_COMM_WORLD, IERROR )
+
+ call timer_clear(1)
+ call timer_start(1)
+
+c---------------------------------------------------------------------
+c the timestep loop
+c---------------------------------------------------------------------
+ do istep = 1, niter
+
+ if (id .eq. 0) then
+ if (mod ( istep, 20) .eq. 0 .or.
+ > istep .eq. itmax .or.
+ > istep .eq. 1) then
+ if (niter .gt. 1) write( *, 200) istep
+ 200 format(' Time step ', i4)
+ endif
+ endif
+
+c---------------------------------------------------------------------
+c perform SSOR iteration
+c---------------------------------------------------------------------
+ do k = 2, nz - 1
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ rsd(m,i,j,k) = dt * rsd(m,i,j,k)
+ end do
+ end do
+ end do
+ end do
+
+ DO k = 2, nz -1
+c---------------------------------------------------------------------
+c form the lower triangular part of the jacobian matrix
+c---------------------------------------------------------------------
+ call jacld(k)
+
+c---------------------------------------------------------------------
+c perform the lower triangular solution
+c---------------------------------------------------------------------
+ call blts( isiz1, isiz2, isiz3,
+ > nx, ny, nz, k,
+ > omega,
+ > rsd,
+ > a, b, c, d,
+ > ist, iend, jst, jend,
+ > nx0, ny0, ipt, jpt)
+ END DO
+
+ DO k = nz - 1, 2, -1
+c---------------------------------------------------------------------
+c form the strictly upper triangular part of the jacobian matrix
+c---------------------------------------------------------------------
+ call jacu(k)
+
+c---------------------------------------------------------------------
+c perform the upper triangular solution
+c---------------------------------------------------------------------
+ call buts( isiz1, isiz2, isiz3,
+ > nx, ny, nz, k,
+ > omega,
+ > rsd, tv,
+ > d, a, b, c,
+ > ist, iend, jst, jend,
+ > nx0, ny0, ipt, jpt)
+ END DO
+
+c---------------------------------------------------------------------
+c update the variables
+c---------------------------------------------------------------------
+
+ do k = 2, nz-1
+ do j = jst, jend
+ do i = ist, iend
+ do m = 1, 5
+ u( m, i, j, k ) = u( m, i, j, k )
+ > + tmp * rsd( m, i, j, k )
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c compute the max-norms of newton iteration corrections
+c---------------------------------------------------------------------
+ if ( mod ( istep, inorm ) .eq. 0 ) then
+ call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
+ > ist, iend, jst, jend,
+ > rsd, delunm )
+c if ( ipr .eq. 1 .and. id .eq. 0 ) then
+c write (*,1006) ( delunm(m), m = 1, 5 )
+c else if ( ipr .eq. 2 .and. id .eq. 0 ) then
+c write (*,'(i5,f15.6)') istep,delunm(5)
+c end if
+ end if
+
+c---------------------------------------------------------------------
+c compute the steady-state residuals
+c---------------------------------------------------------------------
+ call rhs
+
+c---------------------------------------------------------------------
+c compute the max-norms of newton iteration residuals
+c---------------------------------------------------------------------
+ if ( ( mod ( istep, inorm ) .eq. 0 ) .or.
+ > ( istep .eq. itmax ) ) then
+ call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
+ > ist, iend, jst, jend,
+ > rsd, rsdnm )
+c if ( ipr .eq. 1.and.id.eq.0 ) then
+c write (*,1007) ( rsdnm(m), m = 1, 5 )
+c end if
+ end if
+
+c---------------------------------------------------------------------
+c check the newton-iteration residuals against the tolerance levels
+c---------------------------------------------------------------------
+ if ( ( rsdnm(1) .lt. tolrsd(1) ) .and.
+ > ( rsdnm(2) .lt. tolrsd(2) ) .and.
+ > ( rsdnm(3) .lt. tolrsd(3) ) .and.
+ > ( rsdnm(4) .lt. tolrsd(4) ) .and.
+ > ( rsdnm(5) .lt. tolrsd(5) ) ) then
+c if (ipr .eq. 1 .and. id.eq.0) then
+c write (*,1004) istep
+c end if
+ return
+ end if
+
+ end do
+
+ call timer_stop(1)
+ wtime = timer_read(1)
+
+
+ call MPI_ALLREDUCE( wtime,
+ > maxtime,
+ > 1,
+ > MPI_DOUBLE_PRECISION,
+ > MPI_MAX,
+ > MPI_COMM_WORLD,
+ > IERROR )
+
+
+
+ return
+
+ 1001 format (1x/5x,'pseudo-time SSOR iteration no.=',i4/)
+ 1004 format (1x/1x,'convergence was achieved after ',i4,
+ > ' pseudo-time steps' )
+ 1006 format (1x/1x,'RMS-norm of SSOR-iteration correction ',
+ > 'for first pde = ',1pe12.5/,
+ > 1x,'RMS-norm of SSOR-iteration correction ',
+ > 'for second pde = ',1pe12.5/,
+ > 1x,'RMS-norm of SSOR-iteration correction ',
+ > 'for third pde = ',1pe12.5/,
+ > 1x,'RMS-norm of SSOR-iteration correction ',
+ > 'for fourth pde = ',1pe12.5/,
+ > 1x,'RMS-norm of SSOR-iteration correction ',
+ > 'for fifth pde = ',1pe12.5)
+ 1007 format (1x/1x,'RMS-norm of steady-state residual for ',
+ > 'first pde = ',1pe12.5/,
+ > 1x,'RMS-norm of steady-state residual for ',
+ > 'second pde = ',1pe12.5/,
+ > 1x,'RMS-norm of steady-state residual for ',
+ > 'third pde = ',1pe12.5/,
+ > 1x,'RMS-norm of steady-state residual for ',
+ > 'fourth pde = ',1pe12.5/,
+ > 1x,'RMS-norm of steady-state residual for ',
+ > 'fifth pde = ',1pe12.5)
+
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine subdomain
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+c---------------------------------------------------------------------
+c local variables
+c---------------------------------------------------------------------
+ integer mm, ierror, errorcode
+
+
+c---------------------------------------------------------------------
+c
+c set up the sub-domain sizes
+c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c x dimension
+c---------------------------------------------------------------------
+ mm = mod(nx0,xdim)
+ if (row.le.mm) then
+ nx = nx0/xdim + 1
+ ipt = (row-1)*nx
+ else
+ nx = nx0/xdim
+ ipt = (row-1)*nx + mm
+ end if
+
+c---------------------------------------------------------------------
+c y dimension
+c---------------------------------------------------------------------
+ mm = mod(ny0,ydim)
+ if (col.le.mm) then
+ ny = ny0/ydim + 1
+ jpt = (col-1)*ny
+ else
+ ny = ny0/ydim
+ jpt = (col-1)*ny + mm
+ end if
+
+c---------------------------------------------------------------------
+c z dimension
+c---------------------------------------------------------------------
+ nz = nz0
+
+c---------------------------------------------------------------------
+c check the sub-domain size
+c---------------------------------------------------------------------
+ if ( ( nx .lt. 4 ) .or.
+ > ( ny .lt. 4 ) .or.
+ > ( nz .lt. 4 ) ) then
+ write (*,2001) nx, ny, nz
+ 2001 format (5x,'SUBDOMAIN SIZE IS TOO SMALL - ',
+ > /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS',
+ > /5x,'SO THAT NX, NY AND NZ ARE GREATER THAN OR EQUAL',
+ > /5x,'TO 4 THEY ARE CURRENTLY', 3I3)
+ CALL MPI_ABORT( MPI_COMM_WORLD,
+ > ERRORCODE,
+ > IERROR )
+ end if
+
+ if ( ( nx .gt. isiz1 ) .or.
+ > ( ny .gt. isiz2 ) .or.
+ > ( nz .gt. isiz3 ) ) then
+ write (*,2002) nx, ny, nz
+ 2002 format (5x,'SUBDOMAIN SIZE IS TOO LARGE - ',
+ > /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS',
+ > /5x,'SO THAT NX, NY AND NZ ARE LESS THAN OR EQUAL TO ',
+ > /5x,'ISIZ1, ISIZ2 AND ISIZ3 RESPECTIVELY. THEY ARE',
+ > /5x,'CURRENTLY', 3I4)
+ CALL MPI_ABORT( MPI_COMM_WORLD,
+ > ERRORCODE,
+ > IERROR )
+ end if
+
+
+c---------------------------------------------------------------------
+c set up the start and end in i and j extents for all processors
+c---------------------------------------------------------------------
+ ist = 1
+ iend = nx
+ if (north.eq.-1) ist = 2
+ if (south.eq.-1) iend = nx - 1
+
+ jst = 1
+ jend = ny
+ if (west.eq.-1) jst = 2
+ if (east.eq.-1) jend = ny - 1
+
+ return
+ end
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine verify(xcr, xce, xci, class, verified)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c verification routine
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'mpinpb.h'
+ include 'applu.incl'
+
+ double precision xcr(5), xce(5), xci
+ double precision xcrref(5),xceref(5),xciref,
+ > xcrdif(5),xcedif(5),xcidif,
+ > epsilon, dtref
+ integer m
+ character class
+ logical verified
+
+c---------------------------------------------------------------------
+c tolerance level
+c---------------------------------------------------------------------
+ epsilon = 1.0d-08
+
+ class = 'U'
+ verified = .true.
+
+ do m = 1,5
+ xcrref(m) = 1.0
+ xceref(m) = 1.0
+ end do
+ xciref = 1.0
+
+ if ( (nx0 .eq. 12 ) .and.
+ > (ny0 .eq. 12 ) .and.
+ > (nz0 .eq. 12 ) .and.
+ > (itmax .eq. 50 )) then
+
+ class = 'S'
+ dtref = 5.0d-1
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual, for the (12X12X12) grid,
+c after 50 time steps, with DT = 5.0d-01
+c---------------------------------------------------------------------
+ xcrref(1) = 1.6196343210976702d-02
+ xcrref(2) = 2.1976745164821318d-03
+ xcrref(3) = 1.5179927653399185d-03
+ xcrref(4) = 1.5029584435994323d-03
+ xcrref(5) = 3.4264073155896461d-02
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error, for the (12X12X12) grid,
+c after 50 time steps, with DT = 5.0d-01
+c---------------------------------------------------------------------
+ xceref(1) = 6.4223319957960924d-04
+ xceref(2) = 8.4144342047347926d-05
+ xceref(3) = 5.8588269616485186d-05
+ xceref(4) = 5.8474222595157350d-05
+ xceref(5) = 1.3103347914111294d-03
+
+c---------------------------------------------------------------------
+c Reference value of surface integral, for the (12X12X12) grid,
+c after 50 time steps, with DT = 5.0d-01
+c---------------------------------------------------------------------
+ xciref = 7.8418928865937083d+00
+
+
+ elseif ( (nx0 .eq. 33) .and.
+ > (ny0 .eq. 33) .and.
+ > (nz0 .eq. 33) .and.
+ > (itmax . eq. 300) ) then
+
+ class = 'W' !SPEC95fp size
+ dtref = 1.5d-3
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual, for the (33x33x33) grid,
+c after 300 time steps, with DT = 1.5d-3
+c---------------------------------------------------------------------
+ xcrref(1) = 0.1236511638192d+02
+ xcrref(2) = 0.1317228477799d+01
+ xcrref(3) = 0.2550120713095d+01
+ xcrref(4) = 0.2326187750252d+01
+ xcrref(5) = 0.2826799444189d+02
+
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error, for the (33X33X33) grid,
+c---------------------------------------------------------------------
+ xceref(1) = 0.4867877144216d+00
+ xceref(2) = 0.5064652880982d-01
+ xceref(3) = 0.9281818101960d-01
+ xceref(4) = 0.8570126542733d-01
+ xceref(5) = 0.1084277417792d+01
+
+
+c---------------------------------------------------------------------
+c Reference value of surface integral, for the (33X33X33) grid,
+c after 300 time steps, with DT = 1.5d-3
+c---------------------------------------------------------------------
+ xciref = 0.1161399311023d+02
+
+ elseif ( (nx0 .eq. 64) .and.
+ > (ny0 .eq. 64) .and.
+ > (nz0 .eq. 64) .and.
+ > (itmax . eq. 250) ) then
+
+ class = 'A'
+ dtref = 2.0d+0
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual, for the (64X64X64) grid,
+c after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+ xcrref(1) = 7.7902107606689367d+02
+ xcrref(2) = 6.3402765259692870d+01
+ xcrref(3) = 1.9499249727292479d+02
+ xcrref(4) = 1.7845301160418537d+02
+ xcrref(5) = 1.8384760349464247d+03
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error, for the (64X64X64) grid,
+c after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+ xceref(1) = 2.9964085685471943d+01
+ xceref(2) = 2.8194576365003349d+00
+ xceref(3) = 7.3473412698774742d+00
+ xceref(4) = 6.7139225687777051d+00
+ xceref(5) = 7.0715315688392578d+01
+
+c---------------------------------------------------------------------
+c Reference value of surface integral, for the (64X64X64) grid,
+c after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+ xciref = 2.6030925604886277d+01
+
+
+ elseif ( (nx0 .eq. 102) .and.
+ > (ny0 .eq. 102) .and.
+ > (nz0 .eq. 102) .and.
+ > (itmax . eq. 250) ) then
+
+ class = 'B'
+ dtref = 2.0d+0
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual, for the (102X102X102) grid,
+c after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+ xcrref(1) = 3.5532672969982736d+03
+ xcrref(2) = 2.6214750795310692d+02
+ xcrref(3) = 8.8333721850952190d+02
+ xcrref(4) = 7.7812774739425265d+02
+ xcrref(5) = 7.3087969592545314d+03
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error, for the (102X102X102)
+c grid, after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+ xceref(1) = 1.1401176380212709d+02
+ xceref(2) = 8.1098963655421574d+00
+ xceref(3) = 2.8480597317698308d+01
+ xceref(4) = 2.5905394567832939d+01
+ xceref(5) = 2.6054907504857413d+02
+
+c---------------------------------------------------------------------
+c Reference value of surface integral, for the (102X102X102) grid,
+c after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+ xciref = 4.7887162703308227d+01
+
+ elseif ( (nx0 .eq. 162) .and.
+ > (ny0 .eq. 162) .and.
+ > (nz0 .eq. 162) .and.
+ > (itmax . eq. 250) ) then
+
+ class = 'C'
+ dtref = 2.0d+0
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual, for the (162X162X162) grid,
+c after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+ xcrref(1) = 1.03766980323537846d+04
+ xcrref(2) = 8.92212458801008552d+02
+ xcrref(3) = 2.56238814582660871d+03
+ xcrref(4) = 2.19194343857831427d+03
+ xcrref(5) = 1.78078057261061185d+04
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error, for the (162X162X162)
+c grid, after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+ xceref(1) = 2.15986399716949279d+02
+ xceref(2) = 1.55789559239863600d+01
+ xceref(3) = 5.41318863077207766d+01
+ xceref(4) = 4.82262643154045421d+01
+ xceref(5) = 4.55902910043250358d+02
+
+c---------------------------------------------------------------------
+c Reference value of surface integral, for the (162X162X162) grid,
+c after 250 time steps, with DT = 2.0d+00
+c---------------------------------------------------------------------
+ xciref = 6.66404553572181300d+01
+
+ elseif ( (nx0 .eq. 408) .and.
+ > (ny0 .eq. 408) .and.
+ > (nz0 .eq. 408) .and.
+ > (itmax . eq. 300) ) then
+
+ class = 'D'
+ dtref = 1.0d+0
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual, for the (408X408X408) grid,
+c after 300 time steps, with DT = 1.0d+00
+c---------------------------------------------------------------------
+ xcrref(1) = 0.4868417937025d+05
+ xcrref(2) = 0.4696371050071d+04
+ xcrref(3) = 0.1218114549776d+05
+ xcrref(4) = 0.1033801493461d+05
+ xcrref(5) = 0.7142398413817d+05
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error, for the (408X408X408)
+c grid, after 300 time steps, with DT = 1.0d+00
+c---------------------------------------------------------------------
+ xceref(1) = 0.3752393004482d+03
+ xceref(2) = 0.3084128893659d+02
+ xceref(3) = 0.9434276905469d+02
+ xceref(4) = 0.8230686681928d+02
+ xceref(5) = 0.7002620636210d+03
+
+c---------------------------------------------------------------------
+c Reference value of surface integral, for the (408X408X408) grid,
+c after 300 time steps, with DT = 1.0d+00
+c---------------------------------------------------------------------
+ xciref = 0.8334101392503d+02
+
+ elseif ( (nx0 .eq. 1020) .and.
+ > (ny0 .eq. 1020) .and.
+ > (nz0 .eq. 1020) .and.
+ > (itmax . eq. 300) ) then
+
+ class = 'E'
+ dtref = 0.5d+0
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual, for the (1020X1020X1020) grid,
+c after 300 time steps, with DT = 0.5d+00
+c---------------------------------------------------------------------
+ xcrref(1) = 0.2099641687874d+06
+ xcrref(2) = 0.2130403143165d+05
+ xcrref(3) = 0.5319228789371d+05
+ xcrref(4) = 0.4509761639833d+05
+ xcrref(5) = 0.2932360006590d+06
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error, for the (1020X1020X1020)
+c grid, after 300 time steps, with DT = 0.5d+00
+c---------------------------------------------------------------------
+ xceref(1) = 0.4800572578333d+03
+ xceref(2) = 0.4221993400184d+02
+ xceref(3) = 0.1210851906824d+03
+ xceref(4) = 0.1047888986770d+03
+ xceref(5) = 0.8363028257389d+03
+
+c---------------------------------------------------------------------
+c Reference value of surface integral, for the (1020X1020X1020) grid,
+c after 300 time steps, with DT = 0.5d+00
+c---------------------------------------------------------------------
+ xciref = 0.9512163272273d+02
+
+ else
+ verified = .FALSE.
+ endif
+
+c---------------------------------------------------------------------
+c verification test for residuals if gridsize is one of
+c the defined grid sizes above (class .ne. 'U')
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Compute the difference of solution values and the known reference values.
+c---------------------------------------------------------------------
+ do m = 1, 5
+
+ xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m))
+ xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
+
+ enddo
+ xcidif = dabs((xci - xciref)/xciref)
+
+
+c---------------------------------------------------------------------
+c Output the comparison of computed results to known cases.
+c---------------------------------------------------------------------
+
+ if (class .ne. 'U') then
+ write(*, 1990) class
+ 1990 format(/, ' Verification being performed for class ', a)
+ write (*,2000) epsilon
+ 2000 format(' Accuracy setting for epsilon = ', E20.13)
+ verified = (dabs(dt-dtref) .le. epsilon)
+ if (.not.verified) then
+ class = 'U'
+ write (*,1000) dtref
+ 1000 format(' DT does not match the reference value of ',
+ > E15.8)
+ endif
+ else
+ write(*, 1995)
+ 1995 format(' Unknown class')
+ endif
+
+
+ if (class .ne. 'U') then
+ write (*,2001)
+ else
+ write (*, 2005)
+ endif
+
+ 2001 format(' Comparison of RMS-norms of residual')
+ 2005 format(' RMS-norms of residual')
+ do m = 1, 5
+ if (class .eq. 'U') then
+ write(*, 2015) m, xcr(m)
+ else if (xcrdif(m) .le. epsilon) then
+ write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
+ else
+ verified = .false.
+ write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
+ endif
+ enddo
+
+ if (class .ne. 'U') then
+ write (*,2002)
+ else
+ write (*,2006)
+ endif
+ 2002 format(' Comparison of RMS-norms of solution error')
+ 2006 format(' RMS-norms of solution error')
+
+ do m = 1, 5
+ if (class .eq. 'U') then
+ write(*, 2015) m, xce(m)
+ else if (xcedif(m) .le. epsilon) then
+ write (*,2011) m,xce(m),xceref(m),xcedif(m)
+ else
+ verified = .false.
+ write (*,2010) m,xce(m),xceref(m),xcedif(m)
+ endif
+ enddo
+
+ 2010 format(' FAILURE: ', i2, 2x, E20.13, E20.13, E20.13)
+ 2011 format(' ', i2, 2x, E20.13, E20.13, E20.13)
+ 2015 format(' ', i2, 2x, E20.13)
+
+ if (class .ne. 'U') then
+ write (*,2025)
+ else
+ write (*,2026)
+ endif
+ 2025 format(' Comparison of surface integral')
+ 2026 format(' Surface integral')
+
+
+ if (class .eq. 'U') then
+ write(*, 2030) xci
+ else if (xcidif .le. epsilon) then
+ write(*, 2032) xci, xciref, xcidif
+ else
+ verified = .false.
+ write(*, 2031) xci, xciref, xcidif
+ endif
+
+ 2030 format(' ', 4x, E20.13)
+ 2031 format(' FAILURE: ', 4x, E20.13, E20.13, E20.13)
+ 2032 format(' ', 4x, E20.13, E20.13, E20.13)
+
+
+
+ if (class .eq. 'U') then
+ write(*, 2022)
+ write(*, 2023)
+ 2022 format(' No reference values provided')
+ 2023 format(' No verification performed')
+ else if (verified) then
+ write(*, 2020)
+ 2020 format(' Verification Successful')
+ else
+ write(*, 2021)
+ 2021 format(' Verification failed')
+ endif
+
+ return
+
+
+ end
--- /dev/null
+SHELL=/bin/sh
+BENCHMARK=mg
+BENCHMARKU=MG
+
+include ../config/make.def
+
+OBJS = mg.o ${COMMON}/print_results.o \
+ ${COMMON}/${RAND}.o ${COMMON}/timers.o
+
+include ../sys/make.common
+
+${PROGRAM}: config ${OBJS}
+ ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+
+mg.o: mg.f globals.h mpinpb.h npbparams.h
+ ${FCOMPILE} mg.f
+
+clean:
+ - rm -f *.o *~
+ - rm -f npbparams.h core
+
+
+
--- /dev/null
+Some info about the MG benchmark
+================================
+
+'mg_demo' demonstrates the capabilities of a very simple multigrid
+solver in computing a three dimensional potential field. This is
+a simplified multigrid solver in two important respects:
+
+ (1) it solves only a constant coefficient equation,
+ and that only on a uniform cubical grid,
+
+ (2) it solves only a single equation, representing
+ a scalar field rather than a vector field.
+
+We chose it for its portability and simplicity, and expect that a
+supercomputer which can run it effectively will also be able to
+run more complex multigrid programs at least as well.
+
+ Eric Barszcz Paul Frederickson
+ RIACS
+ NASA Ames Research Center NASA Ames Research Center
+
+========================================================================
+Running the program: (Note: also see parameter lm information in the
+ two sections immediately below this section)
+
+The program may be run with or without an input deck (called "mg.input").
+The following describes a few things about the input deck if you want to
+use one.
+
+The four lines below are the "mg.input" file required to run a
+problem of total size 256x256x256, for 4 iterations (Class "A"),
+and presumes the use of 8 processors:
+
+ 8 = top level
+ 256 256 256 = nx ny nz
+ 4 = nit
+ 0 0 0 0 0 0 0 0 = debug_vec
+
+The first line of input indicates how many levels of multi-grid
+cycle will be applied to a particular subpartition. Presuming that
+8 processors are solving this problem (recall that the number of
+processors is specified to MPI as a run parameter, and MPI subsequently
+determines this for the code via an MPI subroutine call), a 2x2x2
+processor grid is formed, and thus each partition on a processor is
+of size 128x128x128. Therefore, a maximum of 8 multi-grid levels may
+be used. These are of size 128,64,32,16,8,4,2,1, with the coarsest
+level being a single point on a given processor.
+
+
+Next, consider the same size problem but running on 1 processor. The
+following "mg.input" file is appropriate:
+
+ 9 = top level
+ 256 256 256 = nx ny nz
+ 4 = nit
+ 0 0 0 0 0 0 0 0 = debug_vec
+
+Since this processor must solve the full 256x256x256 problem, this
+permits 9 multi-grid levels (256,128,64,32,16,8,4,2,1), resulting in
+a coarsest multi-grid level of a single point on the processor
+
+
+Next, consider the same size problem but running on 2 processors. The
+following "mg.input" file is required:
+
+ 8 = top level
+ 256 256 256 = nx ny nz
+ 4 = nit
+ 0 0 0 0 0 0 0 0 = debug_vec
+
+The algorithm for partitioning the full grid onto some power of 2 number
+of processors is to start by splitting the last dimension of the grid
+(z dimension) in 2: the problem is now partitioned onto 2 processors.
+Next the middle dimension (y dimension) is split in 2: the problem is now
+partitioned onto 4 processors. Next, first dimension (x dimension) is
+split in 2: the problem is now partitioned onto 8 processors. Next, the
+last dimension (z dimension) is split again in 2: the problem is now
+partitioned onto 16 processors. This partitioning is repeated until all
+of the power of 2 processors have been allocated.
+
+Thus to run the above problem on 2 processors, the grid partitioning
+algorithm will allocate the two processors across the last dimension,
+creating two partitions each of size 256x256x128. The coarsest level of
+multi-grid must be a single point surrounded by a cubic number of grid
+points. Therefore, each of the two processor partitions will contain 4
+coarsest multi-grid level points, each surrounded by a cube of grid points
+of size 128x128x128, indicated by a top level of 8.
+
+
+Next, consider the same size problem but running on 4 processors. The
+following "mg.input" file is required:
+
+ 8 = top level
+ 256 256 256 = nx ny nz
+ 4 = nit
+ 0 0 0 0 0 0 0 0 = debug_vec
+
+The partitioning algorithm will create 4 partitions, each of size
+256x128x128. Each partition will contain 2 coarsest multi-grid level
+points each surrounded by a cube of grid points of size 128x128x128,
+indicated by a top level of 8.
+
+
+Next, consider the same size problem but running on 16 processors. The
+following "mg.input" file is required:
+
+ 7 = top level
+ 256 256 256 = nx ny nz
+ 4 = nit
+ 0 0 0 0 0 0 0 0 = debug_vec
+
+On each node a partition of size 128x128x64 will be created. A maximum
+of 7 multi-grid levels (64,32,16,8,4,2,1) may be used, resulting in each
+partions containing 4 coarsest multi-grid level points, each surrounded
+by a cube of grid points of size 64x64x64, indicated by a top level of 7.
+
+
+
+
+Note that non-cubic problem sizes may also be considered:
+
+The four lines below are the "mg.input" file appropriate for running a
+problem of total size 256x512x512, for 20 iterations and presumes the
+use of 32 processors (note: this is NOT a class C problem):
+
+ 8 = top level
+ 256 512 512 = nx ny nz
+ 20 = nit
+ 0 0 0 0 0 0 0 0 = debug_vec
+
+The first line of input indicates how many levels of multi-grid
+cycle will be applied to a particular subpartition. Presuming that
+32 processors are solving this problem, a 2x4x4 processor grid is
+formed, and thus each partition on a processor is of size 128x128x128.
+Therefore, a maximum of 8 multi-grid levels may be used. These are of
+size 128,64,32,16,8,4,2,1, with the coarsest level being a single
+point on a given processor.
+
--- /dev/null
+c---------------------------------------------------------------------
+c Parameter lm (declared and set in "npbparams.h") is the log-base2 of
+c the edge size max for the partition on a given node, so must be changed
+c either to save space (if running a small case) or made bigger for larger
+c cases, for example, 512^3. Thus lm=7 means that the largest dimension
+c of a partition that can be solved on a node is 2^7 = 128. lm is set
+c automatically in npbparams.h
+c Parameters ndim1, ndim2, ndim3 are the local problem dimensions.
+c---------------------------------------------------------------------
+
+ include 'npbparams.h'
+
+ integer nm ! actual dimension including ghost cells for communications
+ > , nv ! size of rhs array
+ > , nr ! size of residual array
+ > , nm2 ! size of communication buffer
+ > , maxlevel! maximum number of levels
+
+ parameter( nm=2+2**lm, nv=(2+2**ndim1)*(2+2**ndim2)*(2+2**ndim3) )
+ parameter( nm2=2*nm*nm, maxlevel=(lt_default+1) )
+ parameter( nr = (8*(nv+nm**2+5*nm+14*lt_default-7*lm))/7 )
+ integer maxprocs
+ parameter( maxprocs = 131072 ) ! this is the upper proc limit that
+ ! the current "nr" parameter can handle
+c---------------------------------------------------------------------
+ integer nbr(3,-1:1,maxlevel), msg_type(3,-1:1)
+ integer msg_id(3,-1:1,2),nx(maxlevel),ny(maxlevel),nz(maxlevel)
+ common /mg3/ nbr,msg_type,msg_id,nx,ny,nz
+
+ character class
+ common /ClassType/class
+
+ integer debug_vec(0:7)
+ common /my_debug/ debug_vec
+
+ integer ir(maxlevel), m1(maxlevel), m2(maxlevel), m3(maxlevel)
+ integer lt, lb
+ common /fap/ ir,m1,m2,m3,lt,lb
+
+ logical dead(maxlevel), give_ex(3,maxlevel), take_ex(3,maxlevel)
+ common /comm_ex/ dead, give_ex, take_ex
+
+c---------------------------------------------------------------------
+c Set at m=1024, can handle cases up to 1024^3 case
+c---------------------------------------------------------------------
+ integer m
+c parameter( m=1037 )
+ parameter( m=nm+1 )
+
+ double precision buff(nm2,4)
+ common /buffer/ buff
+
+
+
+
--- /dev/null
+!-------------------------------------------------------------------------!
+! !
+! N A S P A R A L L E L B E N C H M A R K S 3.3 !
+! !
+! M G !
+! !
+!-------------------------------------------------------------------------!
+! !
+! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
+! It is described in NAS Technical Reports 95-020 and 02-007 !
+! !
+! Permission to use, copy, distribute and modify this software !
+! for any purpose with or without fee is hereby granted. We !
+! request, however, that all derived work reference the NAS !
+! Parallel Benchmarks 3.3. This software is provided "as is" !
+! without express or implied warranty. !
+! !
+! Information on NPB 3.3, including the technical report, the !
+! original specifications, source code, results and information !
+! on how to submit new results, is available at: !
+! !
+! http://www.nas.nasa.gov/Software/NPB/ !
+! !
+! Send comments or suggestions to npb@nas.nasa.gov !
+! !
+! NAS Parallel Benchmarks Group !
+! NASA Ames Research Center !
+! Mail Stop: T27A-1 !
+! Moffett Field, CA 94035-1000 !
+! !
+! E-mail: npb@nas.nasa.gov !
+! Fax: (650) 604-3957 !
+! !
+!-------------------------------------------------------------------------!
+
+
+c---------------------------------------------------------------------
+c
+c Authors: E. Barszcz
+c P. Frederickson
+c A. Woo
+c M. Yarrow
+c R. F. Van der Wijngaart
+c
+c---------------------------------------------------------------------
+
+
+c---------------------------------------------------------------------
+ program mg_mpi
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+c---------------------------------------------------------------------------c
+c k is the current level. It is passed down through subroutine args
+c and is NOT global. it is the current iteration
+c---------------------------------------------------------------------------c
+
+ integer k, it
+
+ external timer_read
+ double precision t, t0, tinit, mflops, timer_read
+
+c---------------------------------------------------------------------------c
+c These arrays are in common because they are quite large
+c and probably shouldn't be allocated on the stack. They
+c are always passed as subroutine args.
+c---------------------------------------------------------------------------c
+
+ double precision u(nr),v(nv),r(nr),a(0:3),c(0:3)
+ common /noautom/ u,v,r
+
+ double precision rnm2, rnmu, old2, oldu, epsilon
+ integer n1, n2, n3, nit
+ double precision nn, verify_value, err
+ logical verified
+
+ integer ierr,i, fstatus
+ integer T_bench, T_init
+ parameter (T_bench=1, T_init=2)
+
+ call mpi_init(ierr)
+ call mpi_comm_rank(mpi_comm_world, me, ierr)
+ call mpi_comm_size(mpi_comm_world, nprocs, ierr)
+
+ root = 0
+ if (nprocs_compiled .gt. maxprocs) then
+ if (me .eq. root) write(*,20) nprocs_compiled, maxprocs
+ 20 format(' ERROR: compiled for ',i8,' processes'//
+ & ' The maximum size allowed for this benchmark is ',i6)
+ call mpi_abort(MPI_COMM_WORLD, ierr)
+ stop
+ endif
+
+ if (.not. convertdouble) then
+ dp_type = MPI_DOUBLE_PRECISION
+ else
+ dp_type = MPI_REAL
+ endif
+
+
+ call timer_clear(T_bench)
+ call timer_clear(T_init)
+
+ call mpi_barrier(MPI_COMM_WORLD, ierr)
+
+ call timer_start(T_init)
+
+
+c---------------------------------------------------------------------
+c Read in and broadcast input data
+c---------------------------------------------------------------------
+
+ if( me .eq. root )then
+ write (*, 1000)
+
+ open(unit=7,file="mg.input", status="old", iostat=fstatus)
+ if (fstatus .eq. 0) then
+ write(*,50)
+ 50 format(' Reading from input file mg.input')
+ read(7,*) lt
+ read(7,*) nx(lt), ny(lt), nz(lt)
+ read(7,*) nit
+ read(7,*) (debug_vec(i),i=0,7)
+ else
+ write(*,51)
+ 51 format(' No input file. Using compiled defaults ')
+ lt = lt_default
+ nit = nit_default
+ nx(lt) = nx_default
+ ny(lt) = ny_default
+ nz(lt) = nz_default
+ do i = 0,7
+ debug_vec(i) = debug_default
+ end do
+ endif
+ endif
+
+ call mpi_bcast(lt, 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
+ call mpi_bcast(nit, 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
+ call mpi_bcast(nx(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
+ call mpi_bcast(ny(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
+ call mpi_bcast(nz(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
+ call mpi_bcast(debug_vec(0), 8, MPI_INTEGER, 0,
+ > mpi_comm_world, ierr)
+
+ if ( (nx(lt) .ne. ny(lt)) .or. (nx(lt) .ne. nz(lt)) ) then
+ Class = 'U'
+ else if( nx(lt) .eq. 32 .and. nit .eq. 4 ) then
+ Class = 'S'
+ else if( nx(lt) .eq. 128 .and. nit .eq. 4 ) then
+ Class = 'W'
+ else if( nx(lt) .eq. 256 .and. nit .eq. 4 ) then
+ Class = 'A'
+ else if( nx(lt) .eq. 256 .and. nit .eq. 20 ) then
+ Class = 'B'
+ else if( nx(lt) .eq. 512 .and. nit .eq. 20 ) then
+ Class = 'C'
+ else if( nx(lt) .eq. 1024 .and. nit .eq. 50 ) then
+ Class = 'D'
+ else if( nx(lt) .eq. 2048 .and. nit .eq. 50 ) then
+ Class = 'E'
+ else
+ Class = 'U'
+ endif
+
+c---------------------------------------------------------------------
+c Use these for debug info:
+c---------------------------------------------------------------------
+c debug_vec(0) = 1 !=> report all norms
+c debug_vec(1) = 1 !=> some setup information
+c debug_vec(1) = 2 !=> more setup information
+c debug_vec(2) = k => at level k or below, show result of resid
+c debug_vec(3) = k => at level k or below, show result of psinv
+c debug_vec(4) = k => at level k or below, show result of rprj
+c debug_vec(5) = k => at level k or below, show result of interp
+c debug_vec(6) = 1 => (unused)
+c debug_vec(7) = 1 => (unused)
+c---------------------------------------------------------------------
+ a(0) = -8.0D0/3.0D0
+ a(1) = 0.0D0
+ a(2) = 1.0D0/6.0D0
+ a(3) = 1.0D0/12.0D0
+
+ if(Class .eq. 'A' .or. Class .eq. 'S'.or. Class .eq.'W') then
+c---------------------------------------------------------------------
+c Coefficients for the S(a) smoother
+c---------------------------------------------------------------------
+ c(0) = -3.0D0/8.0D0
+ c(1) = +1.0D0/32.0D0
+ c(2) = -1.0D0/64.0D0
+ c(3) = 0.0D0
+ else
+c---------------------------------------------------------------------
+c Coefficients for the S(b) smoother
+c---------------------------------------------------------------------
+ c(0) = -3.0D0/17.0D0
+ c(1) = +1.0D0/33.0D0
+ c(2) = -1.0D0/61.0D0
+ c(3) = 0.0D0
+ endif
+ lb = 1
+ k = lt
+
+ call setup(n1,n2,n3,k)
+ call zero3(u,n1,n2,n3)
+ call zran3(v,n1,n2,n3,nx(lt),ny(lt),k)
+
+ call norm2u3(v,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
+
+ if( me .eq. root )then
+ write (*, 1001) nx(lt),ny(lt),nz(lt), Class
+ write (*, 1002) nit
+
+ 1000 format(//,' NAS Parallel Benchmarks 3.3 -- MG Benchmark', /)
+ 1001 format(' Size: ', i4, 'x', i4, 'x', i4, ' (class ', A, ')' )
+ 1002 format(' Iterations: ', i4)
+ 1003 format(' Number of processes: ', i6)
+ if (nprocs .ne. nprocs_compiled) then
+ write (*, 1004) nprocs_compiled
+ write (*, 1005) nprocs
+ 1004 format(' WARNING: compiled for ', i6, ' processes ')
+ 1005 format(' Number of active processes: ', i6, /)
+ else
+ write (*, 1003) nprocs
+ endif
+ endif
+
+ call resid(u,v,r,n1,n2,n3,a,k)
+ call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
+ old2 = rnm2
+ oldu = rnmu
+
+c---------------------------------------------------------------------
+c One iteration for startup
+c---------------------------------------------------------------------
+ call mg3P(u,v,r,a,c,n1,n2,n3,k)
+ call resid(u,v,r,n1,n2,n3,a,k)
+ call setup(n1,n2,n3,k)
+ call zero3(u,n1,n2,n3)
+ call zran3(v,n1,n2,n3,nx(lt),ny(lt),k)
+
+ call timer_stop(T_init)
+ if( me .eq. root )then
+ tinit = timer_read(T_init)
+ write( *,'(/A,F15.3,A/)' )
+ > ' Initialization time: ',tinit, ' seconds'
+ endif
+
+ call mpi_barrier(mpi_comm_world,ierr)
+
+ call timer_start(T_bench)
+
+ call resid(u,v,r,n1,n2,n3,a,k)
+ call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
+ old2 = rnm2
+ oldu = rnmu
+
+ do it=1,nit
+ if (it.eq.1 .or. it.eq.nit .or. mod(it,5).eq.0) then
+ if (me .eq. root) write(*,80) it
+ 80 format(' iter ',i4)
+ endif
+ call mg3P(u,v,r,a,c,n1,n2,n3,k)
+ call resid(u,v,r,n1,n2,n3,a,k)
+ enddo
+
+
+ call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
+
+ call timer_stop(T_bench)
+
+ t0 = timer_read(T_bench)
+
+ call mpi_reduce(t0,t,1,dp_type,
+ > mpi_max,root,mpi_comm_world,ierr)
+ verified = .FALSE.
+ verify_value = 0.0
+ if( me .eq. root )then
+ write(*,100)
+ 100 format(/' Benchmark completed ')
+
+ epsilon = 1.d-8
+ if (Class .ne. 'U') then
+ if(Class.eq.'S') then
+ verify_value = 0.5307707005734d-04
+ elseif(Class.eq.'W') then
+ verify_value = 0.6467329375339d-05
+ elseif(Class.eq.'A') then
+ verify_value = 0.2433365309069d-05
+ elseif(Class.eq.'B') then
+ verify_value = 0.1800564401355d-05
+ elseif(Class.eq.'C') then
+ verify_value = 0.5706732285740d-06
+ elseif(Class.eq.'D') then
+ verify_value = 0.1583275060440d-09
+ elseif(Class.eq.'E') then
+ verify_value = 0.5630442584711d-10
+ endif
+
+ err = abs( rnm2 - verify_value ) / verify_value
+ if( err .le. epsilon ) then
+ verified = .TRUE.
+ write(*, 200)
+ write(*, 201) rnm2
+ write(*, 202) err
+ 200 format(' VERIFICATION SUCCESSFUL ')
+ 201 format(' L2 Norm is ', E20.13)
+ 202 format(' Error is ', E20.13)
+ else
+ verified = .FALSE.
+ write(*, 300)
+ write(*, 301) rnm2
+ write(*, 302) verify_value
+ 300 format(' VERIFICATION FAILED')
+ 301 format(' L2 Norm is ', E20.13)
+ 302 format(' The correct L2 Norm is ', E20.13)
+ endif
+ else
+ verified = .FALSE.
+ write (*, 400)
+ write (*, 401)
+ write (*, 201) rnm2
+ 400 format(' Problem size unknown')
+ 401 format(' NO VERIFICATION PERFORMED')
+ endif
+
+ nn = 1.0d0*nx(lt)*ny(lt)*nz(lt)
+
+ if( t .ne. 0. ) then
+ mflops = 58.*1.0D-6*nit*nn / t
+ else
+ mflops = 0.0
+ endif
+
+ call print_results('MG', class, nx(lt), ny(lt), nz(lt),
+ > nit, nprocs_compiled, nprocs, t,
+ > mflops, ' floating point',
+ > verified, npbversion, compiletime,
+ > cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+
+
+ endif
+
+
+ call mpi_finalize(ierr)
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine setup(n1,n2,n3,k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer is1, is2, is3, ie1, ie2, ie3
+ common /grid/ is1,is2,is3,ie1,ie2,ie3
+
+ integer n1,n2,n3,k
+ integer dx, dy, log_p, d, i, j
+
+ integer ax, next(3),mi(3,maxlevel),mip(3,maxlevel)
+ integer ng(3,maxlevel)
+ integer idi(3), pi(3), idin(3,-1:1)
+ integer s, dir,ierr
+
+ do j=-1,1,1
+ do d=1,3
+ msg_type(d,j) = 100*(j+2+10*d)
+ enddo
+ enddo
+
+ ng(1,lt) = nx(lt)
+ ng(2,lt) = ny(lt)
+ ng(3,lt) = nz(lt)
+ do ax=1,3
+ next(ax) = 1
+ do k=lt-1,1,-1
+ ng(ax,k) = ng(ax,k+1)/2
+ enddo
+ enddo
+ 61 format(10i4)
+ do k=lt,1,-1
+ nx(k) = ng(1,k)
+ ny(k) = ng(2,k)
+ nz(k) = ng(3,k)
+ enddo
+
+ log_p = log(float(nprocs)+0.0001)/log(2.0)
+ dx = log_p/3
+ pi(1) = 2**dx
+ idi(1) = mod(me,pi(1))
+
+ dy = (log_p-dx)/2
+ pi(2) = 2**dy
+ idi(2) = mod((me/pi(1)),pi(2))
+
+ pi(3) = nprocs/(pi(1)*pi(2))
+ idi(3) = me/(pi(1)*pi(2))
+
+ do k = lt,1,-1
+ dead(k) = .false.
+ do ax = 1,3
+ take_ex(ax,k) = .false.
+ give_ex(ax,k) = .false.
+
+ mi(ax,k) = 2 +
+ > ((idi(ax)+1)*ng(ax,k))/pi(ax) -
+ > ((idi(ax)+0)*ng(ax,k))/pi(ax)
+ mip(ax,k) = 2 +
+ > ((next(ax)+idi(ax)+1)*ng(ax,k))/pi(ax) -
+ > ((next(ax)+idi(ax)+0)*ng(ax,k))/pi(ax)
+
+ if(mip(ax,k).eq.2.or.mi(ax,k).eq.2)then
+ next(ax) = 2*next(ax)
+ endif
+
+ if( k+1 .le. lt )then
+ if((mip(ax,k).eq.2).and.(mi(ax,k).eq.3))then
+ give_ex(ax,k+1) = .true.
+ endif
+ if((mip(ax,k).eq.3).and.(mi(ax,k).eq.2))then
+ take_ex(ax,k+1) = .true.
+ endif
+ endif
+ enddo
+
+ if( mi(1,k).eq.2 .or.
+ > mi(2,k).eq.2 .or.
+ > mi(3,k).eq.2 )then
+ dead(k) = .true.
+ endif
+ m1(k) = mi(1,k)
+ m2(k) = mi(2,k)
+ m3(k) = mi(3,k)
+
+ do ax=1,3
+ idin(ax,+1) = mod( idi(ax) + next(ax) + pi(ax) , pi(ax) )
+ idin(ax,-1) = mod( idi(ax) - next(ax) + pi(ax) , pi(ax) )
+ enddo
+ do dir = 1,-1,-2
+ nbr(1,dir,k) = idin(1,dir) + pi(1)
+ > *(idi(2) + pi(2)
+ > * idi(3))
+ nbr(2,dir,k) = idi(1) + pi(1)
+ > *(idin(2,dir) + pi(2)
+ > * idi(3))
+ nbr(3,dir,k) = idi(1) + pi(1)
+ > *(idi(2) + pi(2)
+ > * idin(3,dir))
+ enddo
+ enddo
+
+ k = lt
+ is1 = 2 + ng(1,k) - ((pi(1) -idi(1))*ng(1,lt))/pi(1)
+ ie1 = 1 + ng(1,k) - ((pi(1)-1-idi(1))*ng(1,lt))/pi(1)
+ n1 = 3 + ie1 - is1
+ is2 = 2 + ng(2,k) - ((pi(2) -idi(2))*ng(2,lt))/pi(2)
+ ie2 = 1 + ng(2,k) - ((pi(2)-1-idi(2))*ng(2,lt))/pi(2)
+ n2 = 3 + ie2 - is2
+ is3 = 2 + ng(3,k) - ((pi(3) -idi(3))*ng(3,lt))/pi(3)
+ ie3 = 1 + ng(3,k) - ((pi(3)-1-idi(3))*ng(3,lt))/pi(3)
+ n3 = 3 + ie3 - is3
+
+
+ ir(lt)=1
+ do j = lt-1, 1, -1
+ ir(j)=ir(j+1)+m1(j+1)*m2(j+1)*m3(j+1)
+ enddo
+
+
+ if( debug_vec(1) .ge. 1 )then
+ if( me .eq. root )write(*,*)' in setup, '
+ if( me .eq. root )write(*,*)' me k lt nx ny nz ',
+ > ' n1 n2 n3 is1 is2 is3 ie1 ie2 ie3'
+ do i=0,nprocs-1
+ if( me .eq. i )then
+ write(*,9) me,k,lt,ng(1,k),ng(2,k),ng(3,k),
+ > n1,n2,n3,is1,is2,is3,ie1,ie2,ie3
+ 9 format(15i4)
+ endif
+ call mpi_barrier(mpi_comm_world,ierr)
+ enddo
+ endif
+ if( debug_vec(1) .ge. 2 )then
+ do i=0,nprocs-1
+ if( me .eq. i )then
+ write(*,*)' '
+ write(*,*)' processor =',me
+ do k=lt,1,-1
+ write(*,7)k,idi(1),idi(2),idi(3),
+ > ((nbr(d,j,k),j=-1,1,2),d=1,3),
+ > (mi(d,k),d=1,3)
+ enddo
+ 7 format(i4,'idi=',3i4,'nbr=',3(2i4,' '),'mi=',3i4,' ')
+ write(*,*)'idi(s) = ',(idi(s),s=1,3)
+ write(*,*)'dead(2), dead(1) = ',dead(2),dead(1)
+ do ax=1,3
+ write(*,*)'give_ex(ax,2)= ',give_ex(ax,2)
+ write(*,*)'take_ex(ax,2)= ',take_ex(ax,2)
+ enddo
+ endif
+ call mpi_barrier(mpi_comm_world,ierr)
+ enddo
+ endif
+
+ k = lt
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine mg3P(u,v,r,a,c,n1,n2,n3,k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c multigrid V-cycle routine
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer n1, n2, n3, k
+ double precision u(nr),v(nv),r(nr)
+ double precision a(0:3),c(0:3)
+
+ integer j
+
+c---------------------------------------------------------------------
+c down cycle.
+c restrict the residual from the find grid to the coarse
+c---------------------------------------------------------------------
+
+ do k= lt, lb+1 , -1
+ j = k-1
+ call rprj3(r(ir(k)),m1(k),m2(k),m3(k),
+ > r(ir(j)),m1(j),m2(j),m3(j),k)
+ enddo
+
+ k = lb
+c---------------------------------------------------------------------
+c compute an approximate solution on the coarsest grid
+c---------------------------------------------------------------------
+ call zero3(u(ir(k)),m1(k),m2(k),m3(k))
+ call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k)
+
+ do k = lb+1, lt-1
+ j = k-1
+c---------------------------------------------------------------------
+c prolongate from level k-1 to k
+c---------------------------------------------------------------------
+ call zero3(u(ir(k)),m1(k),m2(k),m3(k))
+ call interp(u(ir(j)),m1(j),m2(j),m3(j),
+ > u(ir(k)),m1(k),m2(k),m3(k),k)
+c---------------------------------------------------------------------
+c compute residual for level k
+c---------------------------------------------------------------------
+ call resid(u(ir(k)),r(ir(k)),r(ir(k)),m1(k),m2(k),m3(k),a,k)
+c---------------------------------------------------------------------
+c apply smoother
+c---------------------------------------------------------------------
+ call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k)
+ enddo
+ 200 continue
+ j = lt - 1
+ k = lt
+ call interp(u(ir(j)),m1(j),m2(j),m3(j),u,n1,n2,n3,k)
+ call resid(u,v,r,n1,n2,n3,a,k)
+ call psinv(r,u,n1,n2,n3,c,k)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine psinv( r,u,n1,n2,n3,c,k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c psinv applies an approximate inverse as smoother: u = u + Cr
+c
+c This implementation costs 15A + 4M per result, where
+c A and M denote the costs of Addition and Multiplication.
+c Presuming coefficient c(3) is zero (the NPB assumes this,
+c but it is thus not a general case), 2A + 1M may be eliminated,
+c resulting in 13A + 3M.
+c Note that this vectorizes, and is also fine for cache
+c based machines.
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'globals.h'
+
+ integer n1,n2,n3,k
+ double precision u(n1,n2,n3),r(n1,n2,n3),c(0:3)
+ integer i3, i2, i1
+
+ double precision r1(m), r2(m)
+
+ do i3=2,n3-1
+ do i2=2,n2-1
+ do i1=1,n1
+ r1(i1) = r(i1,i2-1,i3) + r(i1,i2+1,i3)
+ > + r(i1,i2,i3-1) + r(i1,i2,i3+1)
+ r2(i1) = r(i1,i2-1,i3-1) + r(i1,i2+1,i3-1)
+ > + r(i1,i2-1,i3+1) + r(i1,i2+1,i3+1)
+ enddo
+ do i1=2,n1-1
+ u(i1,i2,i3) = u(i1,i2,i3)
+ > + c(0) * r(i1,i2,i3)
+ > + c(1) * ( r(i1-1,i2,i3) + r(i1+1,i2,i3)
+ > + r1(i1) )
+ > + c(2) * ( r2(i1) + r1(i1-1) + r1(i1+1) )
+c---------------------------------------------------------------------
+c Assume c(3) = 0 (Enable line below if c(3) not= 0)
+c---------------------------------------------------------------------
+c > + c(3) * ( r2(i1-1) + r2(i1+1) )
+c---------------------------------------------------------------------
+ enddo
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c exchange boundary points
+c---------------------------------------------------------------------
+ call comm3(u,n1,n2,n3,k)
+
+ if( debug_vec(0) .ge. 1 )then
+ call rep_nrm(u,n1,n2,n3,' psinv',k)
+ endif
+
+ if( debug_vec(3) .ge. k )then
+ call showall(u,n1,n2,n3)
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine resid( u,v,r,n1,n2,n3,a,k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c resid computes the residual: r = v - Au
+c
+c This implementation costs 15A + 4M per result, where
+c A and M denote the costs of Addition (or Subtraction) and
+c Multiplication, respectively.
+c Presuming coefficient a(1) is zero (the NPB assumes this,
+c but it is thus not a general case), 3A + 1M may be eliminated,
+c resulting in 12A + 3M.
+c Note that this vectorizes, and is also fine for cache
+c based machines.
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'globals.h'
+
+ integer n1,n2,n3,k
+ double precision u(n1,n2,n3),v(n1,n2,n3),r(n1,n2,n3),a(0:3)
+ integer i3, i2, i1
+ double precision u1(m), u2(m)
+
+ do i3=2,n3-1
+ do i2=2,n2-1
+ do i1=1,n1
+ u1(i1) = u(i1,i2-1,i3) + u(i1,i2+1,i3)
+ > + u(i1,i2,i3-1) + u(i1,i2,i3+1)
+ u2(i1) = u(i1,i2-1,i3-1) + u(i1,i2+1,i3-1)
+ > + u(i1,i2-1,i3+1) + u(i1,i2+1,i3+1)
+ enddo
+ do i1=2,n1-1
+ r(i1,i2,i3) = v(i1,i2,i3)
+ > - a(0) * u(i1,i2,i3)
+c---------------------------------------------------------------------
+c Assume a(1) = 0 (Enable 2 lines below if a(1) not= 0)
+c---------------------------------------------------------------------
+c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3)
+c > + u1(i1) )
+c---------------------------------------------------------------------
+ > - a(2) * ( u2(i1) + u1(i1-1) + u1(i1+1) )
+ > - a(3) * ( u2(i1-1) + u2(i1+1) )
+ enddo
+ enddo
+ enddo
+
+c---------------------------------------------------------------------
+c exchange boundary data
+c---------------------------------------------------------------------
+ call comm3(r,n1,n2,n3,k)
+
+ if( debug_vec(0) .ge. 1 )then
+ call rep_nrm(r,n1,n2,n3,' resid',k)
+ endif
+
+ if( debug_vec(2) .ge. k )then
+ call showall(r,n1,n2,n3)
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine rprj3( r,m1k,m2k,m3k,s,m1j,m2j,m3j,k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c rprj3 projects onto the next coarser grid,
+c using a trilinear Finite Element projection: s = r' = P r
+c
+c This implementation costs 20A + 4M per result, where
+c A and M denote the costs of Addition and Multiplication.
+c Note that this vectorizes, and is also fine for cache
+c based machines.
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer m1k, m2k, m3k, m1j, m2j, m3j,k
+ double precision r(m1k,m2k,m3k), s(m1j,m2j,m3j)
+ integer j3, j2, j1, i3, i2, i1, d1, d2, d3, j
+
+ double precision x1(m), y1(m), x2,y2
+
+
+ if(m1k.eq.3)then
+ d1 = 2
+ else
+ d1 = 1
+ endif
+
+ if(m2k.eq.3)then
+ d2 = 2
+ else
+ d2 = 1
+ endif
+
+ if(m3k.eq.3)then
+ d3 = 2
+ else
+ d3 = 1
+ endif
+
+ do j3=2,m3j-1
+ i3 = 2*j3-d3
+C i3 = 2*j3-1
+ do j2=2,m2j-1
+ i2 = 2*j2-d2
+C i2 = 2*j2-1
+
+ do j1=2,m1j
+ i1 = 2*j1-d1
+C i1 = 2*j1-1
+ x1(i1-1) = r(i1-1,i2-1,i3 ) + r(i1-1,i2+1,i3 )
+ > + r(i1-1,i2, i3-1) + r(i1-1,i2, i3+1)
+ y1(i1-1) = r(i1-1,i2-1,i3-1) + r(i1-1,i2-1,i3+1)
+ > + r(i1-1,i2+1,i3-1) + r(i1-1,i2+1,i3+1)
+ enddo
+
+ do j1=2,m1j-1
+ i1 = 2*j1-d1
+C i1 = 2*j1-1
+ y2 = r(i1, i2-1,i3-1) + r(i1, i2-1,i3+1)
+ > + r(i1, i2+1,i3-1) + r(i1, i2+1,i3+1)
+ x2 = r(i1, i2-1,i3 ) + r(i1, i2+1,i3 )
+ > + r(i1, i2, i3-1) + r(i1, i2, i3+1)
+ s(j1,j2,j3) =
+ > 0.5D0 * r(i1,i2,i3)
+ > + 0.25D0 * ( r(i1-1,i2,i3) + r(i1+1,i2,i3) + x2)
+ > + 0.125D0 * ( x1(i1-1) + x1(i1+1) + y2)
+ > + 0.0625D0 * ( y1(i1-1) + y1(i1+1) )
+ enddo
+
+ enddo
+ enddo
+
+
+ j = k-1
+ call comm3(s,m1j,m2j,m3j,j)
+
+ if( debug_vec(0) .ge. 1 )then
+ call rep_nrm(s,m1j,m2j,m3j,' rprj3',k-1)
+ endif
+
+ if( debug_vec(4) .ge. k )then
+ call showall(s,m1j,m2j,m3j)
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine interp( z,mm1,mm2,mm3,u,n1,n2,n3,k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c interp adds the trilinear interpolation of the correction
+c from the coarser grid to the current approximation: u = u + Qu'
+c
+c Observe that this implementation costs 16A + 4M, where
+c A and M denote the costs of Addition and Multiplication.
+c Note that this vectorizes, and is also fine for cache
+c based machines. Vector machines may get slightly better
+c performance however, with 8 separate "do i1" loops, rather than 4.
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer mm1, mm2, mm3, n1, n2, n3,k
+ double precision z(mm1,mm2,mm3),u(n1,n2,n3)
+ integer i3, i2, i1, d1, d2, d3, t1, t2, t3
+
+c note that m = 1037 in globals.h but for this only need to be
+c 535 to handle up to 1024^3
+c integer m
+c parameter( m=535 )
+ double precision z1(m),z2(m),z3(m)
+
+
+ if( n1 .ne. 3 .and. n2 .ne. 3 .and. n3 .ne. 3 ) then
+
+ do i3=1,mm3-1
+ do i2=1,mm2-1
+
+ do i1=1,mm1
+ z1(i1) = z(i1,i2+1,i3) + z(i1,i2,i3)
+ z2(i1) = z(i1,i2,i3+1) + z(i1,i2,i3)
+ z3(i1) = z(i1,i2+1,i3+1) + z(i1,i2,i3+1) + z1(i1)
+ enddo
+
+ do i1=1,mm1-1
+ u(2*i1-1,2*i2-1,2*i3-1)=u(2*i1-1,2*i2-1,2*i3-1)
+ > +z(i1,i2,i3)
+ u(2*i1,2*i2-1,2*i3-1)=u(2*i1,2*i2-1,2*i3-1)
+ > +0.5d0*(z(i1+1,i2,i3)+z(i1,i2,i3))
+ enddo
+ do i1=1,mm1-1
+ u(2*i1-1,2*i2,2*i3-1)=u(2*i1-1,2*i2,2*i3-1)
+ > +0.5d0 * z1(i1)
+ u(2*i1,2*i2,2*i3-1)=u(2*i1,2*i2,2*i3-1)
+ > +0.25d0*( z1(i1) + z1(i1+1) )
+ enddo
+ do i1=1,mm1-1
+ u(2*i1-1,2*i2-1,2*i3)=u(2*i1-1,2*i2-1,2*i3)
+ > +0.5d0 * z2(i1)
+ u(2*i1,2*i2-1,2*i3)=u(2*i1,2*i2-1,2*i3)
+ > +0.25d0*( z2(i1) + z2(i1+1) )
+ enddo
+ do i1=1,mm1-1
+ u(2*i1-1,2*i2,2*i3)=u(2*i1-1,2*i2,2*i3)
+ > +0.25d0* z3(i1)
+ u(2*i1,2*i2,2*i3)=u(2*i1,2*i2,2*i3)
+ > +0.125d0*( z3(i1) + z3(i1+1) )
+ enddo
+ enddo
+ enddo
+
+ else
+
+ if(n1.eq.3)then
+ d1 = 2
+ t1 = 1
+ else
+ d1 = 1
+ t1 = 0
+ endif
+
+ if(n2.eq.3)then
+ d2 = 2
+ t2 = 1
+ else
+ d2 = 1
+ t2 = 0
+ endif
+
+ if(n3.eq.3)then
+ d3 = 2
+ t3 = 1
+ else
+ d3 = 1
+ t3 = 0
+ endif
+
+ do i3=d3,mm3-1
+ do i2=d2,mm2-1
+ do i1=d1,mm1-1
+ u(2*i1-d1,2*i2-d2,2*i3-d3)=u(2*i1-d1,2*i2-d2,2*i3-d3)
+ > +z(i1,i2,i3)
+ enddo
+ do i1=1,mm1-1
+ u(2*i1-t1,2*i2-d2,2*i3-d3)=u(2*i1-t1,2*i2-d2,2*i3-d3)
+ > +0.5D0*(z(i1+1,i2,i3)+z(i1,i2,i3))
+ enddo
+ enddo
+ do i2=1,mm2-1
+ do i1=d1,mm1-1
+ u(2*i1-d1,2*i2-t2,2*i3-d3)=u(2*i1-d1,2*i2-t2,2*i3-d3)
+ > +0.5D0*(z(i1,i2+1,i3)+z(i1,i2,i3))
+ enddo
+ do i1=1,mm1-1
+ u(2*i1-t1,2*i2-t2,2*i3-d3)=u(2*i1-t1,2*i2-t2,2*i3-d3)
+ > +0.25D0*(z(i1+1,i2+1,i3)+z(i1+1,i2,i3)
+ > +z(i1, i2+1,i3)+z(i1, i2,i3))
+ enddo
+ enddo
+ enddo
+
+ do i3=1,mm3-1
+ do i2=d2,mm2-1
+ do i1=d1,mm1-1
+ u(2*i1-d1,2*i2-d2,2*i3-t3)=u(2*i1-d1,2*i2-d2,2*i3-t3)
+ > +0.5D0*(z(i1,i2,i3+1)+z(i1,i2,i3))
+ enddo
+ do i1=1,mm1-1
+ u(2*i1-t1,2*i2-d2,2*i3-t3)=u(2*i1-t1,2*i2-d2,2*i3-t3)
+ > +0.25D0*(z(i1+1,i2,i3+1)+z(i1,i2,i3+1)
+ > +z(i1+1,i2,i3 )+z(i1,i2,i3 ))
+ enddo
+ enddo
+ do i2=1,mm2-1
+ do i1=d1,mm1-1
+ u(2*i1-d1,2*i2-t2,2*i3-t3)=u(2*i1-d1,2*i2-t2,2*i3-t3)
+ > +0.25D0*(z(i1,i2+1,i3+1)+z(i1,i2,i3+1)
+ > +z(i1,i2+1,i3 )+z(i1,i2,i3 ))
+ enddo
+ do i1=1,mm1-1
+ u(2*i1-t1,2*i2-t2,2*i3-t3)=u(2*i1-t1,2*i2-t2,2*i3-t3)
+ > +0.125D0*(z(i1+1,i2+1,i3+1)+z(i1+1,i2,i3+1)
+ > +z(i1 ,i2+1,i3+1)+z(i1 ,i2,i3+1)
+ > +z(i1+1,i2+1,i3 )+z(i1+1,i2,i3 )
+ > +z(i1 ,i2+1,i3 )+z(i1 ,i2,i3 ))
+ enddo
+ enddo
+ enddo
+
+ endif
+
+ call comm3_ex(u,n1,n2,n3,k)
+
+ if( debug_vec(0) .ge. 1 )then
+ call rep_nrm(z,mm1,mm2,mm3,'z: inter',k-1)
+ call rep_nrm(u,n1,n2,n3,'u: inter',k)
+ endif
+
+ if( debug_vec(5) .ge. k )then
+ call showall(z,mm1,mm2,mm3)
+ call showall(u,n1,n2,n3)
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine norm2u3(r,n1,n2,n3,rnm2,rnmu,nx,ny,nz)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c norm2u3 evaluates approximations to the L2 norm and the
+c uniform (or L-infinity or Chebyshev) norm, under the
+c assumption that the boundaries are periodic or zero. Add the
+c boundaries in with half weight (quarter weight on the edges
+c and eighth weight at the corners) for inhomogeneous boundaries.
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+
+ integer n1, n2, n3, nx, ny, nz
+ double precision rnm2, rnmu, r(n1,n2,n3)
+ double precision s, a, ss
+ integer i3, i2, i1, ierr
+
+ double precision dn
+
+ dn = 1.0d0*nx*ny*nz
+
+ s=0.0D0
+ rnmu = 0.0D0
+ do i3=2,n3-1
+ do i2=2,n2-1
+ do i1=2,n1-1
+ s=s+r(i1,i2,i3)**2
+ a=abs(r(i1,i2,i3))
+ if(a.gt.rnmu)rnmu=a
+ enddo
+ enddo
+ enddo
+
+ call mpi_allreduce(rnmu,ss,1,dp_type,
+ > mpi_max,mpi_comm_world,ierr)
+ rnmu = ss
+ call mpi_allreduce(s, ss, 1, dp_type,
+ > mpi_sum,mpi_comm_world,ierr)
+ s = ss
+ rnm2=sqrt( s / dn )
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine rep_nrm(u,n1,n2,n3,title,kk)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c report on norm
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer n1, n2, n3, kk
+ double precision u(n1,n2,n3)
+ character*8 title
+
+ double precision rnm2, rnmu
+
+
+ call norm2u3(u,n1,n2,n3,rnm2,rnmu,nx(kk),ny(kk),nz(kk))
+ if( me .eq. root )then
+ write(*,7)kk,title,rnm2,rnmu
+ 7 format(' Level',i2,' in ',a8,': norms =',D21.14,D21.14)
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine comm3(u,n1,n2,n3,kk)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c comm3 organizes the communication on all borders
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer n1, n2, n3, kk
+ double precision u(n1,n2,n3)
+ integer axis
+
+ if( .not. dead(kk) )then
+ do axis = 1, 3
+ if( nprocs .ne. 1) then
+
+ call ready( axis, -1, kk )
+ call ready( axis, +1, kk )
+
+ call give3( axis, +1, u, n1, n2, n3, kk )
+ call give3( axis, -1, u, n1, n2, n3, kk )
+
+ call take3( axis, -1, u, n1, n2, n3 )
+ call take3( axis, +1, u, n1, n2, n3 )
+
+ else
+ call comm1p( axis, u, n1, n2, n3, kk )
+ endif
+ enddo
+ else
+ call zero3(u,n1,n2,n3)
+ endif
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine comm3_ex(u,n1,n2,n3,kk)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c comm3_ex communicates to expand the number of processors
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer n1, n2, n3, kk
+ double precision u(n1,n2,n3)
+ integer axis
+
+ do axis = 1, 3
+ if( nprocs .ne. 1 ) then
+ if( take_ex( axis, kk ) )then
+ call ready( axis, -1, kk )
+ call ready( axis, +1, kk )
+ call take3_ex( axis, -1, u, n1, n2, n3 )
+ call take3_ex( axis, +1, u, n1, n2, n3 )
+ endif
+
+ if( give_ex( axis, kk ) )then
+ call give3_ex( axis, +1, u, n1, n2, n3, kk )
+ call give3_ex( axis, -1, u, n1, n2, n3, kk )
+ endif
+ else
+ call comm1p_ex( axis, u, n1, n2, n3, kk )
+ endif
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine ready( axis, dir, k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c ready allocates a buffer to take in a message
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer axis, dir, k
+ integer buff_id,buff_len,i,ierr
+
+ buff_id = 3 + dir
+ buff_len = nm2
+
+ do i=1,nm2
+ buff(i,buff_id) = 0.0D0
+ enddo
+
+
+c---------------------------------------------------------------------
+c fake message request type
+c---------------------------------------------------------------------
+ msg_id(axis,dir,1) = msg_type(axis,dir) +1000*me
+
+ call mpi_irecv( buff(1,buff_id), buff_len,
+ > dp_type, nbr(axis,-dir,k), msg_type(axis,dir),
+ > mpi_comm_world, msg_id(axis,dir,1), ierr)
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine give3( axis, dir, u, n1, n2, n3, k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c give3 sends border data out in the requested direction
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer axis, dir, n1, n2, n3, k, ierr
+ double precision u( n1, n2, n3 )
+
+ integer i3, i2, i1, buff_len,buff_id
+
+ buff_id = 2 + dir
+ buff_len = 0
+
+ if( axis .eq. 1 )then
+ if( dir .eq. -1 )then
+
+ do i3=2,n3-1
+ do i2=2,n2-1
+ buff_len = buff_len + 1
+ buff(buff_len,buff_id ) = u( 2, i2,i3)
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ else if( dir .eq. +1 ) then
+
+ do i3=2,n3-1
+ do i2=2,n2-1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( n1-1, i2,i3)
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ endif
+ endif
+
+ if( axis .eq. 2 )then
+ if( dir .eq. -1 )then
+
+ do i3=2,n3-1
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1, 2,i3)
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ else if( dir .eq. +1 ) then
+
+ do i3=2,n3-1
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id )= u( i1,n2-1,i3)
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ endif
+ endif
+
+ if( axis .eq. 3 )then
+ if( dir .eq. -1 )then
+
+ do i2=1,n2
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1,i2,2)
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ else if( dir .eq. +1 ) then
+
+ do i2=1,n2
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1,i2,n3-1)
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ endif
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine take3( axis, dir, u, n1, n2, n3 )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c take3 copies in border data from the requested direction
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer axis, dir, n1, n2, n3
+ double precision u( n1, n2, n3 )
+
+ integer buff_id, indx
+
+ integer status(mpi_status_size), ierr
+
+ integer i3, i2, i1
+
+ call mpi_wait( msg_id( axis, dir, 1 ),status,ierr)
+ buff_id = 3 + dir
+ indx = 0
+
+ if( axis .eq. 1 )then
+ if( dir .eq. -1 )then
+
+ do i3=2,n3-1
+ do i2=2,n2-1
+ indx = indx + 1
+ u(n1,i2,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+
+ else if( dir .eq. +1 ) then
+
+ do i3=2,n3-1
+ do i2=2,n2-1
+ indx = indx + 1
+ u(1,i2,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+
+ endif
+ endif
+
+ if( axis .eq. 2 )then
+ if( dir .eq. -1 )then
+
+ do i3=2,n3-1
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,n2,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+
+ else if( dir .eq. +1 ) then
+
+ do i3=2,n3-1
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,1,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+
+ endif
+ endif
+
+ if( axis .eq. 3 )then
+ if( dir .eq. -1 )then
+
+ do i2=1,n2
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,i2,n3) = buff(indx, buff_id )
+ enddo
+ enddo
+
+ else if( dir .eq. +1 ) then
+
+ do i2=1,n2
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,i2,1) = buff(indx, buff_id )
+ enddo
+ enddo
+
+ endif
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine give3_ex( axis, dir, u, n1, n2, n3, k )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c give3_ex sends border data out to expand number of processors
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer axis, dir, n1, n2, n3, k, ierr
+ double precision u( n1, n2, n3 )
+
+ integer i3, i2, i1, buff_len, buff_id
+
+ buff_id = 2 + dir
+ buff_len = 0
+
+ if( axis .eq. 1 )then
+ if( dir .eq. -1 )then
+
+ do i3=1,n3
+ do i2=1,n2
+ buff_len = buff_len + 1
+ buff(buff_len,buff_id ) = u( 2, i2,i3)
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ else if( dir .eq. +1 ) then
+
+ do i3=1,n3
+ do i2=1,n2
+ do i1=n1-1,n1
+ buff_len = buff_len + 1
+ buff(buff_len,buff_id)= u(i1,i2,i3)
+ enddo
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ endif
+ endif
+
+ if( axis .eq. 2 )then
+ if( dir .eq. -1 )then
+
+ do i3=1,n3
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1, 2,i3)
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ else if( dir .eq. +1 ) then
+
+ do i3=1,n3
+ do i2=n2-1,n2
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len,buff_id )= u(i1,i2,i3)
+ enddo
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ endif
+ endif
+
+ if( axis .eq. 3 )then
+ if( dir .eq. -1 )then
+
+ do i2=1,n2
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1,i2,2)
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ else if( dir .eq. +1 ) then
+
+ do i3=n3-1,n3
+ do i2=1,n2
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1,i2,i3)
+ enddo
+ enddo
+ enddo
+
+ call mpi_send(
+ > buff(1, buff_id ), buff_len,dp_type,
+ > nbr( axis, dir, k ), msg_type(axis,dir),
+ > mpi_comm_world, ierr)
+
+ endif
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine take3_ex( axis, dir, u, n1, n2, n3 )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c take3_ex copies in border data to expand number of processors
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer axis, dir, n1, n2, n3
+ double precision u( n1, n2, n3 )
+
+ integer buff_id, indx
+
+ integer status(mpi_status_size) , ierr
+
+ integer i3, i2, i1
+
+ call mpi_wait( msg_id( axis, dir, 1 ),status,ierr)
+ buff_id = 3 + dir
+ indx = 0
+
+ if( axis .eq. 1 )then
+ if( dir .eq. -1 )then
+
+ do i3=1,n3
+ do i2=1,n2
+ indx = indx + 1
+ u(n1,i2,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+
+ else if( dir .eq. +1 ) then
+
+ do i3=1,n3
+ do i2=1,n2
+ do i1=1,2
+ indx = indx + 1
+ u(i1,i2,i3) = buff(indx,buff_id)
+ enddo
+ enddo
+ enddo
+
+ endif
+ endif
+
+ if( axis .eq. 2 )then
+ if( dir .eq. -1 )then
+
+ do i3=1,n3
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,n2,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+
+ else if( dir .eq. +1 ) then
+
+ do i3=1,n3
+ do i2=1,2
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,i2,i3) = buff(indx,buff_id)
+ enddo
+ enddo
+ enddo
+
+ endif
+ endif
+
+ if( axis .eq. 3 )then
+ if( dir .eq. -1 )then
+
+ do i2=1,n2
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,i2,n3) = buff(indx, buff_id )
+ enddo
+ enddo
+
+ else if( dir .eq. +1 ) then
+
+ do i3=1,2
+ do i2=1,n2
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,i2,i3) = buff(indx,buff_id)
+ enddo
+ enddo
+ enddo
+
+ endif
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine comm1p( axis, u, n1, n2, n3, kk )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer axis, dir, n1, n2, n3
+ double precision u( n1, n2, n3 )
+
+ integer i3, i2, i1, buff_len,buff_id
+ integer i, kk, indx
+
+ dir = -1
+
+ buff_id = 3 + dir
+ buff_len = nm2
+
+ do i=1,nm2
+ buff(i,buff_id) = 0.0D0
+ enddo
+
+
+ dir = +1
+
+ buff_id = 3 + dir
+ buff_len = nm2
+
+ do i=1,nm2
+ buff(i,buff_id) = 0.0D0
+ enddo
+
+ dir = +1
+
+ buff_id = 2 + dir
+ buff_len = 0
+
+ if( axis .eq. 1 )then
+ do i3=2,n3-1
+ do i2=2,n2-1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( n1-1, i2,i3)
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 2 )then
+ do i3=2,n3-1
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id )= u( i1,n2-1,i3)
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 3 )then
+ do i2=1,n2
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1,i2,n3-1)
+ enddo
+ enddo
+ endif
+
+ dir = -1
+
+ buff_id = 2 + dir
+ buff_len = 0
+
+ if( axis .eq. 1 )then
+ do i3=2,n3-1
+ do i2=2,n2-1
+ buff_len = buff_len + 1
+ buff(buff_len,buff_id ) = u( 2, i2,i3)
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 2 )then
+ do i3=2,n3-1
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1, 2,i3)
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 3 )then
+ do i2=1,n2
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1,i2,2)
+ enddo
+ enddo
+ endif
+
+ do i=1,nm2
+ buff(i,4) = buff(i,3)
+ buff(i,2) = buff(i,1)
+ enddo
+
+ dir = -1
+
+ buff_id = 3 + dir
+ indx = 0
+
+ if( axis .eq. 1 )then
+ do i3=2,n3-1
+ do i2=2,n2-1
+ indx = indx + 1
+ u(n1,i2,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 2 )then
+ do i3=2,n3-1
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,n2,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 3 )then
+ do i2=1,n2
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,i2,n3) = buff(indx, buff_id )
+ enddo
+ enddo
+ endif
+
+
+ dir = +1
+
+ buff_id = 3 + dir
+ indx = 0
+
+ if( axis .eq. 1 )then
+ do i3=2,n3-1
+ do i2=2,n2-1
+ indx = indx + 1
+ u(1,i2,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 2 )then
+ do i3=2,n3-1
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,1,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 3 )then
+ do i2=1,n2
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,i2,1) = buff(indx, buff_id )
+ enddo
+ enddo
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine comm1p_ex( axis, u, n1, n2, n3, kk )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ include 'globals.h'
+
+ integer axis, dir, n1, n2, n3
+ double precision u( n1, n2, n3 )
+
+ integer i3, i2, i1, buff_len,buff_id
+ integer i, kk, indx
+
+ if( take_ex( axis, kk ) ) then
+
+ dir = -1
+
+ buff_id = 3 + dir
+ buff_len = nm2
+
+ do i=1,nm2
+ buff(i,buff_id) = 0.0D0
+ enddo
+
+
+ dir = +1
+
+ buff_id = 3 + dir
+ buff_len = nm2
+
+ do i=1,nm2
+ buff(i,buff_id) = 0.0D0
+ enddo
+
+
+ dir = -1
+
+ buff_id = 3 + dir
+ indx = 0
+
+ if( axis .eq. 1 )then
+ do i3=1,n3
+ do i2=1,n2
+ indx = indx + 1
+ u(n1,i2,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 2 )then
+ do i3=1,n3
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,n2,i3) = buff(indx, buff_id )
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 3 )then
+ do i2=1,n2
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,i2,n3) = buff(indx, buff_id )
+ enddo
+ enddo
+ endif
+
+ dir = +1
+
+ buff_id = 3 + dir
+ indx = 0
+
+ if( axis .eq. 1 )then
+ do i3=1,n3
+ do i2=1,n2
+ do i1=1,2
+ indx = indx + 1
+ u(i1,i2,i3) = buff(indx,buff_id)
+ enddo
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 2 )then
+ do i3=1,n3
+ do i2=1,2
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,i2,i3) = buff(indx,buff_id)
+ enddo
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 3 )then
+ do i3=1,2
+ do i2=1,n2
+ do i1=1,n1
+ indx = indx + 1
+ u(i1,i2,i3) = buff(indx,buff_id)
+ enddo
+ enddo
+ enddo
+ endif
+
+ endif
+
+ if( give_ex( axis, kk ) )then
+
+ dir = +1
+
+ buff_id = 2 + dir
+ buff_len = 0
+
+ if( axis .eq. 1 )then
+ do i3=1,n3
+ do i2=1,n2
+ do i1=n1-1,n1
+ buff_len = buff_len + 1
+ buff(buff_len,buff_id)= u(i1,i2,i3)
+ enddo
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 2 )then
+ do i3=1,n3
+ do i2=n2-1,n2
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len,buff_id )= u(i1,i2,i3)
+ enddo
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 3 )then
+ do i3=n3-1,n3
+ do i2=1,n2
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1,i2,i3)
+ enddo
+ enddo
+ enddo
+ endif
+
+ dir = -1
+
+ buff_id = 2 + dir
+ buff_len = 0
+
+ if( axis .eq. 1 )then
+ do i3=1,n3
+ do i2=1,n2
+ buff_len = buff_len + 1
+ buff(buff_len,buff_id ) = u( 2, i2,i3)
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 2 )then
+ do i3=1,n3
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1, 2,i3)
+ enddo
+ enddo
+ endif
+
+ if( axis .eq. 3 )then
+ do i2=1,n2
+ do i1=1,n1
+ buff_len = buff_len + 1
+ buff(buff_len, buff_id ) = u( i1,i2,2)
+ enddo
+ enddo
+ endif
+
+ endif
+
+ do i=1,nm2
+ buff(i,4) = buff(i,3)
+ buff(i,2) = buff(i,1)
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine zran3(z,n1,n2,n3,nx,ny,k)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c zran3 loads +1 at ten randomly chosen points,
+c loads -1 at a different ten random points,
+c and zero elsewhere.
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+
+ integer is1, is2, is3, ie1, ie2, ie3
+ common /grid/ is1,is2,is3,ie1,ie2,ie3
+
+ integer n1, n2, n3, k, nx, ny, ierr, i0, m0, m1
+ double precision z(n1,n2,n3)
+
+ integer mm, i1, i2, i3, d1, e1, e2, e3
+ double precision x, a
+ double precision xx, x0, x1, a1, a2, ai, power
+ parameter( mm = 10, a = 5.D0 ** 13, x = 314159265.D0)
+ double precision ten( mm, 0:1 ), temp, best
+ integer i, j1( mm, 0:1 ), j2( mm, 0:1 ), j3( mm, 0:1 )
+ integer jg( 0:3, mm, 0:1 ), jg_temp(4)
+
+ external randlc
+ double precision randlc, rdummy
+
+ a1 = power( a, nx, 1, 0 )
+ a2 = power( a, nx, ny, 0 )
+
+ call zero3(z,n1,n2,n3)
+
+c i = is1-2+nx*(is2-2+ny*(is3-2))
+
+ ai = power( a, nx, is2-2+ny*(is3-2), is1-2 )
+ d1 = ie1 - is1 + 1
+ e1 = ie1 - is1 + 2
+ e2 = ie2 - is2 + 2
+ e3 = ie3 - is3 + 2
+ x0 = x
+ rdummy = randlc( x0, ai )
+ do i3 = 2, e3
+ x1 = x0
+ do i2 = 2, e2
+ xx = x1
+ call vranlc( d1, xx, a, z( 2, i2, i3 ))
+ rdummy = randlc( x1, a1 )
+ enddo
+ rdummy = randlc( x0, a2 )
+ enddo
+
+c---------------------------------------------------------------------
+c call comm3(z,n1,n2,n3)
+c call showall(z,n1,n2,n3)
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c each processor looks for twenty candidates
+c---------------------------------------------------------------------
+ do i=1,mm
+ ten( i, 1 ) = 0.0D0
+ j1( i, 1 ) = 0
+ j2( i, 1 ) = 0
+ j3( i, 1 ) = 0
+ ten( i, 0 ) = 1.0D0
+ j1( i, 0 ) = 0
+ j2( i, 0 ) = 0
+ j3( i, 0 ) = 0
+ enddo
+
+ do i3=2,n3-1
+ do i2=2,n2-1
+ do i1=2,n1-1
+ if( z(i1,i2,i3) .gt. ten( 1, 1 ) )then
+ ten(1,1) = z(i1,i2,i3)
+ j1(1,1) = i1
+ j2(1,1) = i2
+ j3(1,1) = i3
+ call bubble( ten, j1, j2, j3, mm, 1 )
+ endif
+ if( z(i1,i2,i3) .lt. ten( 1, 0 ) )then
+ ten(1,0) = z(i1,i2,i3)
+ j1(1,0) = i1
+ j2(1,0) = i2
+ j3(1,0) = i3
+ call bubble( ten, j1, j2, j3, mm, 0 )
+ endif
+ enddo
+ enddo
+ enddo
+
+ call mpi_barrier(mpi_comm_world,ierr)
+
+c---------------------------------------------------------------------
+c Now which of these are globally best?
+c---------------------------------------------------------------------
+ i1 = mm
+ i0 = mm
+ do i=mm,1,-1
+
+ best = z( j1(i1,1), j2(i1,1), j3(i1,1) )
+ call mpi_allreduce(best,temp,1,dp_type,
+ > mpi_max,mpi_comm_world,ierr)
+ best = temp
+ if(best.eq.z(j1(i1,1),j2(i1,1),j3(i1,1)))then
+ jg( 0, i, 1) = me
+ jg( 1, i, 1) = is1 - 2 + j1( i1, 1 )
+ jg( 2, i, 1) = is2 - 2 + j2( i1, 1 )
+ jg( 3, i, 1) = is3 - 2 + j3( i1, 1 )
+ i1 = i1-1
+ else
+ jg( 0, i, 1) = 0
+ jg( 1, i, 1) = 0
+ jg( 2, i, 1) = 0
+ jg( 3, i, 1) = 0
+ endif
+ ten( i, 1 ) = best
+ call mpi_allreduce(jg(0,i,1), jg_temp,4,MPI_INTEGER,
+ > mpi_max,mpi_comm_world,ierr)
+ jg( 0, i, 1) = jg_temp(1)
+ jg( 1, i, 1) = jg_temp(2)
+ jg( 2, i, 1) = jg_temp(3)
+ jg( 3, i, 1) = jg_temp(4)
+
+ best = z( j1(i0,0), j2(i0,0), j3(i0,0) )
+ call mpi_allreduce(best,temp,1,dp_type,
+ > mpi_min,mpi_comm_world,ierr)
+ best = temp
+ if(best.eq.z(j1(i0,0),j2(i0,0),j3(i0,0)))then
+ jg( 0, i, 0) = me
+ jg( 1, i, 0) = is1 - 2 + j1( i0, 0 )
+ jg( 2, i, 0) = is2 - 2 + j2( i0, 0 )
+ jg( 3, i, 0) = is3 - 2 + j3( i0, 0 )
+ i0 = i0-1
+ else
+ jg( 0, i, 0) = 0
+ jg( 1, i, 0) = 0
+ jg( 2, i, 0) = 0
+ jg( 3, i, 0) = 0
+ endif
+ ten( i, 0 ) = best
+ call mpi_allreduce(jg(0,i,0), jg_temp,4,MPI_INTEGER,
+ > mpi_max,mpi_comm_world,ierr)
+ jg( 0, i, 0) = jg_temp(1)
+ jg( 1, i, 0) = jg_temp(2)
+ jg( 2, i, 0) = jg_temp(3)
+ jg( 3, i, 0) = jg_temp(4)
+
+ enddo
+ m1 = i1+1
+ m0 = i0+1
+
+c if( me .eq. root) then
+c write(*,*)' '
+c write(*,*)' negative charges at'
+c write(*,9)(jg(1,i,0),jg(2,i,0),jg(3,i,0),i=1,mm)
+c write(*,*)' positive charges at'
+c write(*,9)(jg(1,i,1),jg(2,i,1),jg(3,i,1),i=1,mm)
+c write(*,*)' small random numbers were'
+c write(*,8)(ten( i,0),i=mm,1,-1)
+c write(*,*)' and they were found on processor number'
+c write(*,7)(jg(0,i,0),i=mm,1,-1)
+c write(*,*)' large random numbers were'
+c write(*,8)(ten( i,1),i=mm,1,-1)
+c write(*,*)' and they were found on processor number'
+c write(*,7)(jg(0,i,1),i=mm,1,-1)
+c endif
+c 9 format(5(' (',i3,2(',',i3),')'))
+c 8 format(5D15.8)
+c 7 format(10i4)
+ call mpi_barrier(mpi_comm_world,ierr)
+ do i3=1,n3
+ do i2=1,n2
+ do i1=1,n1
+ z(i1,i2,i3) = 0.0D0
+ enddo
+ enddo
+ enddo
+ do i=mm,m0,-1
+ z( j1(i,0), j2(i,0), j3(i,0) ) = -1.0D0
+ enddo
+ do i=mm,m1,-1
+ z( j1(i,1), j2(i,1), j3(i,1) ) = +1.0D0
+ enddo
+ call comm3(z,n1,n2,n3,k)
+
+c---------------------------------------------------------------------
+c call showall(z,n1,n2,n3)
+c---------------------------------------------------------------------
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine show_l(z,n1,n2,n3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+
+ integer n1,n2,n3,i1,i2,i3,ierr
+ double precision z(n1,n2,n3)
+ integer m1, m2, m3,i
+
+ m1 = min(n1,18)
+ m2 = min(n2,14)
+ m3 = min(n3,18)
+
+ write(*,*)' '
+ do i=0,nprocs-1
+ if( me .eq. i )then
+ write(*,*)' id = ', me
+ do i3=1,m3
+ do i1=1,m1
+ write(*,6)(z(i1,i2,i3),i2=1,m2)
+ enddo
+ write(*,*)' - - - - - - - '
+ enddo
+ write(*,*)' '
+ 6 format(6f15.11)
+ endif
+ call mpi_barrier(mpi_comm_world,ierr)
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine showall(z,n1,n2,n3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+
+ integer n1,n2,n3,i1,i2,i3,i,ierr
+ double precision z(n1,n2,n3)
+ integer m1, m2, m3
+
+ m1 = min(n1,18)
+ m2 = min(n2,14)
+ m3 = min(n3,18)
+
+ write(*,*)' '
+ do i=0,nprocs-1
+ if( me .eq. i )then
+ write(*,*)' id = ', me
+ do i3=1,m3
+ do i1=1,m1
+ write(*,6)(z(i1,i2,i3),i2=1,m2)
+ enddo
+ write(*,*)' - - - - - - - '
+ enddo
+ write(*,*)' '
+ 6 format(15f6.3)
+ endif
+ call mpi_barrier(mpi_comm_world,ierr)
+ enddo
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine show(z,n1,n2,n3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+ integer n1,n2,n3,i1,i2,i3,ierr,i
+ double precision z(n1,n2,n3)
+
+ write(*,*)' '
+ do i=0,nprocs-1
+ if( me .eq. i )then
+ write(*,*)' id = ', me
+ do i3=2,n3-1
+ do i1=2,n1-1
+ write(*,6)(z(i1,i2,i3),i2=2,n1-1)
+ enddo
+ write(*,*)' - - - - - - - '
+ enddo
+ write(*,*)' '
+ 6 format(8D10.3)
+ endif
+ call mpi_barrier(mpi_comm_world,ierr)
+ enddo
+
+c call comm3(z,n1,n2,n3)
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ double precision function power( a, n1, n2, n3 )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c power raises an integer, disguised as a double
+c precision real, to an integer power.
+c This version tries to avoid integer overflow by treating
+c it as expressed in a form of "n1*n2+n3".
+c---------------------------------------------------------------------
+ implicit none
+
+ double precision a, aj
+ integer n1, n2, n3
+
+ integer n1j, n2j, nj
+ external randlc
+ double precision randlc, rdummy
+
+ power = 1.0d0
+ aj = a
+ nj = n3
+ n1j = n1
+ n2j = n2
+ 100 continue
+
+ if( n2j .gt. 0 ) then
+ if( mod(n2j,2) .eq. 1 ) nj = nj + n1j
+ n2j = n2j/2
+ else if( nj .eq. 0 ) then
+ go to 200
+ endif
+ if( mod(nj,2) .eq. 1 ) rdummy = randlc( power, aj )
+ rdummy = randlc( aj, aj )
+ nj = nj/2
+ go to 100
+
+ 200 continue
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine bubble( ten, j1, j2, j3, m, ind )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c bubble does a bubble sort in direction dir
+c---------------------------------------------------------------------
+ implicit none
+
+ include 'mpinpb.h'
+
+ integer m, ind, j1( m, 0:1 ), j2( m, 0:1 ), j3( m, 0:1 )
+ double precision ten( m, 0:1 )
+ double precision temp
+ integer i, j_temp
+
+ if( ind .eq. 1 )then
+
+ do i=1,m-1
+ if( ten(i,ind) .gt. ten(i+1,ind) )then
+
+ temp = ten( i+1, ind )
+ ten( i+1, ind ) = ten( i, ind )
+ ten( i, ind ) = temp
+
+ j_temp = j1( i+1, ind )
+ j1( i+1, ind ) = j1( i, ind )
+ j1( i, ind ) = j_temp
+
+ j_temp = j2( i+1, ind )
+ j2( i+1, ind ) = j2( i, ind )
+ j2( i, ind ) = j_temp
+
+ j_temp = j3( i+1, ind )
+ j3( i+1, ind ) = j3( i, ind )
+ j3( i, ind ) = j_temp
+
+ else
+ return
+ endif
+ enddo
+
+ else
+
+ do i=1,m-1
+ if( ten(i,ind) .lt. ten(i+1,ind) )then
+
+ temp = ten( i+1, ind )
+ ten( i+1, ind ) = ten( i, ind )
+ ten( i, ind ) = temp
+
+ j_temp = j1( i+1, ind )
+ j1( i+1, ind ) = j1( i, ind )
+ j1( i, ind ) = j_temp
+
+ j_temp = j2( i+1, ind )
+ j2( i+1, ind ) = j2( i, ind )
+ j2( i, ind ) = j_temp
+
+ j_temp = j3( i+1, ind )
+ j3( i+1, ind ) = j3( i, ind )
+ j3( i, ind ) = j_temp
+
+ else
+ return
+ endif
+ enddo
+
+ endif
+
+ return
+ end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine zero3(z,n1,n2,n3)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+ include 'mpinpb.h'
+
+ integer n1, n2, n3
+ double precision z(n1,n2,n3)
+ integer i1, i2, i3
+
+ do i3=1,n3
+ do i2=1,n2
+ do i1=1,n1
+ z(i1,i2,i3)=0.0D0
+ enddo
+ enddo
+ enddo
+
+ return
+ end
+
+
+c----- end of program ------------------------------------------------
--- /dev/null
+ 8 = top level
+ 256 256 256 = nx ny nz
+ 20 = nit
+ 0 0 0 0 0 0 0 0 = debug_vec
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'mpif.h'
+
+ integer me, nprocs, root, dp_type
+ common /mpistuff/ me, nprocs, root, dp_type
+
--- /dev/null
+# Makefile for MPI dummy library.
+# Must be edited for a specific machine. Does NOT read in
+# the make.def file of NPB 2.3
+F77 = f77
+CC = cc
+AR = ar
+
+# Enable if either Cray or IBM: (no such flag for most machines: see wtime.h)
+# MACHINE = -DCRAY
+# MACHINE = -DIBM
+
+libmpi.a: mpi_dummy.o mpi_dummy_c.o wtime.o
+ $(AR) r libmpi.a mpi_dummy.o mpi_dummy_c.o wtime.o
+
+mpi_dummy.o: mpi_dummy.f mpif.h
+ $(F77) -c mpi_dummy.f
+# For a Cray C90, try:
+# cf77 -dp -c mpi_dummy.f
+# For an IBM 590, try:
+# xlf -c mpi_dummy.f
+
+mpi_dummy_c.o: mpi_dummy.c mpi.h
+ $(CC) -c ${MACHINE} -o mpi_dummy_c.o mpi_dummy.c
+
+wtime.o: wtime.c
+# For most machines or CRAY or IBM
+ $(CC) -c ${MACHINE} wtime.c
+# For a precise timer on an SGI Power Challenge, try:
+# $(CC) -o wtime.o -c wtime_sgi64.c
+
+test: test.f
+ $(F77) -o test -I. test.f -L. -lmpi
+
+
+
+clean:
+ - rm -f *~ *.o
+ - rm -f test libmpi.a
--- /dev/null
+###########################################
+# NAS Parallel Benchmarks 2&3 #
+# MPI/F77/C #
+# Revision 3.3 #
+# NASA Ames Research Center #
+# npb@nas.nasa.gov #
+# http://www.nas.nasa.gov/Software/NPB/ #
+###########################################
+
+MPI Dummy Library
+
+
+The MPI dummy library is supplied as a convenience for people who do
+not have an MPI library but would like to try running on one processor
+anyway. The NPB 2.x/3.x benchmarks are designed so that they do not
+actually try to do any message passing when run on one node. The MPI
+dummy library is just that - a set of dummy MPI routines which don't
+do anything, but allow you to link the benchmarks. Actually they do a
+few things, but nothing important. Note that the dummy library is
+sufficient only for the NPB 2.x/3.x benchmarks. It probably won't be
+useful for anything else because it implements only a handful of
+functions.
+
+Because the dummy library is just an extra goody, and since we don't
+have an infinite amount of time, it may be a bit trickier to configure
+than the rest of the benchmarks. You need to:
+
+1. Find out how C and Fortran interact on your machine. On most machines,
+the fortran functon foo(x) is declared in C as foo_(xp) where xp is
+a pointer, not a value. On IBMs, it's just foo(xp). On Cray C90s, its
+FOO(xp). You can define CRAY or IBM to get these, or you need to
+edit wtime.c if you've got something else.
+
+2. Edit the Makefile to compile mpi_dummy.f and wtime.c correctly
+for your machine (including -DCRAY or -DIBM if necessary).
+
+3. The substitute MPI timer gives wall clock time, not CPU time.
+If you're running on a timeshared machine, you may want to
+use a CPU timer. Edit the function mpi_wtime() in mpi_dummy.f
+to change this timer. (NOTE: for official benchmark results,
+ONLY wall clock times are valid. Using a CPU timer is ok
+if you want to get things running, but don't report any results
+measured with a CPU timer. )
+
+TROUBLESHOOTING
+
+o Compiling or linking of the benchmark aborts because the dummy MPI
+ header file or the dummy MPI library cannot be found.
+ - the file make.dummy in subdirectory config relies on the use
+ of the -I"path" and -L"path" -l"library" constructs to pass
+ information to the compilers and linkers. Edit this file to conform
+ to your system.
--- /dev/null
+#define MPI_DOUBLE 1
+#define MPI_INT 2
+#define MPI_BYTE 3
+#define MPI_FLOAT 4
+#define MPI_LONG 5
+
+#define MPI_COMM_WORLD 0
+
+#define MPI_MAX 1
+#define MPI_SUM 2
+#define MPI_MIN 3
+
+#define MPI_SUCCESS 0
+#define MPI_ANY_SOURCE -1
+#define MPI_ERR_OTHER -1
+#define MPI_STATUS_SIZE 3
+
+
+/*
+ Status object. It is the only user-visible MPI data-structure
+ The "count" field is PRIVATE; use MPI_Get_count to access it.
+ */
+typedef struct {
+ int count;
+ int MPI_SOURCE;
+ int MPI_TAG;
+ int MPI_ERROR;
+} MPI_Status;
+
+
+/* MPI request objects */
+typedef int MPI_Request;
+
+/* MPI datatype */
+typedef int MPI_Datatype;
+
+/* MPI comm */
+typedef int MPI_Comm;
+
+/* MPI operation */
+typedef int MPI_Op;
+
+
+
+/* Prototypes: */
+void mpi_error( void );
+
+int MPI_Irecv( void *buf,
+ int count,
+ MPI_Datatype datatype,
+ int source,
+ int tag,
+ MPI_Comm comm,
+ MPI_Request *request );
+
+int MPI_Send( void *buf,
+ int count,
+ MPI_Datatype datatype,
+ int dest,
+ int tag,
+ MPI_Comm comm );
+
+int MPI_Wait( MPI_Request *request,
+ MPI_Status *status );
+
+int MPI_Init( int *argc,
+ char ***argv );
+
+int MPI_Comm_rank( MPI_Comm comm,
+ int *rank );
+
+int MPI_Comm_size( MPI_Comm comm,
+ int *size );
+
+double MPI_Wtime( void );
+
+int MPI_Barrier( MPI_Comm comm );
+
+int MPI_Finalize( void );
+
+int MPI_Allreduce( void *sendbuf,
+ void *recvbuf,
+ int nitems,
+ MPI_Datatype type,
+ MPI_Op op,
+ MPI_Comm comm );
+
+int MPI_Reduce( void *sendbuf,
+ void *recvbuf,
+ int nitems,
+ MPI_Datatype type,
+ MPI_Op op,
+ int root,
+ MPI_Comm comm );
+
+int MPI_Alltoall( void *sendbuf,
+ int sendcount,
+ MPI_Datatype sendtype,
+ void *recvbuf,
+ int recvcount,
+ MPI_Datatype recvtype,
+ MPI_Comm comm );
+
+int MPI_Alltoallv( void *sendbuf,
+ int *sendcounts,
+ int *senddispl,
+ MPI_Datatype sendtype,
+ void *recvbuf,
+ int *recvcounts,
+ int *recvdispl,
+ MPI_Datatype recvtype,
+ MPI_Comm comm );
--- /dev/null
+#include "mpi.h"
+#include "wtime.h"
+#include <stdlib.h>
+
+
+
+void mpi_error( void )
+{
+ printf( "mpi_error called\n" );
+ abort();
+}
+
+
+
+
+int MPI_Irecv( void *buf,
+ int count,
+ MPI_Datatype datatype,
+ int source,
+ int tag,
+ MPI_Comm comm,
+ MPI_Request *request )
+{
+ mpi_error();
+ return( MPI_ERR_OTHER );
+}
+
+
+
+
+int MPI_Recv( void *buf,
+ int count,
+ MPI_Datatype datatype,
+ int source,
+ int tag,
+ MPI_Comm comm,
+ MPI_Status *status )
+{
+ mpi_error();
+ return( MPI_ERR_OTHER );
+}
+
+
+
+
+int MPI_Send( void *buf,
+ int count,
+ MPI_Datatype datatype,
+ int dest,
+ int tag,
+ MPI_Comm comm )
+{
+ mpi_error();
+ return( MPI_ERR_OTHER );
+}
+
+
+
+
+int MPI_Wait( MPI_Request *request,
+ MPI_Status *status )
+{
+ mpi_error();
+ return( MPI_ERR_OTHER );
+}
+
+
+
+
+int MPI_Init( int *argc,
+ char ***argv )
+{
+ return( MPI_SUCCESS );
+}
+
+
+
+
+int MPI_Comm_rank( MPI_Comm comm,
+ int *rank )
+{
+ *rank = 0;
+ return( MPI_SUCCESS );
+}
+
+
+
+
+int MPI_Comm_size( MPI_Comm comm,
+ int *size )
+{
+ *size = 1;
+ return( MPI_SUCCESS );
+}
+
+
+
+
+double MPI_Wtime( void )
+{
+ void wtime();
+
+ double t;
+ wtime( &t );
+ return( t );
+}
+
+
+
+
+int MPI_Barrier( MPI_Comm comm )
+{
+ return( MPI_SUCCESS );
+}
+
+
+
+
+int MPI_Finalize( void )
+{
+ return( MPI_SUCCESS );
+}
+
+
+
+
+int MPI_Allreduce( void *sendbuf,
+ void *recvbuf,
+ int nitems,
+ MPI_Datatype type,
+ MPI_Op op,
+ MPI_Comm comm )
+{
+ int i;
+ if( type == MPI_INT )
+ {
+ int *pd_sendbuf, *pd_recvbuf;
+ pd_sendbuf = (int *) sendbuf;
+ pd_recvbuf = (int *) recvbuf;
+ for( i=0; i<nitems; i++ )
+ *(pd_recvbuf+i) = *(pd_sendbuf+i);
+ }
+ if( type == MPI_LONG )
+ {
+ long *pd_sendbuf, *pd_recvbuf;
+ pd_sendbuf = (long *) sendbuf;
+ pd_recvbuf = (long *) recvbuf;
+ for( i=0; i<nitems; i++ )
+ *(pd_recvbuf+i) = *(pd_sendbuf+i);
+ }
+ if( type == MPI_DOUBLE )
+ {
+ double *pd_sendbuf, *pd_recvbuf;
+ pd_sendbuf = (double *) sendbuf;
+ pd_recvbuf = (double *) recvbuf;
+ for( i=0; i<nitems; i++ )
+ *(pd_recvbuf+i) = *(pd_sendbuf+i);
+ }
+ return( MPI_SUCCESS );
+}
+
+
+
+
+int MPI_Reduce( void *sendbuf,
+ void *recvbuf,
+ int nitems,
+ MPI_Datatype type,
+ MPI_Op op,
+ int root,
+ MPI_Comm comm )
+{
+ int i;
+ if( type == MPI_INT )
+ {
+ int *pi_sendbuf, *pi_recvbuf;
+ pi_sendbuf = (int *) sendbuf;
+ pi_recvbuf = (int *) recvbuf;
+ for( i=0; i<nitems; i++ )
+ *(pi_recvbuf+i) = *(pi_sendbuf+i);
+ }
+ if( type == MPI_LONG )
+ {
+ long *pi_sendbuf, *pi_recvbuf;
+ pi_sendbuf = (long *) sendbuf;
+ pi_recvbuf = (long *) recvbuf;
+ for( i=0; i<nitems; i++ )
+ *(pi_recvbuf+i) = *(pi_sendbuf+i);
+ }
+ if( type == MPI_DOUBLE )
+ {
+ double *pd_sendbuf, *pd_recvbuf;
+ pd_sendbuf = (double *) sendbuf;
+ pd_recvbuf = (double *) recvbuf;
+ for( i=0; i<nitems; i++ )
+ *(pd_recvbuf+i) = *(pd_sendbuf+i);
+ }
+ return( MPI_SUCCESS );
+}
+
+
+
+
+int MPI_Alltoall( void *sendbuf,
+ int sendcount,
+ MPI_Datatype sendtype,
+ void *recvbuf,
+ int recvcount,
+ MPI_Datatype recvtype,
+ MPI_Comm comm )
+{
+ int i;
+ if( recvtype == MPI_INT )
+ {
+ int *pd_sendbuf, *pd_recvbuf;
+ pd_sendbuf = (int *) sendbuf;
+ pd_recvbuf = (int *) recvbuf;
+ for( i=0; i<sendcount; i++ )
+ *(pd_recvbuf+i) = *(pd_sendbuf+i);
+ }
+ if( recvtype == MPI_LONG )
+ {
+ long *pd_sendbuf, *pd_recvbuf;
+ pd_sendbuf = (long *) sendbuf;
+ pd_recvbuf = (long *) recvbuf;
+ for( i=0; i<sendcount; i++ )
+ *(pd_recvbuf+i) = *(pd_sendbuf+i);
+ }
+ return( MPI_SUCCESS );
+}
+
+
+
+
+int MPI_Alltoallv( void *sendbuf,
+ int *sendcounts,
+ int *senddispl,
+ MPI_Datatype sendtype,
+ void *recvbuf,
+ int *recvcounts,
+ int *recvdispl,
+ MPI_Datatype recvtype,
+ MPI_Comm comm )
+{
+ int i;
+ if( recvtype == MPI_INT )
+ {
+ int *pd_sendbuf, *pd_recvbuf;
+ pd_sendbuf = (int *) sendbuf;
+ pd_recvbuf = (int *) recvbuf;
+ for( i=0; i<sendcounts[0]; i++ )
+ *(pd_recvbuf+i+recvdispl[0]) = *(pd_sendbuf+i+senddispl[0]);
+ }
+ if( recvtype == MPI_LONG )
+ {
+ long *pd_sendbuf, *pd_recvbuf;
+ pd_sendbuf = (long *) sendbuf;
+ pd_recvbuf = (long *) recvbuf;
+ for( i=0; i<sendcounts[0]; i++ )
+ *(pd_recvbuf+i+recvdispl[0]) = *(pd_sendbuf+i+senddispl[0]);
+ }
+ return( MPI_SUCCESS );
+}
+
+
+
+
--- /dev/null
+ subroutine mpi_isend(buf,count,datatype,source,
+ & tag,comm,request,ierror)
+ integer buf(*), count,datatype,source,tag,comm,
+ & request,ierror
+ call mpi_error()
+ return
+ end
+
+ subroutine mpi_irecv(buf,count,datatype,source,
+ & tag,comm,request,ierror)
+ integer buf(*), count,datatype,source,tag,comm,
+ & request,ierror
+ call mpi_error()
+ return
+ end
+
+ subroutine mpi_send(buf,count,datatype,dest,tag,comm,ierror)
+ integer buf(*), count,datatype,dest,tag,comm,ierror
+ call mpi_error()
+ return
+ end
+
+ subroutine mpi_recv(buf,count,datatype,source,
+ & tag,comm,status,ierror)
+ integer buf(*), count,datatype,source,tag,comm,
+ & status(*),ierror
+ call mpi_error()
+ return
+ end
+
+ subroutine mpi_comm_split(comm,color,key,newcomm,ierror)
+ integer comm,color,key,newcomm,ierror
+ return
+ end
+
+ subroutine mpi_comm_rank(comm, rank,ierr)
+ implicit none
+ integer comm, rank,ierr
+ rank = 0
+ return
+ end
+
+ subroutine mpi_comm_size(comm, size, ierr)
+ implicit none
+ integer comm, size, ierr
+ size = 1
+ return
+ end
+
+ double precision function mpi_wtime()
+ implicit none
+ double precision t
+c This function must measure wall clock time, not CPU time.
+c Since there is no portable timer in Fortran (77)
+c we call a routine compiled in C (though the C source may have
+c to be tweaked).
+ call wtime(t)
+c The following is not ok for "official" results because it reports
+c CPU time not wall clock time. It may be useful for developing/testing
+c on timeshared Crays, though.
+c call second(t)
+
+ mpi_wtime = t
+
+ return
+ end
+
+
+c may be valid to call this in single processor case
+ subroutine mpi_barrier(comm,ierror)
+ return
+ end
+
+c may be valid to call this in single processor case
+ subroutine mpi_bcast(buf, nitems, type, root, comm, ierr)
+ implicit none
+ integer buf(*), nitems, type, root, comm, ierr
+ return
+ end
+
+ subroutine mpi_comm_dup(oldcomm, newcomm,ierror)
+ integer oldcomm, newcomm,ierror
+ newcomm= oldcomm
+ return
+ end
+
+ subroutine mpi_error()
+ print *, 'mpi_error called'
+ stop
+ end
+
+ subroutine mpi_abort(comm, errcode, ierr)
+ implicit none
+ integer comm, errcode, ierr
+ print *, 'mpi_abort called'
+ stop
+ end
+
+ subroutine mpi_finalize(ierr)
+ return
+ end
+
+ subroutine mpi_init(ierr)
+ return
+ end
+
+
+c assume double precision, which is all SP uses
+ subroutine mpi_reduce(inbuf, outbuf, nitems,
+ $ type, op, root, comm, ierr)
+ implicit none
+ include 'mpif.h'
+ integer nitems, type, op, root, comm, ierr
+ double precision inbuf(*), outbuf(*)
+
+ if (type .eq. mpi_double_precision) then
+ call mpi_reduce_dp(inbuf, outbuf, nitems,
+ $ type, op, root, comm, ierr)
+ else if (type .eq. mpi_double_complex) then
+ call mpi_reduce_dc(inbuf, outbuf, nitems,
+ $ type, op, root, comm, ierr)
+ else if (type .eq. mpi_complex) then
+ call mpi_reduce_complex(inbuf, outbuf, nitems,
+ $ type, op, root, comm, ierr)
+ else if (type .eq. mpi_real) then
+ call mpi_reduce_real(inbuf, outbuf, nitems,
+ $ type, op, root, comm, ierr)
+ else if (type .eq. mpi_integer) then
+ call mpi_reduce_int(inbuf, outbuf, nitems,
+ $ type, op, root, comm, ierr)
+ else
+ print *, 'mpi_reduce: unknown type ', type
+ end if
+ return
+ end
+
+
+ subroutine mpi_reduce_real(inbuf, outbuf, nitems,
+ $ type, op, root, comm, ierr)
+ implicit none
+ integer nitems, type, op, root, comm, ierr, i
+ real inbuf(*), outbuf(*)
+ do i = 1, nitems
+ outbuf(i) = inbuf(i)
+ end do
+
+ return
+ end
+
+ subroutine mpi_reduce_dp(inbuf, outbuf, nitems,
+ $ type, op, root, comm, ierr)
+ implicit none
+ integer nitems, type, op, root, comm, ierr, i
+ double precision inbuf(*), outbuf(*)
+ do i = 1, nitems
+ outbuf(i) = inbuf(i)
+ end do
+
+ return
+ end
+
+ subroutine mpi_reduce_dc(inbuf, outbuf, nitems,
+ $ type, op, root, comm, ierr)
+ implicit none
+ integer nitems, type, op, root, comm, ierr, i
+ double complex inbuf(*), outbuf(*)
+ do i = 1, nitems
+ outbuf(i) = inbuf(i)
+ end do
+
+ return
+ end
+
+
+ subroutine mpi_reduce_complex(inbuf, outbuf, nitems,
+ $ type, op, root, comm, ierr)
+ implicit none
+ integer nitems, type, op, root, comm, ierr, i
+ complex inbuf(*), outbuf(*)
+ do i = 1, nitems
+ outbuf(i) = inbuf(i)
+ end do
+
+ return
+ end
+
+ subroutine mpi_reduce_int(inbuf, outbuf, nitems,
+ $ type, op, root, comm, ierr)
+ implicit none
+ integer nitems, type, op, root, comm, ierr, i
+ integer inbuf(*), outbuf(*)
+ do i = 1, nitems
+ outbuf(i) = inbuf(i)
+ end do
+
+ return
+ end
+
+ subroutine mpi_allreduce(inbuf, outbuf, nitems,
+ $ type, op, comm, ierr)
+ implicit none
+ integer nitems, type, op, comm, ierr
+ double precision inbuf(*), outbuf(*)
+
+ call mpi_reduce(inbuf, outbuf, nitems,
+ $ type, op, 0, comm, ierr)
+ return
+ end
+
+ subroutine mpi_alltoall(inbuf, nitems, type, outbuf, nitems_dum,
+ $ type_dum, comm, ierr)
+ implicit none
+ include 'mpif.h'
+ integer nitems, type, comm, ierr, nitems_dum, type_dum
+ double precision inbuf(*), outbuf(*)
+ if (type .eq. mpi_double_precision) then
+ call mpi_alltoall_dp(inbuf, outbuf, nitems,
+ $ type, comm, ierr)
+ else if (type .eq. mpi_double_complex) then
+ call mpi_alltoall_dc(inbuf, outbuf, nitems,
+ $ type, comm, ierr)
+ else if (type .eq. mpi_complex) then
+ call mpi_alltoall_complex(inbuf, outbuf, nitems,
+ $ type, comm, ierr)
+ else if (type .eq. mpi_real) then
+ call mpi_alltoall_real(inbuf, outbuf, nitems,
+ $ type, comm, ierr)
+ else if (type .eq. mpi_integer) then
+ call mpi_alltoall_int(inbuf, outbuf, nitems,
+ $ type, comm, ierr)
+ else
+ print *, 'mpi_alltoall: unknown type ', type
+ end if
+ return
+ end
+
+ subroutine mpi_alltoall_dc(inbuf, outbuf, nitems,
+ $ type, comm, ierr)
+ implicit none
+ integer nitems, type, comm, ierr, i
+ double complex inbuf(*), outbuf(*)
+ do i = 1, nitems
+ outbuf(i) = inbuf(i)
+ end do
+
+ return
+ end
+
+
+ subroutine mpi_alltoall_complex(inbuf, outbuf, nitems,
+ $ type, comm, ierr)
+ implicit none
+ integer nitems, type, comm, ierr, i
+ double complex inbuf(*), outbuf(*)
+ do i = 1, nitems
+ outbuf(i) = inbuf(i)
+ end do
+
+ return
+ end
+
+ subroutine mpi_alltoall_dp(inbuf, outbuf, nitems,
+ $ type, comm, ierr)
+ implicit none
+ integer nitems, type, comm, ierr, i
+ double precision inbuf(*), outbuf(*)
+ do i = 1, nitems
+ outbuf(i) = inbuf(i)
+ end do
+
+ return
+ end
+
+ subroutine mpi_alltoall_real(inbuf, outbuf, nitems,
+ $ type, comm, ierr)
+ implicit none
+ integer nitems, type, comm, ierr, i
+ real inbuf(*), outbuf(*)
+ do i = 1, nitems
+ outbuf(i) = inbuf(i)
+ end do
+
+ return
+ end
+
+ subroutine mpi_alltoall_int(inbuf, outbuf, nitems,
+ $ type, comm, ierr)
+ implicit none
+ integer nitems, type, comm, ierr, i
+ integer inbuf(*), outbuf(*)
+ do i = 1, nitems
+ outbuf(i) = inbuf(i)
+ end do
+
+ return
+ end
+
+ subroutine mpi_wait(request,status,ierror)
+ integer request,status,ierror
+ call mpi_error()
+ return
+ end
+
+ subroutine mpi_waitall(count,requests,status,ierror)
+ integer count,requests(*),status(*),ierror
+ call mpi_error()
+ return
+ end
+
--- /dev/null
+ integer mpi_comm_world
+ parameter (mpi_comm_world = 0)
+
+ integer mpi_max, mpi_min, mpi_sum
+ parameter (mpi_max = 1, mpi_sum = 2, mpi_min = 3)
+
+ integer mpi_byte, mpi_integer, mpi_real,
+ > mpi_double_precision, mpi_complex,
+ > mpi_double_complex
+ parameter (mpi_double_precision = 1,
+ $ mpi_integer = 2,
+ $ mpi_byte = 3,
+ $ mpi_real= 4,
+ $ mpi_complex = 5,
+ $ mpi_double_complex = 6)
+
+ integer mpi_any_source
+ parameter (mpi_any_source = -1)
+
+ integer mpi_err_other
+ parameter (mpi_err_other = -1)
+
+ double precision mpi_wtime
+ external mpi_wtime
+
+ integer mpi_status_size
+ parameter (mpi_status_size=3)
--- /dev/null
+ program
+ implicit none
+ double precision t, mpi_wtime
+ external mpi_wtime
+ t = 0.0
+ t = mpi_wtime()
+ print *, t
+ t = mpi_wtime()
+ print *, t
+ end
--- /dev/null
+#include "wtime.h"
+#include <sys/time.h>
+
+void wtime(double *t)
+{
+ static int sec = -1;
+ struct timeval tv;
+ gettimeofday(&tv, (void *)0);
+ if (sec < 0) sec = tv.tv_sec;
+ *t = (tv.tv_sec - sec) + 1.0e-6*tv.tv_usec;
+}
+
+
--- /dev/null
+ subroutine wtime(tim)
+ real*8 tim
+ dimension tarray(2)
+ call etime(tarray)
+ tim = tarray(1)
+ return
+ end
+
+
+
+
+
--- /dev/null
+/* C/Fortran interface is different on different machines.
+ * You may need to tweak this.
+ */
+
+
+#if defined(IBM)
+#define wtime wtime
+#elif defined(CRAY)
+#define wtime WTIME
+#else
+#define wtime wtime_
+#endif
--- /dev/null
+#include <sys/types.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/syssgi.h>
+#include <sys/immu.h>
+#include <errno.h>
+#include <stdio.h>
+
+/* The following works on SGI Power Challenge systems */
+
+typedef unsigned long iotimer_t;
+
+unsigned int cycleval;
+volatile iotimer_t *iotimer_addr, base_counter;
+double resolution;
+
+/* address_t is an integer type big enough to hold an address */
+typedef unsigned long address_t;
+
+
+
+void timer_init()
+{
+
+ int fd;
+ char *virt_addr;
+ address_t phys_addr, page_offset, pagemask, pagebase_addr;
+
+ pagemask = getpagesize() - 1;
+ errno = 0;
+ phys_addr = syssgi(SGI_QUERY_CYCLECNTR, &cycleval);
+ if (errno != 0) {
+ perror("SGI_QUERY_CYCLECNTR");
+ exit(1);
+ }
+ /* rel_addr = page offset of physical address */
+ page_offset = phys_addr & pagemask;
+ pagebase_addr = phys_addr - page_offset;
+ fd = open("/dev/mmem", O_RDONLY);
+
+ virt_addr = mmap(0, pagemask, PROT_READ, MAP_PRIVATE, fd, pagebase_addr);
+ virt_addr = virt_addr + page_offset;
+ iotimer_addr = (iotimer_t *)virt_addr;
+ /* cycleval in picoseconds to this gives resolution in seconds */
+ resolution = 1.0e-12*cycleval;
+ base_counter = *iotimer_addr;
+}
+
+void wtime_(double *time)
+{
+ static int initialized = 0;
+ volatile iotimer_t counter_value;
+ if (!initialized) {
+ timer_init();
+ initialized = 1;
+ }
+ counter_value = *iotimer_addr - base_counter;
+ *time = (double)counter_value * resolution;
+}
+
+
+void wtime(double *time)
+{
+ static int initialized = 0;
+ volatile iotimer_t counter_value;
+ if (!initialized) {
+ timer_init();
+ initialized = 1;
+ }
+ counter_value = *iotimer_addr - base_counter;
+ *time = (double)counter_value * resolution;
+}
+
+
--- /dev/null
+SHELL=/bin/sh
+CLASS=U
+NPROCS=1
+SUBTYPE=
+VERSION=
+SFILE=config/suite.def
+
+default: header
+ @ sys/print_instructions
+
+BT: bt
+bt: header
+ cd BT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) SUBTYPE=$(SUBTYPE) VERSION=$(VERSION)
+
+SP: sp
+sp: header
+ cd SP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+LU: lu
+lu: header
+ cd LU; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) VERSION=$(VERSION)
+
+MG: mg
+mg: header
+ cd MG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+FT: ft
+ft: header
+ cd FT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+IS: is
+is: header
+ cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+CG: cg
+cg: header
+ cd CG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+EP: ep
+ep: header
+ cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+
+DT: dt
+dt: header
+ cd DT; $(MAKE) CLASS=$(CLASS)
+
+# Awk script courtesy cmg@cray.com, modified by Haoqiang Jin
+suite:
+ @ awk -f sys/suite.awk SMAKE=$(MAKE) $(SFILE) | $(SHELL)
+
+
+# It would be nice to make clean in each subdirectory (the targets
+# are defined) but on a really clean system this will won't work
+# because those makefiles need config/make.def
+clean:
+ - rm -f core
+ - rm -f *~ */core */*~ */*.o */npbparams.h */*.obj */*.exe
+ - rm -f MPI_dummy/test MPI_dummy/libmpi.a
+ - rm -f sys/setparams sys/makesuite sys/setparams.h
+ - rm -f btio.*.out*
+
+veryclean: clean
+ - rm -f config/make.def config/suite.def
+ - rm -f bin/sp.* bin/lu.* bin/mg.* bin/ft.* bin/bt.* bin/is.*
+ - rm -f bin/ep.* bin/cg.* bin/dt.*
+
+header:
+ @ sys/print_header
+
--- /dev/null
+*** Warning ***
+
+This version of benchmarks IS, DT and EP contain special
+tweaks to work with SMPI.
+
+
+
+The MPI implementation of NPB 3.3 (NPB3.3-MPI)
+--------------------------------------------------
+
+For problem reports and suggestions on the implementation,
+please contact:
+
+ NAS Parallel Benchmark Team
+ npb@nas.nasa.gov
+
+ http://www.nas.nasa.gov/Software/NPB
+
+
+This directory contains the MPI implementation of the NAS
+Parallel Benchmarks, Version 3.3 (NPB3.3-MPI). A brief
+summary of the new features introduced in this version is
+given below.
+
+For changes from different versions, see the Changes.log file
+included in the upper directory of this distribution.
+
+For explanation of compilation and running of the benchmarks,
+please refer to README.install. For a special note on DT, please
+see the README file in the DT subdirectory.
+
+
+New features in NPB3.3-MPI:
+ * NPB3.3-MPI introduces a new problem size (class E) to seven of
+ the benchmarks (BT, SP, LU, CG, MG, FT, and EP). The version
+ also includes a new problem size (class D) for the IS benchmark,
+ which was not present in the previous releases.
+
+ * The release is merged with the vector codes for the BT and LU
+ benchmarks, which can be selected with the VERSION=VEC option
+ during compilation. However, it should be noted that successful
+ vectorization highly depends on the compiler used. Some changes
+ to compiler directives for vectorization in the current codes
+ (see *_vec.f files) may be required.
+
+ * New improvements to BTIO (BT with IO subtypes):
+ - added I/O stats (I/O timing, data size written, I/O data rate)
+ - added an option for interleaving reads between writes through
+ the inputbt.data file. Although the data file size would be
+ smaller as a result, the total amount of data written is still
+ the same.
+
--- /dev/null
+Some explanations on the MPI implementation of NPB 3.3 (NPB3.3-MPI)
+----------------------------------------------------------------------
+
+NPB-MPI is a sample MPI implementation based on NPB2.4 and NPB3.0-SER.
+This implementation contains all eight original benchmarks:
+Seven in Fortran: BT, SP, LU, FT, CG, MG, and EP; one in C: IS,
+as well as the DT benchmark, written in C, introduced in NPB3.2-MPI.
+
+For changes from different versions, see the Changes.log file
+included in the upper directory of this distribution.
+
+This version has been tested, among others, on an SGI Origin3000 and
+an SGI Altix. For problem reports and suggestions on the implementation,
+please contact
+
+ NAS Parallel Benchmark Team
+ npb@nas.nasa.gov
+
+
+CAUTION *********************************
+When running the I/O benchmark, one or more data files will be written
+in the directory from which the executable is invoked. They are not
+deleted at the end of the program. A new run will overwrite the old
+file(s). If not enough space is available in the user partition, the
+program will fail. For classes C and D the disk space required is
+3 GB and 135 GB, respectively.
+*****************************************
+
+
+1. Compilation
+
+ NPB3-MPI uses the same directory tree as NPB3-SER (and NPB2.x) does.
+ Before compilation, one needs to check the configuration file
+ 'make.def' in the config directory and modify the file if necessary.
+ If it does not (yet) exist, copy 'make.def.template' or one of the
+ sample files in the NAS.samples subdirectory to 'make.def' and
+ edit the content for site- and machine-specific data. Then
+
+ make <benchmark-name> NPROCS=<number> CLASS=<class> \
+ [SUBTYPE=<type>] [VERSION=VEC]
+
+ where <benchmark-name> is "bt", "cg", "dt", "ep", "ft", "is",
+ "lu", "mg", or "sp"
+ <number> is the number of processes
+ <class> is "S", "W", "A", "B", "C", "D", or "E"
+
+ Classes C, D and E are not available for DT.
+ Class E is not available for IS.
+
+ The "VERSION=VEC" option is used for selecting the vectorized
+ versions of BT and LU.
+
+ Only when making the I/O benchmark:
+ <benchmark-name> is "bt"
+ <number>, <class> as above
+ <type> is "full", "simple", "fortran", or "epio"
+
+ Three parameters not used in the original BT benchmark are present in
+ the I/O benchmark. Two are set by default in the file BT/bt.f.
+ Changing them is optional.
+ One is set in make.def. It must be specified.
+
+ bt.f: collbuf_nodes: number of processes used to buffer data before
+ writing to file in the collective buffering mode
+ (<type> is "full").
+ collbuf_size: size of buffer (in bytes) per process used in
+ collective buffering
+
+ make.def: -DFORTRAN_REC_SIZE: Fortran I/O record length in bytes. This
+ is a system-specific value. It is part of the
+ definition string of variable CONVERTFLAG. Syntax:
+ "CONVERTFLAG = -DFORTRAN_REC_SIZE=n", where n is
+ the record length.
+
+ When <type> is "full" or "simple", the code must be linked with an
+ MPI library that contains the subset of IO routines defined in MPI 2.
+
+
+ Class D for IS (Integer Sort) requires a compiler/system that
+ supports the "long" type in C to be 64-bit. As examples, the SGI
+ MIPS compiler for the SGI Origin using the "-64" compilation flag and
+ the Intel compiler for IA64 are known to work.
+
+
+ The above procedure allows you to build one benchmark
+ at a time. To build a whole suite, you can type "make suite"
+ Make will look in file "config/suite.def" for a list of
+ executables to build. The file contains one line per specification,
+ with comments preceded by "#". Each line contains the name
+ of a benchmark, the class, and the number of processors, separated
+ by spaces or tabs. config/suite.def.template contains an example
+ of such a file.
+
+
+ The benchmarks have been designed so that they can be run
+ on a single processor without an MPI library. A few "dummy"
+ MPI routines are still required for linking. For convenience
+ such a library is supplied in the "MPI_dummy" subdirectory of
+ the distribution. It contains an mpif.h and mpi.f include files
+ which must be used as well. The dummy library is built and
+ linked automatically and paths to the include files are defined
+ by inserting the line "include ../config/make.dummy" into the
+ make.def file (see example in make.def.template). Make sure to
+ read the warnings in the README file in "MPI_dummy".The use of
+ the library is fragile and can produce unexpected errors.
+
+
+ ================================
+
+ The "RAND" variable in make.def
+ --------------------------------
+
+ Most of the NPBs use a random number generator. In two of the NPBs (FT
+ and EP) the computation of random numbers is included in the timed
+ part of the calculation, and it is important that the random number
+ generator be efficient. The default random number generator package
+ provided is called "randi8" and should be used where possible. It has
+ the following requirements:
+
+ randi8:
+ 1. Uses integer*8 arithmetic. Compiler must support integer*8
+ 2. Uses the Fortran 90 IAND intrinsic. Compiler must support IAND.
+ 3. Assumes overflow bits are discarded by the hardware. In particular,
+ that the lowest 46 bits of a*b are always correct, even if the
+ result a*b is larger than 2^64.
+
+ Since randi8 may not work on all machines, we supply the following
+ alternatives:
+
+ randi8_safe
+ 1. Uses integer*8 arithmetic
+ 2. Uses the Fortran 90 IBITS intrinsic.
+ 3. Does not make any assumptions about overflow. Should always
+ work correctly if compiler supports integer*8 and IBITS.
+
+ randdp
+ 1. Uses double precision arithmetic (to simulate integer*8 operations).
+ Should work with any system with support for 64-bit floating
+ point arithmetic.
+
+ randdpvec
+ 1. Similar to randdp but written to be easier to vectorize.
+
+
+2. Execution
+
+ The executable is named <benchmark-name>.<class>.<nprocs>[.<suffix>],
+ where <suffix> is "fortran_io", "mpi_io_simple", "ep_io", or
+ "mpi_io_full"
+ The executable is placed in the bin subdirectory (or in the directory
+ BINDIR specified in make.def, if you've defined it). The method for
+ running the MPI program depends on your local system.
+ When any of the I/O benchmarks is run (non-empty subtype), one or
+ more output files are created, and placed in the directory from which
+ the program was started. These are not removed automatically, and
+ will be overwritten the next time an IO benchmark is run.
--- /dev/null
+SHELL=/bin/sh
+BENCHMARK=sp
+BENCHMARKU=SP
+
+include ../config/make.def
+
+
+OBJS = sp.o make_set.o initialize.o exact_solution.o exact_rhs.o \
+ set_constants.o adi.o define.o copy_faces.o rhs.o \
+ lhsx.o lhsy.o lhsz.o x_solve.o ninvr.o y_solve.o pinvr.o \
+ z_solve.o tzetar.o add.o txinvr.o error.o verify.o setup_mpi.o \
+ ${COMMON}/print_results.o ${COMMON}/timers.o
+
+include ../sys/make.common
+
+# npbparams.h is included by header.h
+# The following rule should do the trick but many make programs (not gmake)
+# will do the wrong thing and rebuild the world every time (because the
+# mod time on header.h is not changed. One solution would be to
+# touch header.h but this might cause confusion if someone has
+# accidentally deleted it. Instead, make the dependency on npbparams.h
+# explicit in all the lines below (even though dependence is indirect).
+
+# header.h: npbparams.h
+
+${PROGRAM}: config ${OBJS}
+ ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
+
+.f.o:
+ ${FCOMPILE} $<
+
+sp.o: sp.f header.h npbparams.h mpinpb.h
+make_set.o: make_set.f header.h npbparams.h mpinpb.h
+initialize.o: initialize.f header.h npbparams.h
+exact_solution.o: exact_solution.f header.h npbparams.h
+exact_rhs.o: exact_rhs.f header.h npbparams.h
+set_constants.o: set_constants.f header.h npbparams.h
+adi.o: adi.f header.h npbparams.h
+define.o: define.f header.h npbparams.h
+copy_faces.o: copy_faces.f header.h npbparams.h mpinpb.h
+rhs.o: rhs.f header.h npbparams.h
+lhsx.o: lhsx.f header.h npbparams.h
+lhsy.o: lhsy.f header.h npbparams.h
+lhsz.o: lhsz.f header.h npbparams.h
+x_solve.o: x_solve.f header.h npbparams.h mpinpb.h
+ninvr.o: ninvr.f header.h npbparams.h
+y_solve.o: y_solve.f header.h npbparams.h mpinpb.h
+pinvr.o: pinvr.f header.h npbparams.h
+z_solve.o: z_solve.f header.h npbparams.h mpinpb.h
+tzetar.o: tzetar.f header.h npbparams.h
+add.o: add.f header.h npbparams.h
+txinvr.o: txinvr.f header.h npbparams.h
+error.o: error.f header.h npbparams.h mpinpb.h
+verify.o: verify.f header.h npbparams.h mpinpb.h
+setup_mpi.o: setup_mpi.f mpinpb.h npbparams.h
+
+
+clean:
+ - rm -f *.o *~ mputil*
+ - rm -f npbparams.h core
--- /dev/null
+
+This code implements a 3D Multi-partition algorithm for the solution
+of the uncoupled systems of linear equations resulting from
+Beam-Warming approximate factorization. Consequently, the program
+must be run on a square number of processors. The included file
+"npbparams.h" contains a parameter statement which sets "maxcells"
+and "problem_size". The parameter maxcells must be set to the
+square root of the number of processors. For example, if running
+on 25 processors, then set max_cells=5. The standard problem sizes
+are problem_size=64 for class A, 102 for class B, and 162 for class C.
+
+The number of time steps and the time step size dt are set in the
+npbparams.h but may be overridden in the input deck "inputsp.data".
+The number of time steps is 400 for all three
+standard problems, and the appropriate time step sizes "dt" are
+0.0015d0 for class A, 0.001d0 for class B, and 0.00067 for class C.
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine add
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c addition of update to the vector u
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer c, i, j, k, m
+
+ do c = 1, ncells
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ u(i,j,k,m,c) = u(i,j,k,m,c) + rhs(i,j,k,m,c)
+ end do
+ end do
+ end do
+ end do
+ end do
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine adi
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ call copy_faces
+
+ call txinvr
+
+ call x_solve
+
+ call y_solve
+
+ call z_solve
+
+ call add
+
+ return
+ end
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine copy_faces
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function copies the face values of a variable defined on a set
+c of cells to the overlap locations of the adjacent sets of cells.
+c Because a set of cells interfaces in each direction with exactly one
+c other set, we only need to fill six different buffers. We could try to
+c overlap communication with computation, by computing
+c some internal values while communicating boundary values, but this
+c adds so much overhead that it's not clearly useful.
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i, j, k, c, m, requests(0:11), p0, p1,
+ > p2, p3, p4, p5, b_size(0:5), ss(0:5),
+ > sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11)
+
+c---------------------------------------------------------------------
+c exit immediately if there are no faces to be copied
+c---------------------------------------------------------------------
+ if (no_nodes .eq. 1) then
+ call compute_rhs
+ return
+ endif
+
+
+ ss(0) = start_send_east
+ ss(1) = start_send_west
+ ss(2) = start_send_north
+ ss(3) = start_send_south
+ ss(4) = start_send_top
+ ss(5) = start_send_bottom
+
+ sr(0) = start_recv_east
+ sr(1) = start_recv_west
+ sr(2) = start_recv_north
+ sr(3) = start_recv_south
+ sr(4) = start_recv_top
+ sr(5) = start_recv_bottom
+
+ b_size(0) = east_size
+ b_size(1) = west_size
+ b_size(2) = north_size
+ b_size(3) = south_size
+ b_size(4) = top_size
+ b_size(5) = bottom_size
+
+c---------------------------------------------------------------------
+c because the difference stencil for the diagonalized scheme is
+c orthogonal, we do not have to perform the staged copying of faces,
+c but can send all face information simultaneously to the neighboring
+c cells in all directions
+c---------------------------------------------------------------------
+ p0 = 0
+ p1 = 0
+ p2 = 0
+ p3 = 0
+ p4 = 0
+ p5 = 0
+
+ do c = 1, ncells
+ do m = 1, 5
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to eastern neighbors (i-dir)
+c---------------------------------------------------------------------
+ if (cell_coord(1,c) .ne. ncells) then
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = cell_size(1,c)-2, cell_size(1,c)-1
+ out_buffer(ss(0)+p0) = u(i,j,k,m,c)
+ p0 = p0 + 1
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to western neighbors
+c---------------------------------------------------------------------
+ if (cell_coord(1,c) .ne. 1) then
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, 1
+ out_buffer(ss(1)+p1) = u(i,j,k,m,c)
+ p1 = p1 + 1
+ end do
+ end do
+ end do
+
+
+ endif
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to northern neighbors (j_dir)
+c---------------------------------------------------------------------
+ if (cell_coord(2,c) .ne. ncells) then
+ do k = 0, cell_size(3,c)-1
+ do j = cell_size(2,c)-2, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ out_buffer(ss(2)+p2) = u(i,j,k,m,c)
+ p2 = p2 + 1
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to southern neighbors
+c---------------------------------------------------------------------
+ if (cell_coord(2,c).ne. 1) then
+ do k = 0, cell_size(3,c)-1
+ do j = 0, 1
+ do i = 0, cell_size(1,c)-1
+ out_buffer(ss(3)+p3) = u(i,j,k,m,c)
+ p3 = p3 + 1
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to top neighbors (k-dir)
+c---------------------------------------------------------------------
+ if (cell_coord(3,c) .ne. ncells) then
+ do k = cell_size(3,c)-2, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ out_buffer(ss(4)+p4) = u(i,j,k,m,c)
+ p4 = p4 + 1
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c fill the buffer to be sent to bottom neighbors
+c---------------------------------------------------------------------
+ if (cell_coord(3,c).ne. 1) then
+ do k=0, 1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ out_buffer(ss(5)+p5) = u(i,j,k,m,c)
+ p5 = p5 + 1
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c m loop
+c---------------------------------------------------------------------
+ end do
+
+c---------------------------------------------------------------------
+c cell loop
+c---------------------------------------------------------------------
+ end do
+
+ call mpi_irecv(in_buffer(sr(0)), b_size(0),
+ > dp_type, successor(1), WEST,
+ > comm_rhs, requests(0), error)
+ call mpi_irecv(in_buffer(sr(1)), b_size(1),
+ > dp_type, predecessor(1), EAST,
+ > comm_rhs, requests(1), error)
+ call mpi_irecv(in_buffer(sr(2)), b_size(2),
+ > dp_type, successor(2), SOUTH,
+ > comm_rhs, requests(2), error)
+ call mpi_irecv(in_buffer(sr(3)), b_size(3),
+ > dp_type, predecessor(2), NORTH,
+ > comm_rhs, requests(3), error)
+ call mpi_irecv(in_buffer(sr(4)), b_size(4),
+ > dp_type, successor(3), BOTTOM,
+ > comm_rhs, requests(4), error)
+ call mpi_irecv(in_buffer(sr(5)), b_size(5),
+ > dp_type, predecessor(3), TOP,
+ > comm_rhs, requests(5), error)
+
+ call mpi_isend(out_buffer(ss(0)), b_size(0),
+ > dp_type, successor(1), EAST,
+ > comm_rhs, requests(6), error)
+ call mpi_isend(out_buffer(ss(1)), b_size(1),
+ > dp_type, predecessor(1), WEST,
+ > comm_rhs, requests(7), error)
+ call mpi_isend(out_buffer(ss(2)), b_size(2),
+ > dp_type,successor(2), NORTH,
+ > comm_rhs, requests(8), error)
+ call mpi_isend(out_buffer(ss(3)), b_size(3),
+ > dp_type,predecessor(2), SOUTH,
+ > comm_rhs, requests(9), error)
+ call mpi_isend(out_buffer(ss(4)), b_size(4),
+ > dp_type,successor(3), TOP,
+ > comm_rhs, requests(10), error)
+ call mpi_isend(out_buffer(ss(5)), b_size(5),
+ > dp_type,predecessor(3), BOTTOM,
+ > comm_rhs,requests(11), error)
+
+
+ call mpi_waitall(12, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c unpack the data that has just been received;
+c---------------------------------------------------------------------
+ p0 = 0
+ p1 = 0
+ p2 = 0
+ p3 = 0
+ p4 = 0
+ p5 = 0
+
+ do c = 1, ncells
+ do m = 1, 5
+
+ if (cell_coord(1,c) .ne. 1) then
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = -2, -1
+ u(i,j,k,m,c) = in_buffer(sr(1)+p0)
+ p0 = p0 + 1
+ end do
+ end do
+ end do
+ endif
+
+ if (cell_coord(1,c) .ne. ncells) then
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = cell_size(1,c), cell_size(1,c)+1
+ u(i,j,k,m,c) = in_buffer(sr(0)+p1)
+ p1 = p1 + 1
+ end do
+ end do
+ end do
+ end if
+
+ if (cell_coord(2,c) .ne. 1) then
+ do k = 0, cell_size(3,c)-1
+ do j = -2, -1
+ do i = 0, cell_size(1,c)-1
+ u(i,j,k,m,c) = in_buffer(sr(3)+p2)
+ p2 = p2 + 1
+ end do
+ end do
+ end do
+
+ endif
+
+ if (cell_coord(2,c) .ne. ncells) then
+ do k = 0, cell_size(3,c)-1
+ do j = cell_size(2,c), cell_size(2,c)+1
+ do i = 0, cell_size(1,c)-1
+ u(i,j,k,m,c) = in_buffer(sr(2)+p3)
+ p3 = p3 + 1
+ end do
+ end do
+ end do
+ endif
+
+ if (cell_coord(3,c) .ne. 1) then
+ do k = -2, -1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ u(i,j,k,m,c) = in_buffer(sr(5)+p4)
+ p4 = p4 + 1
+ end do
+ end do
+ end do
+ endif
+
+ if (cell_coord(3,c) .ne. ncells) then
+ do k = cell_size(3,c), cell_size(3,c)+1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ u(i,j,k,m,c) = in_buffer(sr(4)+p5)
+ p5 = p5 + 1
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c m loop
+c---------------------------------------------------------------------
+ end do
+
+c---------------------------------------------------------------------
+c cells loop
+c---------------------------------------------------------------------
+ end do
+
+c---------------------------------------------------------------------
+c now that we have all the data, compute the rhs
+c---------------------------------------------------------------------
+ call compute_rhs
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine compute_buffer_size(dim)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer c, dim, face_size
+
+ if (ncells .eq. 1) return
+
+c---------------------------------------------------------------------
+c compute the actual sizes of the buffers; note that there is
+c always one cell face that doesn't need buffer space, because it
+c is at the boundary of the grid
+c---------------------------------------------------------------------
+
+ west_size = 0
+ east_size = 0
+
+ do c = 1, ncells
+ face_size = cell_size(2,c) * cell_size(3,c) * dim * 2
+ if (cell_coord(1,c).ne.1) west_size = west_size + face_size
+ if (cell_coord(1,c).ne.ncells) east_size = east_size +
+ > face_size
+ end do
+
+ north_size = 0
+ south_size = 0
+ do c = 1, ncells
+ face_size = cell_size(1,c)*cell_size(3,c) * dim * 2
+ if (cell_coord(2,c).ne.1) south_size = south_size + face_size
+ if (cell_coord(2,c).ne.ncells) north_size = north_size +
+ > face_size
+ end do
+
+ top_size = 0
+ bottom_size = 0
+ do c = 1, ncells
+ face_size = cell_size(1,c) * cell_size(2,c) * dim * 2
+ if (cell_coord(3,c).ne.1) bottom_size = bottom_size +
+ > face_size
+ if (cell_coord(3,c).ne.ncells) top_size = top_size +
+ > face_size
+ end do
+
+ start_send_west = 1
+ start_send_east = start_send_west + west_size
+ start_send_south = start_send_east + east_size
+ start_send_north = start_send_south + south_size
+ start_send_bottom = start_send_north + north_size
+ start_send_top = start_send_bottom + bottom_size
+ start_recv_west = 1
+ start_recv_east = start_recv_west + west_size
+ start_recv_south = start_recv_east + east_size
+ start_recv_north = start_recv_south + south_size
+ start_recv_bottom = start_recv_north + north_size
+ start_recv_top = start_recv_bottom + bottom_size
+
+ return
+ end
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine error_norm(rms)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function computes the norm of the difference between the
+c computed solution and the exact solution
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer c, i, j, k, m, ii, jj, kk, d, error
+ double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5),
+ > add
+
+ do m = 1, 5
+ rms_work(m) = 0.0d0
+ end do
+
+ do c = 1, ncells
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ ii = 0
+ do i = cell_low(1,c), cell_high(1,c)
+ xi = dble(i) * dnxm1
+ call exact_solution(xi, eta, zeta, u_exact)
+
+ do m = 1, 5
+ add = u(ii,jj,kk,m,c)-u_exact(m)
+ rms_work(m) = rms_work(m) + add*add
+ end do
+ ii = ii + 1
+ end do
+ jj = jj + 1
+ end do
+ kk = kk + 1
+ end do
+ end do
+
+ call mpi_allreduce(rms_work, rms, 5, dp_type,
+ > MPI_SUM, comm_setup, error)
+
+ do m = 1, 5
+ do d = 1, 3
+ rms(m) = rms(m) / dble(grid_points(d)-2)
+ end do
+ rms(m) = dsqrt(rms(m))
+ end do
+
+ return
+ end
+
+
+
+ subroutine rhs_norm(rms)
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer c, i, j, k, d, m, error
+ double precision rms(5), rms_work(5), add
+
+ do m = 1, 5
+ rms_work(m) = 0.0d0
+ end do
+
+ do c = 1, ncells
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ do m = 1, 5
+ add = rhs(i,j,k,m,c)
+ rms_work(m) = rms_work(m) + add*add
+ end do
+ end do
+ end do
+ end do
+ end do
+
+
+
+ call mpi_allreduce(rms_work, rms, 5, dp_type,
+ > MPI_SUM, comm_setup, error)
+
+ do m = 1, 5
+ do d = 1, 3
+ rms(m) = rms(m) / dble(grid_points(d)-2)
+ end do
+ rms(m) = dsqrt(rms(m))
+ end do
+
+ return
+ end
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine exact_rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ double precision dtemp(5), xi, eta, zeta, dtpp
+ integer c, m, i, j, k, ip1, im1, jp1,
+ > jm1, km1, kp1
+
+c---------------------------------------------------------------------
+c loop over all cells owned by this node
+c---------------------------------------------------------------------
+ do c = 1, ncells
+
+c---------------------------------------------------------------------
+c initialize
+c---------------------------------------------------------------------
+ do m = 1, 5
+ do k= 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ forcing(i,j,k,m,c) = 0.0d0
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c xi-direction flux differences
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ zeta = dble(k+cell_low(3,c)) * dnzm1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ eta = dble(j+cell_low(2,c)) * dnym1
+
+ do i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c)
+ xi = dble(i+cell_low(1,c)) * dnxm1
+
+ call exact_solution(xi, eta, zeta, dtemp)
+ do m = 1, 5
+ ue(i,m) = dtemp(m)
+ end do
+
+ dtpp = 1.0d0 / dtemp(1)
+
+ do m = 2, 5
+ buf(i,m) = dtpp * dtemp(m)
+ end do
+
+ cuf(i) = buf(i,2) * buf(i,2)
+ buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) +
+ > buf(i,4) * buf(i,4)
+ q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) +
+ > buf(i,4)*ue(i,4))
+
+ end do
+
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ im1 = i-1
+ ip1 = i+1
+
+ forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
+ > tx2*( ue(ip1,2)-ue(im1,2) )+
+ > dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1))
+
+ forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tx2 * (
+ > (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))-
+ > (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+
+ > xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+
+ > dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2))
+
+ forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tx2 * (
+ > ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+
+ > xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+
+ > dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3))
+
+ forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tx2*(
+ > ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+
+ > xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+
+ > dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4))
+
+ forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tx2*(
+ > buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))-
+ > buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+
+ > 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+
+ > buf(im1,1))+
+ > xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+
+ > xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+
+ > dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5))
+ end do
+
+c---------------------------------------------------------------------
+c Fourth-order dissipation
+c---------------------------------------------------------------------
+ if (start(1,c) .gt. 0) then
+ do m = 1, 5
+ i = 1
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m))
+ i = 2
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) -
+ > 4.0d0*ue(i+1,m) + ue(i+2,m))
+ end do
+ endif
+
+ do m = 1, 5
+ do i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
+ > (ue(i-2,m) - 4.0d0*ue(i-1,m) +
+ > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m))
+ end do
+ end do
+
+ if (end(1,c) .gt. 0) then
+ do m = 1, 5
+ i = cell_size(1,c)-3
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (ue(i-2,m) - 4.0d0*ue(i-1,m) +
+ > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m))
+ i = cell_size(1,c)-2
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m))
+ end do
+ endif
+
+ end do
+ end do
+c---------------------------------------------------------------------
+c eta-direction flux differences
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ zeta = dble(k+cell_low(3,c)) * dnzm1
+ do i=start(1,c), cell_size(1,c)-end(1,c)-1
+ xi = dble(i+cell_low(1,c)) * dnxm1
+
+ do j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c)
+ eta = dble(j+cell_low(2,c)) * dnym1
+
+ call exact_solution(xi, eta, zeta, dtemp)
+ do m = 1, 5
+ ue(j,m) = dtemp(m)
+ end do
+ dtpp = 1.0d0/dtemp(1)
+
+ do m = 2, 5
+ buf(j,m) = dtpp * dtemp(m)
+ end do
+
+ cuf(j) = buf(j,3) * buf(j,3)
+ buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) +
+ > buf(j,4) * buf(j,4)
+ q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) +
+ > buf(j,4)*ue(j,4))
+ end do
+
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ jm1 = j-1
+ jp1 = j+1
+
+ forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
+ > ty2*( ue(jp1,3)-ue(jm1,3) )+
+ > dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1))
+
+ forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - ty2*(
+ > ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+
+ > yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+
+ > dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2))
+
+ forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - ty2*(
+ > (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))-
+ > (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+
+ > yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+
+ > dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3))
+
+ forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - ty2*(
+ > ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+
+ > yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+
+ > dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4))
+
+ forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - ty2*(
+ > buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))-
+ > buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+
+ > 0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+
+ > buf(jm1,1))+
+ > yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+
+ > yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+
+ > dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5))
+ end do
+
+c---------------------------------------------------------------------
+c Fourth-order dissipation
+c---------------------------------------------------------------------
+ if (start(2,c) .gt. 0) then
+ do m = 1, 5
+ j = 1
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m))
+ j = 2
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) -
+ > 4.0d0*ue(j+1,m) + ue(j+2,m))
+ end do
+ endif
+
+ do m = 1, 5
+ do j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
+ > (ue(j-2,m) - 4.0d0*ue(j-1,m) +
+ > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m))
+ end do
+ end do
+ if (end(2,c) .gt. 0) then
+ do m = 1, 5
+ j = cell_size(2,c)-3
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (ue(j-2,m) - 4.0d0*ue(j-1,m) +
+ > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m))
+ j = cell_size(2,c)-2
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m))
+
+ end do
+ endif
+
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c zeta-direction flux differences
+c---------------------------------------------------------------------
+ do j=start(2,c), cell_size(2,c)-end(2,c)-1
+ eta = dble(j+cell_low(2,c)) * dnym1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ xi = dble(i+cell_low(1,c)) * dnxm1
+
+ do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c)
+ zeta = dble(k+cell_low(3,c)) * dnzm1
+
+ call exact_solution(xi, eta, zeta, dtemp)
+ do m = 1, 5
+ ue(k,m) = dtemp(m)
+ end do
+
+ dtpp = 1.0d0/dtemp(1)
+
+ do m = 2, 5
+ buf(k,m) = dtpp * dtemp(m)
+ end do
+
+ cuf(k) = buf(k,4) * buf(k,4)
+ buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) +
+ > buf(k,3) * buf(k,3)
+ q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) +
+ > buf(k,4)*ue(k,4))
+ end do
+
+ do k=start(3,c), cell_size(3,c)-end(3,c)-1
+ km1 = k-1
+ kp1 = k+1
+
+ forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
+ > tz2*( ue(kp1,4)-ue(km1,4) )+
+ > dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1))
+
+ forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tz2 * (
+ > ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+
+ > zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+
+ > dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2))
+
+ forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tz2 * (
+ > ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+
+ > zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+
+ > dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3))
+
+ forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tz2 * (
+ > (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))-
+ > (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+
+ > zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+
+ > dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4))
+
+ forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tz2 * (
+ > buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))-
+ > buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+
+ > 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1)
+ > +buf(km1,1))+
+ > zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+
+ > zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+
+ > dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5))
+ end do
+
+c---------------------------------------------------------------------
+c Fourth-order dissipation
+c---------------------------------------------------------------------
+ if (start(3,c) .gt. 0) then
+ do m = 1, 5
+ k = 1
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m))
+ k = 2
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) -
+ > 4.0d0*ue(k+1,m) + ue(k+2,m))
+ end do
+ endif
+
+ do m = 1, 5
+ do k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
+ > (ue(k-2,m) - 4.0d0*ue(k-1,m) +
+ > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m))
+ end do
+ end do
+
+ if (end(3,c) .gt. 0) then
+ do m = 1, 5
+ k = cell_size(3,c)-3
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (ue(k-2,m) - 4.0d0*ue(k-1,m) +
+ > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m))
+ k = cell_size(3,c)-2
+ forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
+ > (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m))
+ end do
+ endif
+
+ end do
+ end do
+c---------------------------------------------------------------------
+c now change the sign of the forcing function,
+c---------------------------------------------------------------------
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ forcing(i,j,k,m,c) = -1.d0 * forcing(i,j,k,m,c)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c cell loop
+c---------------------------------------------------------------------
+ end do
+
+ return
+ end
+
+
+
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine exact_solution(xi,eta,zeta,dtemp)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function returns the exact solution at point xi, eta, zeta
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ double precision xi, eta, zeta, dtemp(5)
+ integer m
+
+ do m = 1, 5
+ dtemp(m) = ce(m,1) +
+ > xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) +
+ > eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+
+ > zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) +
+ > zeta*ce(m,13))))
+ end do
+
+ return
+ end
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+
+c---------------------------------------------------------------------
+c The following include file is generated automatically by the
+c "setparams" utility. It defines
+c maxcells: the square root of the maximum number of processors
+c problem_size: 12, 64, 102, 162 (for class T, A, B, C)
+c dt_default: default time step for this problem size if no
+c config file
+c niter_default: default number of iterations for this problem size
+c---------------------------------------------------------------------
+
+ include 'npbparams.h'
+
+ integer ncells, grid_points(3)
+ common /global/ ncells, grid_points
+
+ double precision tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
+ > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4,
+ > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt,
+ > ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2,
+ > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+ > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+ > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+ > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1,
+ > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1,
+ > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2,
+ > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+ > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1,
+ > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6,
+ > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
+
+ common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
+ > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4,
+ > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt,
+ > ce, dxmax, dymax, dzmax, xxcon1, xxcon2,
+ > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+ > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+ > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+ > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1,
+ > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1,
+ > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2,
+ > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+ > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1,
+ > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6,
+ > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
+
+ integer EAST, WEST, NORTH, SOUTH,
+ > BOTTOM, TOP
+
+ parameter (EAST=2000, WEST=3000, NORTH=4000, SOUTH=5000,
+ > BOTTOM=6000, TOP=7000)
+
+ integer cell_coord (3,maxcells), cell_low (3,maxcells),
+ > cell_high (3,maxcells), cell_size(3,maxcells),
+ > predecessor(3), slice (3,maxcells),
+ > grid_size (3), successor(3),
+ > start (3,maxcells), end (3,maxcells)
+ common /partition/ cell_coord, cell_low, cell_high, cell_size,
+ > grid_size, successor, predecessor, slice,
+ > start, end
+
+ integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE, IMAXP, JMAXP
+
+ parameter (MAX_CELL_DIM = (problem_size/maxcells)+1)
+
+ parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM)
+ parameter (IMAXP=IMAX/2*2+1,JMAXP=JMAX/2*2+1)
+
+c---------------------------------------------------------------------
+c +1 at end to avoid zero length arrays for 1 node
+c---------------------------------------------------------------------
+ parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60*2+1)
+
+ double precision
+ > u (-2:IMAXP+1,-2:JMAXP+1,-2:KMAX+1, 5,maxcells),
+ > us (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > vs (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > ws (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > qs (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > ainv (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > rho_i (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > speed (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > square (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
+ > rhs ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells),
+ > forcing ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells),
+ > lhs ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1,15,maxcells),
+ > in_buffer(BUF_SIZE), out_buffer(BUF_SIZE)
+ common /fields/ u, us, vs, ws, qs, ainv, rho_i, speed, square,
+ > rhs, forcing, lhs, in_buffer, out_buffer
+
+ double precision cv(-2:MAX_CELL_DIM+1), rhon(-2:MAX_CELL_DIM+1),
+ > rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1),
+ > cuf(-2:MAX_CELL_DIM+1), q(-2:MAX_CELL_DIM+1),
+ > ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5)
+ common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf
+
+ integer west_size, east_size, bottom_size, top_size,
+ > north_size, south_size, start_send_west,
+ > start_send_east, start_send_south, start_send_north,
+ > start_send_bottom, start_send_top, start_recv_west,
+ > start_recv_east, start_recv_south, start_recv_north,
+ > start_recv_bottom, start_recv_top
+ common /box/ west_size, east_size, bottom_size,
+ > top_size, north_size, south_size,
+ > start_send_west, start_send_east, start_send_south,
+ > start_send_north, start_send_bottom, start_send_top,
+ > start_recv_west, start_recv_east, start_recv_south,
+ > start_recv_north, start_recv_bottom, start_recv_top
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine initialize
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This subroutine initializes the field variable u using
+c tri-linear transfinite interpolation of the boundary values
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer c, i, j, k, m, ii, jj, kk, ix, iy, iz
+ double precision xi, eta, zeta, Pface(5,3,2), Pxi, Peta,
+ > Pzeta, temp(5)
+
+
+c---------------------------------------------------------------------
+c Later (in compute_rhs) we compute 1/u for every element. A few of
+c the corner elements are not used, but it convenient (and faster)
+c to compute the whole thing with a simple loop. Make sure those
+c values are nonzero by initializing the whole thing here.
+c---------------------------------------------------------------------
+ do c = 1, ncells
+ do kk = -1, IMAX
+ do jj = -1, IMAX
+ do ii = -1, IMAX
+ u(ii, jj, kk, 1, c) = 1.0
+ u(ii, jj, kk, 2, c) = 0.0
+ u(ii, jj, kk, 3, c) = 0.0
+ u(ii, jj, kk, 4, c) = 0.0
+ u(ii, jj, kk, 5, c) = 1.0
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c first store the "interpolated" values everywhere on the grid
+c---------------------------------------------------------------------
+ do c=1, ncells
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ ii = 0
+ do i = cell_low(1,c), cell_high(1,c)
+ xi = dble(i) * dnxm1
+
+ do ix = 1, 2
+ call exact_solution(dble(ix-1), eta, zeta,
+ > Pface(1,1,ix))
+ end do
+
+ do iy = 1, 2
+ call exact_solution(xi, dble(iy-1) , zeta,
+ > Pface(1,2,iy))
+ end do
+
+ do iz = 1, 2
+ call exact_solution(xi, eta, dble(iz-1),
+ > Pface(1,3,iz))
+ end do
+
+ do m = 1, 5
+ Pxi = xi * Pface(m,1,2) +
+ > (1.0d0-xi) * Pface(m,1,1)
+ Peta = eta * Pface(m,2,2) +
+ > (1.0d0-eta) * Pface(m,2,1)
+ Pzeta = zeta * Pface(m,3,2) +
+ > (1.0d0-zeta) * Pface(m,3,1)
+
+ u(ii,jj,kk,m,c) = Pxi + Peta + Pzeta -
+ > Pxi*Peta - Pxi*Pzeta - Peta*Pzeta +
+ > Pxi*Peta*Pzeta
+
+ end do
+ ii = ii + 1
+ end do
+ jj = jj + 1
+ end do
+ kk = kk+1
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c now store the exact values on the boundaries
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c west face
+c---------------------------------------------------------------------
+ c = slice(1,1)
+ ii = 0
+ xi = 0.0d0
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(ii,jj,kk,m,c) = temp(m)
+ end do
+ jj = jj + 1
+ end do
+ kk = kk + 1
+ end do
+
+c---------------------------------------------------------------------
+c east face
+c---------------------------------------------------------------------
+ c = slice(1,ncells)
+ ii = cell_size(1,c)-1
+ xi = 1.0d0
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(ii,jj,kk,m,c) = temp(m)
+ end do
+ jj = jj + 1
+ end do
+ kk = kk + 1
+ end do
+
+c---------------------------------------------------------------------
+c south face
+c---------------------------------------------------------------------
+ c = slice(2,1)
+ jj = 0
+ eta = 0.0d0
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ ii = 0
+ do i = cell_low(1,c), cell_high(1,c)
+ xi = dble(i) * dnxm1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(ii,jj,kk,m,c) = temp(m)
+ end do
+ ii = ii + 1
+ end do
+ kk = kk + 1
+ end do
+
+
+c---------------------------------------------------------------------
+c north face
+c---------------------------------------------------------------------
+ c = slice(2,ncells)
+ jj = cell_size(2,c)-1
+ eta = 1.0d0
+ kk = 0
+ do k = cell_low(3,c), cell_high(3,c)
+ zeta = dble(k) * dnzm1
+ ii = 0
+ do i = cell_low(1,c), cell_high(1,c)
+ xi = dble(i) * dnxm1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(ii,jj,kk,m,c) = temp(m)
+ end do
+ ii = ii + 1
+ end do
+ kk = kk + 1
+ end do
+
+c---------------------------------------------------------------------
+c bottom face
+c---------------------------------------------------------------------
+ c = slice(3,1)
+ kk = 0
+ zeta = 0.0d0
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ ii = 0
+ do i =cell_low(1,c), cell_high(1,c)
+ xi = dble(i) *dnxm1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(ii,jj,kk,m,c) = temp(m)
+ end do
+ ii = ii + 1
+ end do
+ jj = jj + 1
+ end do
+
+c---------------------------------------------------------------------
+c top face
+c---------------------------------------------------------------------
+ c = slice(3,ncells)
+ kk = cell_size(3,c)-1
+ zeta = 1.0d0
+ jj = 0
+ do j = cell_low(2,c), cell_high(2,c)
+ eta = dble(j) * dnym1
+ ii = 0
+ do i =cell_low(1,c), cell_high(1,c)
+ xi = dble(i) * dnxm1
+ call exact_solution(xi, eta, zeta, temp)
+ do m = 1, 5
+ u(ii,jj,kk,m,c) = temp(m)
+ end do
+ ii = ii + 1
+ end do
+ jj = jj + 1
+ end do
+
+ return
+ end
+
+
+ subroutine lhsinit
+
+ include 'header.h'
+
+ integer i, j, k, d, c, n
+
+c---------------------------------------------------------------------
+c loop over all cells
+c---------------------------------------------------------------------
+ do c = 1, ncells
+
+c---------------------------------------------------------------------
+c first, initialize the start and end arrays
+c---------------------------------------------------------------------
+ do d = 1, 3
+ if (cell_coord(d,c) .eq. 1) then
+ start(d,c) = 1
+ else
+ start(d,c) = 0
+ endif
+ if (cell_coord(d,c) .eq. ncells) then
+ end(d,c) = 1
+ else
+ end(d,c) = 0
+ endif
+ end do
+
+c---------------------------------------------------------------------
+c zap the whole left hand side for starters
+c---------------------------------------------------------------------
+ do n = 1, 15
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ lhs(i,j,k,n,c) = 0.0d0
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c next, set all diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+ do n = 1, 3
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ lhs(i,j,k,5*n-2,c) = 1.0d0
+ end do
+ end do
+ end do
+ end do
+
+ end do
+
+ return
+ end
+
+
+
--- /dev/null
+400 number of time steps
+0.0015d0 dt for class A = 0.0015d0. class B = 0.001d0 class C = 0.00067d0
+64 64 64
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine lhsx(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This function computes the left hand side for the three x-factors
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ double precision ru1
+ integer i, j, k, c
+
+
+c---------------------------------------------------------------------
+c treat only cell c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c first fill the lhs for the u-eigenvalue
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c)-1, cell_size(1,c)-end(1,c)
+ ru1 = c3c4*rho_i(i,j,k,c)
+ cv(i) = us(i,j,k,c)
+ rhon(i) = dmax1(dx2+con43*ru1,
+ > dx5+c1c5*ru1,
+ > dxmax+ru1,
+ > dx1)
+ end do
+
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ lhs(i,j,k,1,c) = 0.0d0
+ lhs(i,j,k,2,c) = - dttx2 * cv(i-1) - dttx1 * rhon(i-1)
+ lhs(i,j,k,3,c) = 1.0d0 + c2dttx1 * rhon(i)
+ lhs(i,j,k,4,c) = dttx2 * cv(i+1) - dttx1 * rhon(i+1)
+ lhs(i,j,k,5,c) = 0.0d0
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c add fourth order dissipation
+c---------------------------------------------------------------------
+ if (start(1,c) .gt. 0) then
+ i = 1
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
+ lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+ lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+
+ lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4
+ lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz6
+ lhs(i+1,j,k,4,c) = lhs(i+1,j,k,4,c) - comz4
+ lhs(i+1,j,k,5,c) = lhs(i+1,j,k,5,c) + comz1
+ end do
+ end do
+ endif
+
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i=3*start(1,c), cell_size(1,c)-3*end(1,c)-1
+ lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+ lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+ lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+ lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+ lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+ end do
+ end do
+ end do
+
+ if (end(1,c) .gt. 0) then
+ i = cell_size(1,c)-3
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+ lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+ lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+ lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+
+ lhs(i+1,j,k,1,c) = lhs(i+1,j,k,1,c) + comz1
+ lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4
+ lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz5
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c subsequently, fill the other factors (u+c), (u-c) by a4ing to
+c the first
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c)
+ lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) -
+ > dttx2 * speed(i-1,j,k,c)
+ lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c)
+ lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) +
+ > dttx2 * speed(i+1,j,k,c)
+ lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
+ lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
+ lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) +
+ > dttx2 * speed(i-1,j,k,c)
+ lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
+ lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) -
+ > dttx2 * speed(i+1,j,k,c)
+ lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
+ end do
+ end do
+ end do
+
+ return
+ end
+
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine lhsy(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This function computes the left hand side for the three y-factors
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ double precision ru1
+ integer i, j, k, c
+
+c---------------------------------------------------------------------
+c treat only cell c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c first fill the lhs for the u-eigenvalue
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+ do j = start(2,c)-1, cell_size(2,c)-end(2,c)
+ ru1 = c3c4*rho_i(i,j,k,c)
+ cv(j) = vs(i,j,k,c)
+ rhoq(j) = dmax1( dy3 + con43 * ru1,
+ > dy5 + c1c5*ru1,
+ > dymax + ru1,
+ > dy1)
+ end do
+
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ lhs(i,j,k,1,c) = 0.0d0
+ lhs(i,j,k,2,c) = -dtty2 * cv(j-1) - dtty1 * rhoq(j-1)
+ lhs(i,j,k,3,c) = 1.0 + c2dtty1 * rhoq(j)
+ lhs(i,j,k,4,c) = dtty2 * cv(j+1) - dtty1 * rhoq(j+1)
+ lhs(i,j,k,5,c) = 0.0d0
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c add fourth order dissipation
+c---------------------------------------------------------------------
+ if (start(2,c) .gt. 0) then
+ j = 1
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+ lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
+ lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+ lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+
+ lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4
+ lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz6
+ lhs(i,j+1,k,4,c) = lhs(i,j+1,k,4,c) - comz4
+ lhs(i,j+1,k,5,c) = lhs(i,j+1,k,5,c) + comz1
+ end do
+ end do
+ endif
+
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j=3*start(2,c), cell_size(2,c)-3*end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+ lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+ lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+ lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+ lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+ lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+ end do
+ end do
+ end do
+
+ if (end(2,c) .gt. 0) then
+ j = cell_size(2,c)-3
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+ lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+ lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+ lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+
+ lhs(i,j+1,k,1,c) = lhs(i,j+1,k,1,c) + comz1
+ lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4
+ lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz5
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c subsequently, do the other two factors
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c)
+ lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) -
+ > dtty2 * speed(i,j-1,k,c)
+ lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c)
+ lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) +
+ > dtty2 * speed(i,j+1,k,c)
+ lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
+ lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
+ lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) +
+ > dtty2 * speed(i,j-1,k,c)
+ lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
+ lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) -
+ > dtty2 * speed(i,j+1,k,c)
+ lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
+ end do
+ end do
+ end do
+
+ return
+ end
+
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine lhsz(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This function computes the left hand side for the three z-factors
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ double precision ru1
+ integer i, j, k, c
+
+c---------------------------------------------------------------------
+c treat only cell c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c first fill the lhs for the u-eigenvalue
+c---------------------------------------------------------------------
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+ do k = start(3,c)-1, cell_size(3,c)-end(3,c)
+ ru1 = c3c4*rho_i(i,j,k,c)
+ cv(k) = ws(i,j,k,c)
+ rhos(k) = dmax1(dz4 + con43 * ru1,
+ > dz5 + c1c5 * ru1,
+ > dzmax + ru1,
+ > dz1)
+ end do
+
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ lhs(i,j,k,1,c) = 0.0d0
+ lhs(i,j,k,2,c) = -dttz2 * cv(k-1) - dttz1 * rhos(k-1)
+ lhs(i,j,k,3,c) = 1.0 + c2dttz1 * rhos(k)
+ lhs(i,j,k,4,c) = dttz2 * cv(k+1) - dttz1 * rhos(k+1)
+ lhs(i,j,k,5,c) = 0.0d0
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c add fourth order dissipation
+c---------------------------------------------------------------------
+ if (start(3,c) .gt. 0) then
+ k = 1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
+ lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+ lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+
+ lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4
+ lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz6
+ lhs(i,j,k+1,4,c) = lhs(i,j,k+1,4,c) - comz4
+ lhs(i,j,k+1,5,c) = lhs(i,j,k+1,5,c) + comz1
+ end do
+ end do
+ endif
+
+ do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+ lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+ lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+ lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+ lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
+ end do
+ end do
+ end do
+
+ if (end(3,c) .gt. 0) then
+ k = cell_size(3,c)-3
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
+ lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
+ lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
+ lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
+
+ lhs(i,j,k+1,1,c) = lhs(i,j,k+1,1,c) + comz1
+ lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4
+ lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz5
+ end do
+ end do
+ endif
+
+
+c---------------------------------------------------------------------
+c subsequently, fill the other factors (u+c), (u-c)
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c)
+ lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) -
+ > dttz2 * speed(i,j,k-1,c)
+ lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c)
+ lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) +
+ > dttz2 * speed(i,j,k+1,c)
+ lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
+ lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
+ lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) +
+ > dttz2 * speed(i,j,k-1,c)
+ lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
+ lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) -
+ > dttz2 * speed(i,j,k+1,c)
+ lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
+ end do
+ end do
+ end do
+
+ return
+ end
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine make_set
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This function allocates space for a set of cells and fills the set
+c such that communication between cells on different nodes is only
+c nearest neighbor
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer p, i, j, c, dir, size, excess, ierr,ierrcode
+
+c---------------------------------------------------------------------
+c compute square root; add small number to allow for roundoff
+c (note: this is computed in setup_mpi.f also, but prefer to do
+c it twice because of some include file problems).
+c---------------------------------------------------------------------
+ ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0))
+
+c---------------------------------------------------------------------
+c this makes coding easier
+c---------------------------------------------------------------------
+ p = ncells
+
+c---------------------------------------------------------------------
+c determine the location of the cell at the bottom of the 3D
+c array of cells
+c---------------------------------------------------------------------
+ cell_coord(1,1) = mod(node,p)
+ cell_coord(2,1) = node/p
+ cell_coord(3,1) = 0
+
+c---------------------------------------------------------------------
+c set the cell_coords for cells in the rest of the z-layers;
+c this comes down to a simple linear numbering in the z-direct-
+c ion, and to the doubly-cyclic numbering in the other dirs
+c---------------------------------------------------------------------
+ do c=2, p
+ cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p)
+ cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p)
+ cell_coord(3,c) = c-1
+ end do
+
+c---------------------------------------------------------------------
+c offset all the coordinates by 1 to adjust for Fortran arrays
+c---------------------------------------------------------------------
+ do dir = 1, 3
+ do c = 1, p
+ cell_coord(dir,c) = cell_coord(dir,c) + 1
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c slice(dir,n) contains the sequence number of the cell that is in
+c coordinate plane n in the dir direction
+c---------------------------------------------------------------------
+ do dir = 1, 3
+ do c = 1, p
+ slice(dir,cell_coord(dir,c)) = c
+ end do
+ end do
+
+
+c---------------------------------------------------------------------
+c fill the predecessor and successor entries, using the indices
+c of the bottom cells (they are the same at each level of k
+c anyway) acting as if full periodicity pertains; note that p is
+c added to those arguments to the mod functions that might
+c otherwise return wrong values when using the modulo function
+c---------------------------------------------------------------------
+ i = cell_coord(1,1)-1
+ j = cell_coord(2,1)-1
+
+ predecessor(1) = mod(i-1+p,p) + p*j
+ predecessor(2) = i + p*mod(j-1+p,p)
+ predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p)
+ successor(1) = mod(i+1,p) + p*j
+ successor(2) = i + p*mod(j+1,p)
+ successor(3) = mod(i-1+p,p) + p*mod(j+1,p)
+
+c---------------------------------------------------------------------
+c now compute the sizes of the cells
+c---------------------------------------------------------------------
+ do dir= 1, 3
+c---------------------------------------------------------------------
+c set cell_coord range for each direction
+c---------------------------------------------------------------------
+ size = grid_points(dir)/p
+ excess = mod(grid_points(dir),p)
+ do c=1, ncells
+ if (cell_coord(dir,c) .le. excess) then
+ cell_size(dir,c) = size+1
+ cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1)
+ cell_high(dir,c) = cell_low(dir,c)+size
+ else
+ cell_size(dir,c) = size
+ cell_low(dir,c) = excess*(size+1)+
+ > (cell_coord(dir,c)-excess-1)*size
+ cell_high(dir,c) = cell_low(dir,c)+size-1
+ endif
+ if (cell_size(dir, c) .le. 2) then
+ write(*,50)
+ 50 format(' Error: Cell size too small. Min size is 3')
+ call MPI_Abort(mpi_comm_world,ierrcode,ierr)
+ stop
+ endif
+ end do
+ end do
+
+ return
+ end
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'mpif.h'
+
+ integer node, no_nodes, total_nodes, root, comm_setup,
+ > comm_solve, comm_rhs, dp_type
+ logical active
+ common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup,
+ > comm_solve, comm_rhs, dp_type, active
+ integer DEFAULT_TAG
+ parameter (DEFAULT_TAG = 0)
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine ninvr(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c block-diagonal matrix-vector multiplication
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer c, i, j, k
+ double precision r1, r2, r3, r4, r5, t1, t2
+
+c---------------------------------------------------------------------
+c treat only one cell
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+ r1 = rhs(i,j,k,1,c)
+ r2 = rhs(i,j,k,2,c)
+ r3 = rhs(i,j,k,3,c)
+ r4 = rhs(i,j,k,4,c)
+ r5 = rhs(i,j,k,5,c)
+
+ t1 = bt * r3
+ t2 = 0.5d0 * ( r4 + r5 )
+
+ rhs(i,j,k,1,c) = -r2
+ rhs(i,j,k,2,c) = r1
+ rhs(i,j,k,3,c) = bt * ( r4 - r5 )
+ rhs(i,j,k,4,c) = -t1 + t2
+ rhs(i,j,k,5,c) = t1 + t2
+ enddo
+ enddo
+ enddo
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine pinvr(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c block-diagonal matrix-vector multiplication
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer i, j, k, c
+ double precision r1, r2, r3, r4, r5, t1, t2
+
+c---------------------------------------------------------------------
+c treat only one cell
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+ r1 = rhs(i,j,k,1,c)
+ r2 = rhs(i,j,k,2,c)
+ r3 = rhs(i,j,k,3,c)
+ r4 = rhs(i,j,k,4,c)
+ r5 = rhs(i,j,k,5,c)
+
+ t1 = bt * r1
+ t2 = 0.5d0 * ( r4 + r5 )
+
+ rhs(i,j,k,1,c) = bt * ( r4 - r5 )
+ rhs(i,j,k,2,c) = -r3
+ rhs(i,j,k,3,c) = r2
+ rhs(i,j,k,4,c) = -t1 + t2
+ rhs(i,j,k,5,c) = t1 + t2
+ end do
+ end do
+ end do
+
+ return
+ end
+
+
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine compute_rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer c, i, j, k, m
+ double precision aux, rho_inv, uijk, up1, um1, vijk, vp1, vm1,
+ > wijk, wp1, wm1
+
+
+c---------------------------------------------------------------------
+c loop over all cells owned by this node
+c---------------------------------------------------------------------
+ do c = 1, ncells
+
+c---------------------------------------------------------------------
+c compute the reciprocal of density, and the kinetic energy,
+c and the speed of sound.
+c---------------------------------------------------------------------
+
+ do k = -1, cell_size(3,c)
+ do j = -1, cell_size(2,c)
+ do i = -1, cell_size(1,c)
+ rho_inv = 1.0d0/u(i,j,k,1,c)
+ rho_i(i,j,k,c) = rho_inv
+ us(i,j,k,c) = u(i,j,k,2,c) * rho_inv
+ vs(i,j,k,c) = u(i,j,k,3,c) * rho_inv
+ ws(i,j,k,c) = u(i,j,k,4,c) * rho_inv
+ square(i,j,k,c) = 0.5d0* (
+ > u(i,j,k,2,c)*u(i,j,k,2,c) +
+ > u(i,j,k,3,c)*u(i,j,k,3,c) +
+ > u(i,j,k,4,c)*u(i,j,k,4,c) ) * rho_inv
+ qs(i,j,k,c) = square(i,j,k,c) * rho_inv
+c---------------------------------------------------------------------
+c (don't need speed and ainx until the lhs computation)
+c---------------------------------------------------------------------
+ aux = c1c2*rho_inv* (u(i,j,k,5,c) - square(i,j,k,c))
+ aux = dsqrt(aux)
+ speed(i,j,k,c) = aux
+ ainv(i,j,k,c) = 1.0d0/aux
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c copy the exact forcing term to the right hand side; because
+c this forcing term is known, we can store it on the whole of every
+c cell, including the boundary
+c---------------------------------------------------------------------
+
+ do m = 1, 5
+ do k = 0, cell_size(3,c)-1
+ do j = 0, cell_size(2,c)-1
+ do i = 0, cell_size(1,c)-1
+ rhs(i,j,k,m,c) = forcing(i,j,k,m,c)
+ end do
+ end do
+ end do
+ end do
+
+
+c---------------------------------------------------------------------
+c compute xi-direction fluxes
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ uijk = us(i,j,k,c)
+ up1 = us(i+1,j,k,c)
+ um1 = us(i-1,j,k,c)
+
+ rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dx1tx1 *
+ > (u(i+1,j,k,1,c) - 2.0d0*u(i,j,k,1,c) +
+ > u(i-1,j,k,1,c)) -
+ > tx2 * (u(i+1,j,k,2,c) - u(i-1,j,k,2,c))
+
+ rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dx2tx1 *
+ > (u(i+1,j,k,2,c) - 2.0d0*u(i,j,k,2,c) +
+ > u(i-1,j,k,2,c)) +
+ > xxcon2*con43 * (up1 - 2.0d0*uijk + um1) -
+ > tx2 * (u(i+1,j,k,2,c)*up1 -
+ > u(i-1,j,k,2,c)*um1 +
+ > (u(i+1,j,k,5,c)- square(i+1,j,k,c)-
+ > u(i-1,j,k,5,c)+ square(i-1,j,k,c))*
+ > c2)
+
+ rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dx3tx1 *
+ > (u(i+1,j,k,3,c) - 2.0d0*u(i,j,k,3,c) +
+ > u(i-1,j,k,3,c)) +
+ > xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) +
+ > vs(i-1,j,k,c)) -
+ > tx2 * (u(i+1,j,k,3,c)*up1 -
+ > u(i-1,j,k,3,c)*um1)
+
+ rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dx4tx1 *
+ > (u(i+1,j,k,4,c) - 2.0d0*u(i,j,k,4,c) +
+ > u(i-1,j,k,4,c)) +
+ > xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) +
+ > ws(i-1,j,k,c)) -
+ > tx2 * (u(i+1,j,k,4,c)*up1 -
+ > u(i-1,j,k,4,c)*um1)
+
+ rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dx5tx1 *
+ > (u(i+1,j,k,5,c) - 2.0d0*u(i,j,k,5,c) +
+ > u(i-1,j,k,5,c)) +
+ > xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) +
+ > qs(i-1,j,k,c)) +
+ > xxcon4 * (up1*up1 - 2.0d0*uijk*uijk +
+ > um1*um1) +
+ > xxcon5 * (u(i+1,j,k,5,c)*rho_i(i+1,j,k,c) -
+ > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
+ > u(i-1,j,k,5,c)*rho_i(i-1,j,k,c)) -
+ > tx2 * ( (c1*u(i+1,j,k,5,c) -
+ > c2*square(i+1,j,k,c))*up1 -
+ > (c1*u(i-1,j,k,5,c) -
+ > c2*square(i-1,j,k,c))*um1 )
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c add fourth order xi-direction dissipation
+c---------------------------------------------------------------------
+ if (start(1,c) .gt. 0) then
+ i = 1
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp *
+ > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) +
+ > u(i+2,j,k,m,c))
+ end do
+ end do
+ end do
+
+ i = 2
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > (-4.0d0*u(i-1,j,k,m,c) + 6.0d0*u(i,j,k,m,c) -
+ > 4.0d0*u(i+1,j,k,m,c) + u(i+2,j,k,m,c))
+ end do
+ end do
+ end do
+ endif
+
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > ( u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) +
+ > 6.0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) +
+ > u(i+2,j,k,m,c) )
+ end do
+ end do
+ end do
+ end do
+
+
+ if (end(1,c) .gt. 0) then
+ i = cell_size(1,c)-3
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > ( u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) +
+ > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) )
+ end do
+ end do
+ end do
+
+ i = cell_size(1,c)-2
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > ( u(i-2,j,k,m,c) - 4.d0*u(i-1,j,k,m,c) +
+ > 5.d0*u(i,j,k,m,c) )
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c compute eta-direction fluxes
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ vijk = vs(i,j,k,c)
+ vp1 = vs(i,j+1,k,c)
+ vm1 = vs(i,j-1,k,c)
+ rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dy1ty1 *
+ > (u(i,j+1,k,1,c) - 2.0d0*u(i,j,k,1,c) +
+ > u(i,j-1,k,1,c)) -
+ > ty2 * (u(i,j+1,k,3,c) - u(i,j-1,k,3,c))
+ rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dy2ty1 *
+ > (u(i,j+1,k,2,c) - 2.0d0*u(i,j,k,2,c) +
+ > u(i,j-1,k,2,c)) +
+ > yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) +
+ > us(i,j-1,k,c)) -
+ > ty2 * (u(i,j+1,k,2,c)*vp1 -
+ > u(i,j-1,k,2,c)*vm1)
+ rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dy3ty1 *
+ > (u(i,j+1,k,3,c) - 2.0d0*u(i,j,k,3,c) +
+ > u(i,j-1,k,3,c)) +
+ > yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) -
+ > ty2 * (u(i,j+1,k,3,c)*vp1 -
+ > u(i,j-1,k,3,c)*vm1 +
+ > (u(i,j+1,k,5,c) - square(i,j+1,k,c) -
+ > u(i,j-1,k,5,c) + square(i,j-1,k,c))
+ > *c2)
+ rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dy4ty1 *
+ > (u(i,j+1,k,4,c) - 2.0d0*u(i,j,k,4,c) +
+ > u(i,j-1,k,4,c)) +
+ > yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) +
+ > ws(i,j-1,k,c)) -
+ > ty2 * (u(i,j+1,k,4,c)*vp1 -
+ > u(i,j-1,k,4,c)*vm1)
+ rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dy5ty1 *
+ > (u(i,j+1,k,5,c) - 2.0d0*u(i,j,k,5,c) +
+ > u(i,j-1,k,5,c)) +
+ > yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) +
+ > qs(i,j-1,k,c)) +
+ > yycon4 * (vp1*vp1 - 2.0d0*vijk*vijk +
+ > vm1*vm1) +
+ > yycon5 * (u(i,j+1,k,5,c)*rho_i(i,j+1,k,c) -
+ > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
+ > u(i,j-1,k,5,c)*rho_i(i,j-1,k,c)) -
+ > ty2 * ((c1*u(i,j+1,k,5,c) -
+ > c2*square(i,j+1,k,c)) * vp1 -
+ > (c1*u(i,j-1,k,5,c) -
+ > c2*square(i,j-1,k,c)) * vm1)
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c add fourth order eta-direction dissipation
+c---------------------------------------------------------------------
+ if (start(2,c) .gt. 0) then
+ j = 1
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp *
+ > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) +
+ > u(i,j+2,k,m,c))
+ end do
+ end do
+ end do
+
+ j = 2
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > (-4.0d0*u(i,j-1,k,m,c) + 6.0d0*u(i,j,k,m,c) -
+ > 4.0d0*u(i,j+1,k,m,c) + u(i,j+2,k,m,c))
+ end do
+ end do
+ end do
+ endif
+
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1
+ do i = start(1,c),cell_size(1,c)-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > ( u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) +
+ > 6.0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) +
+ > u(i,j+2,k,m,c) )
+ end do
+ end do
+ end do
+ end do
+
+ if (end(2,c) .gt. 0) then
+ j = cell_size(2,c)-3
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > ( u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) +
+ > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) )
+ end do
+ end do
+ end do
+
+ j = cell_size(2,c)-2
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > ( u(i,j-2,k,m,c) - 4.d0*u(i,j-1,k,m,c) +
+ > 5.d0*u(i,j,k,m,c) )
+ end do
+ end do
+ end do
+ endif
+
+
+c---------------------------------------------------------------------
+c compute zeta-direction fluxes
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ wijk = ws(i,j,k,c)
+ wp1 = ws(i,j,k+1,c)
+ wm1 = ws(i,j,k-1,c)
+
+ rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dz1tz1 *
+ > (u(i,j,k+1,1,c) - 2.0d0*u(i,j,k,1,c) +
+ > u(i,j,k-1,1,c)) -
+ > tz2 * (u(i,j,k+1,4,c) - u(i,j,k-1,4,c))
+ rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dz2tz1 *
+ > (u(i,j,k+1,2,c) - 2.0d0*u(i,j,k,2,c) +
+ > u(i,j,k-1,2,c)) +
+ > zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) +
+ > us(i,j,k-1,c)) -
+ > tz2 * (u(i,j,k+1,2,c)*wp1 -
+ > u(i,j,k-1,2,c)*wm1)
+ rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dz3tz1 *
+ > (u(i,j,k+1,3,c) - 2.0d0*u(i,j,k,3,c) +
+ > u(i,j,k-1,3,c)) +
+ > zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) +
+ > vs(i,j,k-1,c)) -
+ > tz2 * (u(i,j,k+1,3,c)*wp1 -
+ > u(i,j,k-1,3,c)*wm1)
+ rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dz4tz1 *
+ > (u(i,j,k+1,4,c) - 2.0d0*u(i,j,k,4,c) +
+ > u(i,j,k-1,4,c)) +
+ > zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) -
+ > tz2 * (u(i,j,k+1,4,c)*wp1 -
+ > u(i,j,k-1,4,c)*wm1 +
+ > (u(i,j,k+1,5,c) - square(i,j,k+1,c) -
+ > u(i,j,k-1,5,c) + square(i,j,k-1,c))
+ > *c2)
+ rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dz5tz1 *
+ > (u(i,j,k+1,5,c) - 2.0d0*u(i,j,k,5,c) +
+ > u(i,j,k-1,5,c)) +
+ > zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) +
+ > qs(i,j,k-1,c)) +
+ > zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk +
+ > wm1*wm1) +
+ > zzcon5 * (u(i,j,k+1,5,c)*rho_i(i,j,k+1,c) -
+ > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
+ > u(i,j,k-1,5,c)*rho_i(i,j,k-1,c)) -
+ > tz2 * ( (c1*u(i,j,k+1,5,c) -
+ > c2*square(i,j,k+1,c))*wp1 -
+ > (c1*u(i,j,k-1,5,c) -
+ > c2*square(i,j,k-1,c))*wm1)
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c add fourth order zeta-direction dissipation
+c---------------------------------------------------------------------
+ if (start(3,c) .gt. 0) then
+ k = 1
+ do m = 1, 5
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp *
+ > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) +
+ > u(i,j,k+2,m,c))
+ end do
+ end do
+ end do
+
+ k = 2
+ do m = 1, 5
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > (-4.0d0*u(i,j,k-1,m,c) + 6.0d0*u(i,j,k,m,c) -
+ > 4.0d0*u(i,j,k+1,m,c) + u(i,j,k+2,m,c))
+ end do
+ end do
+ end do
+ endif
+
+ do m = 1, 5
+ do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c),cell_size(1,c)-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > ( u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) +
+ > 6.0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) +
+ > u(i,j,k+2,m,c) )
+ end do
+ end do
+ end do
+ end do
+
+ if (end(3,c) .gt. 0) then
+ k = cell_size(3,c)-3
+ do m = 1, 5
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > ( u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) +
+ > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) )
+ end do
+ end do
+ end do
+
+ k = cell_size(3,c)-2
+ do m = 1, 5
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
+ > ( u(i,j,k-2,m,c) - 4.d0*u(i,j,k-1,m,c) +
+ > 5.d0*u(i,j,k,m,c) )
+ end do
+ end do
+ end do
+ endif
+
+ do m = 1, 5
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) * dt
+ end do
+ end do
+ end do
+ end do
+
+ end do
+
+ return
+ end
+
+
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine set_constants
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ ce(1,1) = 2.0d0
+ ce(1,2) = 0.0d0
+ ce(1,3) = 0.0d0
+ ce(1,4) = 4.0d0
+ ce(1,5) = 5.0d0
+ ce(1,6) = 3.0d0
+ ce(1,7) = 0.5d0
+ ce(1,8) = 0.02d0
+ ce(1,9) = 0.01d0
+ ce(1,10) = 0.03d0
+ ce(1,11) = 0.5d0
+ ce(1,12) = 0.4d0
+ ce(1,13) = 0.3d0
+
+ ce(2,1) = 1.0d0
+ ce(2,2) = 0.0d0
+ ce(2,3) = 0.0d0
+ ce(2,4) = 0.0d0
+ ce(2,5) = 1.0d0
+ ce(2,6) = 2.0d0
+ ce(2,7) = 3.0d0
+ ce(2,8) = 0.01d0
+ ce(2,9) = 0.03d0
+ ce(2,10) = 0.02d0
+ ce(2,11) = 0.4d0
+ ce(2,12) = 0.3d0
+ ce(2,13) = 0.5d0
+
+ ce(3,1) = 2.0d0
+ ce(3,2) = 2.0d0
+ ce(3,3) = 0.0d0
+ ce(3,4) = 0.0d0
+ ce(3,5) = 0.0d0
+ ce(3,6) = 2.0d0
+ ce(3,7) = 3.0d0
+ ce(3,8) = 0.04d0
+ ce(3,9) = 0.03d0
+ ce(3,10) = 0.05d0
+ ce(3,11) = 0.3d0
+ ce(3,12) = 0.5d0
+ ce(3,13) = 0.4d0
+
+ ce(4,1) = 2.0d0
+ ce(4,2) = 2.0d0
+ ce(4,3) = 0.0d0
+ ce(4,4) = 0.0d0
+ ce(4,5) = 0.0d0
+ ce(4,6) = 2.0d0
+ ce(4,7) = 3.0d0
+ ce(4,8) = 0.03d0
+ ce(4,9) = 0.05d0
+ ce(4,10) = 0.04d0
+ ce(4,11) = 0.2d0
+ ce(4,12) = 0.1d0
+ ce(4,13) = 0.3d0
+
+ ce(5,1) = 5.0d0
+ ce(5,2) = 4.0d0
+ ce(5,3) = 3.0d0
+ ce(5,4) = 2.0d0
+ ce(5,5) = 0.1d0
+ ce(5,6) = 0.4d0
+ ce(5,7) = 0.3d0
+ ce(5,8) = 0.05d0
+ ce(5,9) = 0.04d0
+ ce(5,10) = 0.03d0
+ ce(5,11) = 0.1d0
+ ce(5,12) = 0.3d0
+ ce(5,13) = 0.2d0
+
+ c1 = 1.4d0
+ c2 = 0.4d0
+ c3 = 0.1d0
+ c4 = 1.0d0
+ c5 = 1.4d0
+
+ bt = dsqrt(0.5d0)
+
+ dnxm1 = 1.0d0 / dble(grid_points(1)-1)
+ dnym1 = 1.0d0 / dble(grid_points(2)-1)
+ dnzm1 = 1.0d0 / dble(grid_points(3)-1)
+
+ c1c2 = c1 * c2
+ c1c5 = c1 * c5
+ c3c4 = c3 * c4
+ c1345 = c1c5 * c3c4
+
+ conz1 = (1.0d0-c1c5)
+
+ tx1 = 1.0d0 / (dnxm1 * dnxm1)
+ tx2 = 1.0d0 / (2.0d0 * dnxm1)
+ tx3 = 1.0d0 / dnxm1
+
+ ty1 = 1.0d0 / (dnym1 * dnym1)
+ ty2 = 1.0d0 / (2.0d0 * dnym1)
+ ty3 = 1.0d0 / dnym1
+
+ tz1 = 1.0d0 / (dnzm1 * dnzm1)
+ tz2 = 1.0d0 / (2.0d0 * dnzm1)
+ tz3 = 1.0d0 / dnzm1
+
+ dx1 = 0.75d0
+ dx2 = 0.75d0
+ dx3 = 0.75d0
+ dx4 = 0.75d0
+ dx5 = 0.75d0
+
+ dy1 = 0.75d0
+ dy2 = 0.75d0
+ dy3 = 0.75d0
+ dy4 = 0.75d0
+ dy5 = 0.75d0
+
+ dz1 = 1.0d0
+ dz2 = 1.0d0
+ dz3 = 1.0d0
+ dz4 = 1.0d0
+ dz5 = 1.0d0
+
+ dxmax = dmax1(dx3, dx4)
+ dymax = dmax1(dy2, dy4)
+ dzmax = dmax1(dz2, dz3)
+
+ dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) )
+
+ c4dssp = 4.0d0 * dssp
+ c5dssp = 5.0d0 * dssp
+
+ dttx1 = dt*tx1
+ dttx2 = dt*tx2
+ dtty1 = dt*ty1
+ dtty2 = dt*ty2
+ dttz1 = dt*tz1
+ dttz2 = dt*tz2
+
+ c2dttx1 = 2.0d0*dttx1
+ c2dtty1 = 2.0d0*dtty1
+ c2dttz1 = 2.0d0*dttz1
+
+ dtdssp = dt*dssp
+
+ comz1 = dtdssp
+ comz4 = 4.0d0*dtdssp
+ comz5 = 5.0d0*dtdssp
+ comz6 = 6.0d0*dtdssp
+
+ c3c4tx3 = c3c4*tx3
+ c3c4ty3 = c3c4*ty3
+ c3c4tz3 = c3c4*tz3
+
+ dx1tx1 = dx1*tx1
+ dx2tx1 = dx2*tx1
+ dx3tx1 = dx3*tx1
+ dx4tx1 = dx4*tx1
+ dx5tx1 = dx5*tx1
+
+ dy1ty1 = dy1*ty1
+ dy2ty1 = dy2*ty1
+ dy3ty1 = dy3*ty1
+ dy4ty1 = dy4*ty1
+ dy5ty1 = dy5*ty1
+
+ dz1tz1 = dz1*tz1
+ dz2tz1 = dz2*tz1
+ dz3tz1 = dz3*tz1
+ dz4tz1 = dz4*tz1
+ dz5tz1 = dz5*tz1
+
+ c2iv = 2.5d0
+ con43 = 4.0d0/3.0d0
+ con16 = 1.0d0/6.0d0
+
+ xxcon1 = c3c4tx3*con43*tx3
+ xxcon2 = c3c4tx3*tx3
+ xxcon3 = c3c4tx3*conz1*tx3
+ xxcon4 = c3c4tx3*con16*tx3
+ xxcon5 = c3c4tx3*c1c5*tx3
+
+ yycon1 = c3c4ty3*con43*ty3
+ yycon2 = c3c4ty3*ty3
+ yycon3 = c3c4ty3*conz1*ty3
+ yycon4 = c3c4ty3*con16*ty3
+ yycon5 = c3c4ty3*c1c5*ty3
+
+ zzcon1 = c3c4tz3*con43*tz3
+ zzcon2 = c3c4tz3*tz3
+ zzcon3 = c3c4tz3*conz1*tz3
+ zzcon4 = c3c4tz3*con16*tz3
+ zzcon5 = c3c4tz3*c1c5*tz3
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine setup_mpi
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c set up MPI stuff
+c---------------------------------------------------------------------
+
+ implicit none
+ include 'mpinpb.h'
+ include 'npbparams.h'
+ integer error, nc, color
+
+ call mpi_init(error)
+
+ call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error)
+ call mpi_comm_rank(MPI_COMM_WORLD, node, error)
+
+ if (.not. convertdouble) then
+ dp_type = MPI_DOUBLE_PRECISION
+ else
+ dp_type = MPI_REAL
+ endif
+
+c---------------------------------------------------------------------
+c compute square root; add small number to allow for roundoff
+c---------------------------------------------------------------------
+ nc = dint(dsqrt(dble(total_nodes) + 0.00001d0))
+
+c---------------------------------------------------------------------
+c We handle a non-square number of nodes by making the excess nodes
+c inactive. However, we can never handle more cells than were compiled
+c in.
+c---------------------------------------------------------------------
+
+ if (nc .gt. maxcells) nc = maxcells
+
+ if (node .ge. nc*nc) then
+ active = .false.
+ color = 1
+ else
+ active = .true.
+ color = 0
+ end if
+
+ call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error)
+ if (.not. active) return
+
+ call mpi_comm_size(comm_setup, no_nodes, error)
+ call mpi_comm_dup(comm_setup, comm_solve, error)
+ call mpi_comm_dup(comm_setup, comm_rhs, error)
+
+c---------------------------------------------------------------------
+c let node 0 be the root for the group (there is only one)
+c---------------------------------------------------------------------
+ root = 0
+
+ return
+ end
+
--- /dev/null
+!-------------------------------------------------------------------------!
+! !
+! N A S P A R A L L E L B E N C H M A R K S 3.3 !
+! !
+! S P !
+! !
+!-------------------------------------------------------------------------!
+! !
+! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
+! It is described in NAS Technical Reports 95-020 and 02-007 !
+! !
+! Permission to use, copy, distribute and modify this software !
+! for any purpose with or without fee is hereby granted. We !
+! request, however, that all derived work reference the NAS !
+! Parallel Benchmarks 3.3. This software is provided "as is" !
+! without express or implied warranty. !
+! !
+! Information on NPB 3.3, including the technical report, the !
+! original specifications, source code, results and information !
+! on how to submit new results, is available at: !
+! !
+! http://www.nas.nasa.gov/Software/NPB/ !
+! !
+! Send comments or suggestions to npb@nas.nasa.gov !
+! !
+! NAS Parallel Benchmarks Group !
+! NASA Ames Research Center !
+! Mail Stop: T27A-1 !
+! Moffett Field, CA 94035-1000 !
+! !
+! E-mail: npb@nas.nasa.gov !
+! Fax: (650) 604-3957 !
+! !
+!-------------------------------------------------------------------------!
+
+
+c---------------------------------------------------------------------
+c
+c Authors: R. F. Van der Wijngaart
+c W. Saphir
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+ program MPSP
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i, niter, step, c, error, fstatus
+ external timer_read
+ double precision mflops, t, tmax, timer_read
+ logical verified
+ character class
+
+ call setup_mpi
+ if (.not. active) goto 999
+
+c---------------------------------------------------------------------
+c Root node reads input file (if it exists) else takes
+c defaults from parameters
+c---------------------------------------------------------------------
+ if (node .eq. root) then
+
+ write(*, 1000)
+ open (unit=2,file='inputsp.data',status='old', iostat=fstatus)
+c
+ if (fstatus .eq. 0) then
+ write(*,233)
+ 233 format(' Reading from input file inputsp.data')
+ read (2,*) niter
+ read (2,*) dt
+ read (2,*) grid_points(1), grid_points(2), grid_points(3)
+ close(2)
+ else
+ write(*,234)
+ niter = niter_default
+ dt = dt_default
+ grid_points(1) = problem_size
+ grid_points(2) = problem_size
+ grid_points(3) = problem_size
+ endif
+ 234 format(' No input file inputsp.data. Using compiled defaults')
+
+ write(*, 1001) grid_points(1), grid_points(2), grid_points(3)
+ write(*, 1002) niter, dt
+ if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes
+ if (no_nodes .ne. maxcells*maxcells)
+ > write(*, 1005) maxcells*maxcells
+ write(*, 1003) no_nodes
+
+ 1000 format(//,' NAS Parallel Benchmarks 3.3 -- SP Benchmark',/)
+ 1001 format(' Size: ', i4, 'x', i4, 'x', i4)
+ 1002 format(' Iterations: ', i4, ' dt: ', F11.7)
+ 1004 format(' Total number of processes: ', i5)
+ 1005 format(' WARNING: compiled for ', i5, ' processes ')
+ 1003 format(' Number of active processes: ', i5, /)
+
+ endif
+
+ call mpi_bcast(niter, 1, MPI_INTEGER,
+ > root, comm_setup, error)
+
+ call mpi_bcast(dt, 1, dp_type,
+ > root, comm_setup, error)
+
+ call mpi_bcast(grid_points(1), 3, MPI_INTEGER,
+ > root, comm_setup, error)
+
+
+ call make_set
+
+ do c = 1, ncells
+ if ( (cell_size(1,c) .gt. IMAX) .or.
+ > (cell_size(2,c) .gt. JMAX) .or.
+ > (cell_size(3,c) .gt. KMAX) ) then
+ print *,node, c, (cell_size(i,c),i=1,3)
+ print *,' Problem size too big for compiled array sizes'
+ goto 999
+ endif
+ end do
+
+ call set_constants
+
+ call initialize
+
+c call mpi_finalize(error)
+c stop
+
+ call lhsinit
+
+ call exact_rhs
+
+ call compute_buffer_size(5)
+
+c---------------------------------------------------------------------
+c do one time step to touch all code, and reinitialize
+c---------------------------------------------------------------------
+ call adi
+ call initialize
+
+c---------------------------------------------------------------------
+c Synchronize before placing time stamp
+c---------------------------------------------------------------------
+ call mpi_barrier(comm_setup, error)
+
+ call timer_clear(1)
+ call timer_start(1)
+
+ do step = 1, niter
+
+ if (node .eq. root) then
+ if (mod(step, 20) .eq. 0 .or.
+ > step .eq. 1) then
+ write(*, 200) step
+ 200 format(' Time step ', i4)
+ endif
+ endif
+
+ call adi
+
+ end do
+
+ call timer_stop(1)
+ t = timer_read(1)
+
+ call verify(niter, class, verified)
+
+ call mpi_reduce(t, tmax, 1,
+ > dp_type, MPI_MAX,
+ > root, comm_setup, error)
+
+ if( node .eq. root ) then
+ if( tmax .ne. 0. ) then
+ mflops = (881.174*float( problem_size )**3
+ > -4683.91*float( problem_size )**2
+ > +11484.5*float( problem_size )
+ > -19272.4) * float( niter ) / (tmax*1000000.0d0)
+ else
+ mflops = 0.0
+ endif
+
+ call print_results('SP', class, grid_points(1),
+ > grid_points(2), grid_points(3), niter, maxcells*maxcells,
+ > total_nodes, tmax, mflops, ' floating point',
+ > verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5,
+ > cs6, '(none)')
+ endif
+
+ 999 continue
+ call mpi_barrier(MPI_COMM_WORLD, error)
+ call mpi_finalize(error)
+
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine txinvr
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c block-diagonal matrix-vector multiplication
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer c, i, j, k
+ double precision t1, t2, t3, ac, ru1, uu, vv, ww, r1, r2, r3,
+ > r4, r5, ac2inv
+
+c---------------------------------------------------------------------
+c loop over all cells owned by this node
+c---------------------------------------------------------------------
+ do c = 1, ncells
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+ ru1 = rho_i(i,j,k,c)
+ uu = us(i,j,k,c)
+ vv = vs(i,j,k,c)
+ ww = ws(i,j,k,c)
+ ac = speed(i,j,k,c)
+ ac2inv = ainv(i,j,k,c)*ainv(i,j,k,c)
+
+ r1 = rhs(i,j,k,1,c)
+ r2 = rhs(i,j,k,2,c)
+ r3 = rhs(i,j,k,3,c)
+ r4 = rhs(i,j,k,4,c)
+ r5 = rhs(i,j,k,5,c)
+
+ t1 = c2 * ac2inv * ( qs(i,j,k,c)*r1 - uu*r2 -
+ > vv*r3 - ww*r4 + r5 )
+ t2 = bt * ru1 * ( uu * r1 - r2 )
+ t3 = ( bt * ru1 * ac ) * t1
+
+ rhs(i,j,k,1,c) = r1 - t1
+ rhs(i,j,k,2,c) = - ru1 * ( ww*r1 - r4 )
+ rhs(i,j,k,3,c) = ru1 * ( vv*r1 - r3 )
+ rhs(i,j,k,4,c) = - t2 + t3
+ rhs(i,j,k,5,c) = t2 + t3
+ end do
+ end do
+ end do
+ end do
+
+ return
+ end
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine tzetar(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c block-diagonal matrix-vector multiplication
+c---------------------------------------------------------------------
+
+ include 'header.h'
+
+ integer i, j, k, c
+ double precision t1, t2, t3, ac, xvel, yvel, zvel, r1, r2, r3,
+ > r4, r5, btuz, acinv, ac2u, uzik1
+
+c---------------------------------------------------------------------
+c treat only one cell
+c---------------------------------------------------------------------
+ do k = start(3,c), cell_size(3,c)-end(3,c)-1
+ do j = start(2,c), cell_size(2,c)-end(2,c)-1
+ do i = start(1,c), cell_size(1,c)-end(1,c)-1
+
+ xvel = us(i,j,k,c)
+ yvel = vs(i,j,k,c)
+ zvel = ws(i,j,k,c)
+ ac = speed(i,j,k,c)
+ acinv = ainv(i,j,k,c)
+
+ ac2u = ac*ac
+
+ r1 = rhs(i,j,k,1,c)
+ r2 = rhs(i,j,k,2,c)
+ r3 = rhs(i,j,k,3,c)
+ r4 = rhs(i,j,k,4,c)
+ r5 = rhs(i,j,k,5,c)
+
+ uzik1 = u(i,j,k,1,c)
+ btuz = bt * uzik1
+
+ t1 = btuz*acinv * (r4 + r5)
+ t2 = r3 + t1
+ t3 = btuz * (r4 - r5)
+
+ rhs(i,j,k,1,c) = t2
+ rhs(i,j,k,2,c) = -uzik1*r2 + xvel*t2
+ rhs(i,j,k,3,c) = uzik1*r1 + yvel*t2
+ rhs(i,j,k,4,c) = zvel*t2 + t3
+ rhs(i,j,k,5,c) = uzik1*(-xvel*r2 + yvel*r1) +
+ > qs(i,j,k,c)*t2 + c2iv*ac2u*t1 + zvel*t3
+
+ end do
+ end do
+ end do
+
+ return
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine verify(no_time_steps, class, verified)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c verification routine
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5),
+ > epsilon, xce(5), xcr(5), dtref
+ integer m, no_time_steps
+ character class
+ logical verified
+
+c---------------------------------------------------------------------
+c tolerance level
+c---------------------------------------------------------------------
+ epsilon = 1.0d-08
+
+
+c---------------------------------------------------------------------
+c compute the error norm and the residual norm, and exit if not printing
+c---------------------------------------------------------------------
+ call error_norm(xce)
+ call copy_faces
+
+ call rhs_norm(xcr)
+
+ do m = 1, 5
+ xcr(m) = xcr(m) / dt
+ enddo
+
+ if (node .ne. 0) return
+
+ class = 'U'
+ verified = .true.
+
+ do m = 1,5
+ xcrref(m) = 1.0
+ xceref(m) = 1.0
+ end do
+
+c---------------------------------------------------------------------
+c reference data for 12X12X12 grids after 100 time steps, with DT = 1.50d-02
+c---------------------------------------------------------------------
+ if ( (grid_points(1) .eq. 12 ) .and.
+ > (grid_points(2) .eq. 12 ) .and.
+ > (grid_points(3) .eq. 12 ) .and.
+ > (no_time_steps .eq. 100 )) then
+
+ class = 'S'
+ dtref = 1.5d-2
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 2.7470315451339479d-02
+ xcrref(2) = 1.0360746705285417d-02
+ xcrref(3) = 1.6235745065095532d-02
+ xcrref(4) = 1.5840557224455615d-02
+ xcrref(5) = 3.4849040609362460d-02
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+ xceref(1) = 2.7289258557377227d-05
+ xceref(2) = 1.0364446640837285d-05
+ xceref(3) = 1.6154798287166471d-05
+ xceref(4) = 1.5750704994480102d-05
+ xceref(5) = 3.4177666183390531d-05
+
+
+c---------------------------------------------------------------------
+c reference data for 36X36X36 grids after 400 time steps, with DT = 1.5d-03
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 36) .and.
+ > (grid_points(2) .eq. 36) .and.
+ > (grid_points(3) .eq. 36) .and.
+ > (no_time_steps . eq. 400) ) then
+
+ class = 'W'
+ dtref = 1.5d-3
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 0.1893253733584d-02
+ xcrref(2) = 0.1717075447775d-03
+ xcrref(3) = 0.2778153350936d-03
+ xcrref(4) = 0.2887475409984d-03
+ xcrref(5) = 0.3143611161242d-02
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+ xceref(1) = 0.7542088599534d-04
+ xceref(2) = 0.6512852253086d-05
+ xceref(3) = 0.1049092285688d-04
+ xceref(4) = 0.1128838671535d-04
+ xceref(5) = 0.1212845639773d-03
+
+c---------------------------------------------------------------------
+c reference data for 64X64X64 grids after 400 time steps, with DT = 1.5d-03
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 64) .and.
+ > (grid_points(2) .eq. 64) .and.
+ > (grid_points(3) .eq. 64) .and.
+ > (no_time_steps . eq. 400) ) then
+
+ class = 'A'
+ dtref = 1.5d-3
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 2.4799822399300195d0
+ xcrref(2) = 1.1276337964368832d0
+ xcrref(3) = 1.5028977888770491d0
+ xcrref(4) = 1.4217816211695179d0
+ xcrref(5) = 2.1292113035138280d0
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+ xceref(1) = 1.0900140297820550d-04
+ xceref(2) = 3.7343951769282091d-05
+ xceref(3) = 5.0092785406541633d-05
+ xceref(4) = 4.7671093939528255d-05
+ xceref(5) = 1.3621613399213001d-04
+
+c---------------------------------------------------------------------
+c reference data for 102X102X102 grids after 400 time steps,
+c with DT = 1.0d-03
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 102) .and.
+ > (grid_points(2) .eq. 102) .and.
+ > (grid_points(3) .eq. 102) .and.
+ > (no_time_steps . eq. 400) ) then
+
+ class = 'B'
+ dtref = 1.0d-3
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 0.6903293579998d+02
+ xcrref(2) = 0.3095134488084d+02
+ xcrref(3) = 0.4103336647017d+02
+ xcrref(4) = 0.3864769009604d+02
+ xcrref(5) = 0.5643482272596d+02
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+ xceref(1) = 0.9810006190188d-02
+ xceref(2) = 0.1022827905670d-02
+ xceref(3) = 0.1720597911692d-02
+ xceref(4) = 0.1694479428231d-02
+ xceref(5) = 0.1847456263981d-01
+
+c---------------------------------------------------------------------
+c reference data for 162X162X162 grids after 400 time steps,
+c with DT = 0.67d-03
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 162) .and.
+ > (grid_points(2) .eq. 162) .and.
+ > (grid_points(3) .eq. 162) .and.
+ > (no_time_steps . eq. 400) ) then
+
+ class = 'C'
+ dtref = 0.67d-3
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 0.5881691581829d+03
+ xcrref(2) = 0.2454417603569d+03
+ xcrref(3) = 0.3293829191851d+03
+ xcrref(4) = 0.3081924971891d+03
+ xcrref(5) = 0.4597223799176d+03
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+ xceref(1) = 0.2598120500183d+00
+ xceref(2) = 0.2590888922315d-01
+ xceref(3) = 0.5132886416320d-01
+ xceref(4) = 0.4806073419454d-01
+ xceref(5) = 0.5483377491301d+00
+
+c---------------------------------------------------------------------
+c reference data for 408X408X408 grids after 500 time steps,
+c with DT = 0.3d-03
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 408) .and.
+ > (grid_points(2) .eq. 408) .and.
+ > (grid_points(3) .eq. 408) .and.
+ > (no_time_steps . eq. 500) ) then
+
+ class = 'D'
+ dtref = 0.30d-3
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 0.1044696216887d+05
+ xcrref(2) = 0.3204427762578d+04
+ xcrref(3) = 0.4648680733032d+04
+ xcrref(4) = 0.4238923283697d+04
+ xcrref(5) = 0.7588412036136d+04
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+ xceref(1) = 0.5089471423669d+01
+ xceref(2) = 0.5323514855894d+00
+ xceref(3) = 0.1187051008971d+01
+ xceref(4) = 0.1083734951938d+01
+ xceref(5) = 0.1164108338568d+02
+
+c---------------------------------------------------------------------
+c reference data for 1020X1020X1020 grids after 500 time steps,
+c with DT = 0.1d-03
+c---------------------------------------------------------------------
+ elseif ( (grid_points(1) .eq. 1020) .and.
+ > (grid_points(2) .eq. 1020) .and.
+ > (grid_points(3) .eq. 1020) .and.
+ > (no_time_steps . eq. 500) ) then
+
+ class = 'E'
+ dtref = 0.10d-3
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+ xcrref(1) = 0.6255387422609d+05
+ xcrref(2) = 0.1495317020012d+05
+ xcrref(3) = 0.2347595750586d+05
+ xcrref(4) = 0.2091099783534d+05
+ xcrref(5) = 0.4770412841218d+05
+
+c---------------------------------------------------------------------
+c Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+ xceref(1) = 0.6742735164909d+02
+ xceref(2) = 0.5390656036938d+01
+ xceref(3) = 0.1680647196477d+02
+ xceref(4) = 0.1536963126457d+02
+ xceref(5) = 0.1575330146156d+03
+
+ else
+ verified = .false.
+ endif
+
+c---------------------------------------------------------------------
+c verification test for residuals if gridsize is one of
+c the defined grid sizes above (class .ne. 'U')
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c Compute the difference of solution values and the known reference values.
+c---------------------------------------------------------------------
+ do m = 1, 5
+
+ xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m))
+ xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
+
+ enddo
+
+c---------------------------------------------------------------------
+c Output the comparison of computed results to known cases.
+c---------------------------------------------------------------------
+
+ if (class .ne. 'U') then
+ write(*, 1990) class
+ 1990 format(' Verification being performed for class ', a)
+ write (*,2000) epsilon
+ 2000 format(' accuracy setting for epsilon = ', E20.13)
+ verified = (dabs(dt-dtref) .le. epsilon)
+ if (.not.verified) then
+ class = 'U'
+ write (*,1000) dtref
+ 1000 format(' DT does not match the reference value of ',
+ > E15.8)
+ endif
+ else
+ write(*, 1995)
+ 1995 format(' Unknown class')
+ endif
+
+
+ if (class .ne. 'U') then
+ write (*,2001)
+ else
+ write (*, 2005)
+ endif
+
+ 2001 format(' Comparison of RMS-norms of residual')
+ 2005 format(' RMS-norms of residual')
+ do m = 1, 5
+ if (class .eq. 'U') then
+ write(*, 2015) m, xcr(m)
+ else if (xcrdif(m) .le. epsilon) then
+ write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
+ else
+ verified = .false.
+ write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
+ endif
+ enddo
+
+ if (class .ne. 'U') then
+ write (*,2002)
+ else
+ write (*,2006)
+ endif
+ 2002 format(' Comparison of RMS-norms of solution error')
+ 2006 format(' RMS-norms of solution error')
+
+ do m = 1, 5
+ if (class .eq. 'U') then
+ write(*, 2015) m, xce(m)
+ else if (xcedif(m) .le. epsilon) then
+ write (*,2011) m,xce(m),xceref(m),xcedif(m)
+ else
+ verified = .false.
+ write (*,2010) m,xce(m),xceref(m),xcedif(m)
+ endif
+ enddo
+
+ 2010 format(' FAILURE: ', i2, E20.13, E20.13, E20.13)
+ 2011 format(' ', i2, E20.13, E20.13, E20.13)
+ 2015 format(' ', i2, E20.13)
+
+ if (class .eq. 'U') then
+ write(*, 2022)
+ write(*, 2023)
+ 2022 format(' No reference values provided')
+ 2023 format(' No verification performed')
+ else if (verified) then
+ write(*, 2020)
+ 2020 format(' Verification Successful')
+ else
+ write(*, 2021)
+ 2021 format(' Verification failed')
+ endif
+
+ return
+
+
+ end
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine x_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function performs the solution of the approximate factorization
+c step in the x-direction for all five matrix components
+c simultaneously. The Thomas algorithm is employed to solve the
+c systems for the x-lines. Boundary conditions are non-periodic
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+
+ integer i, j, k, jp, kp, n, iend, jsize, ksize, i1, i2,
+ > buffer_size, c, m, p, istart, stage, error,
+ > requests(2), statuses(MPI_STATUS_SIZE, 2)
+ double precision r1, r2, d, e, s(5), sm1, sm2,
+ > fac1, fac2
+
+
+
+c---------------------------------------------------------------------
+c OK, now we know that there are multiple processors
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
+c on this node in the direction of increasing i for the forward sweep,
+c and after that reversing the direction for the backsubstitution.
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c FORWARD ELIMINATION
+c---------------------------------------------------------------------
+ do stage = 1, ncells
+ c = slice(1,stage)
+
+ istart = 0
+ iend = cell_size(1,c)-1
+
+ jsize = cell_size(2,c)
+ ksize = cell_size(3,c)
+ jp = cell_coord(2,c)-1
+ kp = cell_coord(3,c)-1
+
+ buffer_size = (jsize-start(2,c)-end(2,c)) *
+ > (ksize-start(3,c)-end(3,c))
+
+ if ( stage .ne. 1) then
+
+c---------------------------------------------------------------------
+c if this is not the first processor in this row of cells,
+c receive data from predecessor containing the right hand
+c sides and the upper diagonal elements of the previous two rows
+c---------------------------------------------------------------------
+ call mpi_irecv(in_buffer, 22*buffer_size,
+ > dp_type, predecessor(1),
+ > DEFAULT_TAG, comm_solve,
+ > requests(1), error)
+
+
+c---------------------------------------------------------------------
+c communication has already been started.
+c compute the left hand side while waiting for the msg
+c---------------------------------------------------------------------
+ call lhsx(c)
+
+c---------------------------------------------------------------------
+c wait for pending communication to complete
+c This waits on the current receive and on the send
+c from the previous stage. They always come in pairs.
+c---------------------------------------------------------------------
+
+ call mpi_waitall(2, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c unpack the buffer
+c---------------------------------------------------------------------
+ i = istart
+ i1 = istart + 1
+ n = 0
+
+c---------------------------------------------------------------------
+c create a running pointer
+c---------------------------------------------------------------------
+ p = 0
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+ > in_buffer(p+1) * lhs(i,j,k,n+1,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+ > in_buffer(p+2) * lhs(i,j,k,n+1,c)
+ do m = 1, 3
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
+ end do
+ d = in_buffer(p+6)
+ e = in_buffer(p+7)
+ do m = 1, 3
+ s(m) = in_buffer(p+7+m)
+ end do
+ r1 = lhs(i,j,k,n+2,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+ lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+ do m = 1, 3
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
+ end do
+ r2 = lhs(i1,j,k,n+1,c)
+ lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
+ lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
+ do m = 1, 3
+ rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2
+ end do
+ p = p + 10
+ end do
+ end do
+
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+ > in_buffer(p+1) * lhs(i,j,k,n+1,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+ > in_buffer(p+2) * lhs(i,j,k,n+1,c)
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > in_buffer(p+3) * lhs(i,j,k,n+1,c)
+ d = in_buffer(p+4)
+ e = in_buffer(p+5)
+ s(m) = in_buffer(p+6)
+ r1 = lhs(i,j,k,n+2,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+ lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
+ r2 = lhs(i1,j,k,n+1,c)
+ lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
+ lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
+ rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2
+ p = p + 6
+ end do
+ end do
+ end do
+
+ else
+
+c---------------------------------------------------------------------
+c if this IS the first cell, we still compute the lhs
+c---------------------------------------------------------------------
+ call lhsx(c)
+ endif
+
+c---------------------------------------------------------------------
+c perform the Thomas algorithm; first, FORWARD ELIMINATION
+c---------------------------------------------------------------------
+ n = 0
+
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = istart, iend-2
+ i1 = i + 1
+ i2 = i + 2
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ end do
+ lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
+ > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
+ > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
+ > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
+ end do
+ lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
+ > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
+ lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
+ > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
+ > lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c The last two rows in this grid block are a bit different,
+c since they do not have two more rows available for the
+c elimination of off-diagonal entries
+c---------------------------------------------------------------------
+
+ i = iend - 1
+ i1 = iend
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ end do
+ lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
+ > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
+ > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
+ > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
+ end do
+c---------------------------------------------------------------------
+c scale the last row immediately (some of this is
+c overkill in case this is the last cell)
+c---------------------------------------------------------------------
+ fac2 = 1.d0/lhs(i1,j,k,n+3,c)
+ lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
+ lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c)
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c do the u+c and the u-c factors
+c---------------------------------------------------------------------
+
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = istart, iend-2
+ i1 = i + 1
+ i2 = i + 2
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
+ > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
+ > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
+ rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
+ > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
+ lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
+ > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
+ lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
+ > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
+ rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
+ > lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c And again the last two rows separately
+c---------------------------------------------------------------------
+ i = iend - 1
+ i1 = iend
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
+ > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
+ > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
+ rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
+ > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
+c---------------------------------------------------------------------
+c Scale the last row immediately (some of this is overkill
+c if this is the last cell)
+c---------------------------------------------------------------------
+ fac2 = 1.d0/lhs(i1,j,k,n+3,c)
+ lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
+ lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)
+ rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c)
+
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c send information to the next processor, except when this
+c is the last grid block
+c---------------------------------------------------------------------
+ if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c create a running pointer for the send buffer
+c---------------------------------------------------------------------
+ p = 0
+ n = 0
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = iend-1, iend
+ out_buffer(p+1) = lhs(i,j,k,n+4,c)
+ out_buffer(p+2) = lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ out_buffer(p+2+m) = rhs(i,j,k,m,c)
+ end do
+ p = p+5
+ end do
+ end do
+ end do
+
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = iend-1, iend
+ out_buffer(p+1) = lhs(i,j,k,n+4,c)
+ out_buffer(p+2) = lhs(i,j,k,n+5,c)
+ out_buffer(p+3) = rhs(i,j,k,m,c)
+ p = p + 3
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c send data to next phase
+c can't receive data yet because buffer size will be wrong
+c---------------------------------------------------------------------
+ call mpi_isend(out_buffer, 22*buffer_size,
+ > dp_type, successor(1),
+ > DEFAULT_TAG, comm_solve,
+ > requests(2), error)
+
+ endif
+ end do
+
+c---------------------------------------------------------------------
+c now go in the reverse direction
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c BACKSUBSTITUTION
+c---------------------------------------------------------------------
+ do stage = ncells, 1, -1
+ c = slice(1,stage)
+
+ istart = 0
+ iend = cell_size(1,c)-1
+
+ jsize = cell_size(2,c)
+ ksize = cell_size(3,c)
+ jp = cell_coord(2,c)-1
+ kp = cell_coord(3,c)-1
+
+ buffer_size = (jsize-start(2,c)-end(2,c)) *
+ > (ksize-start(3,c)-end(3,c))
+
+ if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c if this is not the starting cell in this row of cells,
+c wait for a message to be received, containing the
+c solution of the previous two stations
+c---------------------------------------------------------------------
+ call mpi_irecv(in_buffer, 10*buffer_size,
+ > dp_type, successor(1),
+ > DEFAULT_TAG, comm_solve,
+ > requests(1), error)
+
+
+c---------------------------------------------------------------------
+c communication has already been started
+c while waiting, do the block-diagonal inversion for the
+c cell that was just finished
+c---------------------------------------------------------------------
+
+ call ninvr(slice(1,stage+1))
+
+c---------------------------------------------------------------------
+c wait for pending communication to complete
+c---------------------------------------------------------------------
+ call mpi_waitall(2, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c unpack the buffer for the first three factors
+c---------------------------------------------------------------------
+ n = 0
+ p = 0
+ i = iend
+ i1 = i - 1
+ do m = 1, 3
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ sm1 = in_buffer(p+1)
+ sm2 = in_buffer(p+2)
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*sm1 -
+ > lhs(i,j,k,n+5,c)*sm2
+ rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
+ > lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) -
+ > lhs(i1,j,k,n+5,c) * sm1
+ p = p + 2
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c now unpack the buffer for the remaining two factors
+c---------------------------------------------------------------------
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ sm1 = in_buffer(p+1)
+ sm2 = in_buffer(p+2)
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*sm1 -
+ > lhs(i,j,k,n+5,c)*sm2
+ rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
+ > lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) -
+ > lhs(i1,j,k,n+5,c) * sm1
+ p = p + 2
+ end do
+ end do
+ end do
+
+ else
+
+c---------------------------------------------------------------------
+c now we know this is the first grid block on the back sweep,
+c so we don't need a message to start the substitution.
+c---------------------------------------------------------------------
+ i = iend-1
+ i1 = iend
+ n = 0
+ do m = 1, 3
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
+ end do
+ end do
+ end do
+
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c Whether or not this is the last processor, we always have
+c to complete the back-substitution
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c The first three factors
+c---------------------------------------------------------------------
+ n = 0
+ do m = 1, 3
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = iend-2, istart, -1
+ i1 = i + 1
+ i2 = i + 2
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
+ > lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c And the remaining two
+c---------------------------------------------------------------------
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = iend-2, istart, -1
+ i1 = i + 1
+ i2 = i + 2
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
+ > lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c send on information to the previous processor, if needed
+c---------------------------------------------------------------------
+ if (stage .ne. 1) then
+ i = istart
+ i1 = istart+1
+ p = 0
+ do m = 1, 5
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = start(2,c), jsize-end(2,c)-1
+ out_buffer(p+1) = rhs(i,j,k,m,c)
+ out_buffer(p+2) = rhs(i1,j,k,m,c)
+ p = p + 2
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c pack and send the buffer
+c---------------------------------------------------------------------
+ call mpi_isend(out_buffer, 10*buffer_size,
+ > dp_type, predecessor(1),
+ > DEFAULT_TAG, comm_solve,
+ > requests(2), error)
+
+ endif
+
+c---------------------------------------------------------------------
+c If this was the last stage, do the block-diagonal inversion
+c---------------------------------------------------------------------
+ if (stage .eq. 1) call ninvr(c)
+
+ end do
+
+ return
+ end
+
+
+
+
+
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine y_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function performs the solution of the approximate factorization
+c step in the y-direction for all five matrix components
+c simultaneously. The Thomas algorithm is employed to solve the
+c systems for the y-lines. Boundary conditions are non-periodic
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i, j, k, stage, ip, kp, n, isize, jend, ksize, j1, j2,
+ > buffer_size, c, m, p, jstart, error,
+ > requests(2), statuses(MPI_STATUS_SIZE, 2)
+ double precision r1, r2, d, e, s(5), sm1, sm2,
+ > fac1, fac2
+
+
+c---------------------------------------------------------------------
+c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
+c on this node in the direction of increasing i for the forward sweep,
+c and after that reversing the direction for the backsubstitution
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c FORWARD ELIMINATION
+c---------------------------------------------------------------------
+ do stage = 1, ncells
+ c = slice(2,stage)
+
+ jstart = 0
+ jend = cell_size(2,c)-1
+
+ isize = cell_size(1,c)
+ ksize = cell_size(3,c)
+ ip = cell_coord(1,c)-1
+ kp = cell_coord(3,c)-1
+
+ buffer_size = (isize-start(1,c)-end(1,c)) *
+ > (ksize-start(3,c)-end(3,c))
+
+ if ( stage .ne. 1) then
+
+c---------------------------------------------------------------------
+c if this is not the first processor in this row of cells,
+c receive data from predecessor containing the right hand
+c sides and the upper diagonal elements of the previous two rows
+c---------------------------------------------------------------------
+
+ call mpi_irecv(in_buffer, 22*buffer_size,
+ > dp_type, predecessor(2),
+ > DEFAULT_TAG, comm_solve,
+ > requests(1), error)
+
+c---------------------------------------------------------------------
+c communication has already been started.
+c compute the left hand side while waiting for the msg
+c---------------------------------------------------------------------
+ call lhsy(c)
+
+c---------------------------------------------------------------------
+c wait for pending communication to complete
+c This waits on the current receive and on the send
+c from the previous stage. They always come in pairs.
+c---------------------------------------------------------------------
+ call mpi_waitall(2, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c unpack the buffer
+c---------------------------------------------------------------------
+ j = jstart
+ j1 = jstart + 1
+ n = 0
+c---------------------------------------------------------------------
+c create a running pointer
+c---------------------------------------------------------------------
+ p = 0
+ do k = start(3,c), ksize-end(3,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+ > in_buffer(p+1) * lhs(i,j,k,n+1,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+ > in_buffer(p+2) * lhs(i,j,k,n+1,c)
+ do m = 1, 3
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
+ end do
+ d = in_buffer(p+6)
+ e = in_buffer(p+7)
+ do m = 1, 3
+ s(m) = in_buffer(p+7+m)
+ end do
+ r1 = lhs(i,j,k,n+2,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+ lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+ do m = 1, 3
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
+ end do
+ r2 = lhs(i,j1,k,n+1,c)
+ lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
+ lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
+ do m = 1, 3
+ rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2
+ end do
+ p = p + 10
+ end do
+ end do
+
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+ > in_buffer(p+1) * lhs(i,j,k,n+1,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+ > in_buffer(p+2) * lhs(i,j,k,n+1,c)
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > in_buffer(p+3) * lhs(i,j,k,n+1,c)
+ d = in_buffer(p+4)
+ e = in_buffer(p+5)
+ s(m) = in_buffer(p+6)
+ r1 = lhs(i,j,k,n+2,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+ lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
+ r2 = lhs(i,j1,k,n+1,c)
+ lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
+ lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
+ rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2
+ p = p + 6
+ end do
+ end do
+ end do
+
+ else
+
+c---------------------------------------------------------------------
+c if this IS the first cell, we still compute the lhs
+c---------------------------------------------------------------------
+ call lhsy(c)
+ endif
+
+c---------------------------------------------------------------------
+c perform the Thomas algorithm; first, FORWARD ELIMINATION
+c---------------------------------------------------------------------
+ n = 0
+
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = jstart, jend-2
+ do i = start(1,c), isize-end(1,c)-1
+ j1 = j + 1
+ j2 = j + 2
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ end do
+ lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
+ > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
+ > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
+ > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
+ end do
+ lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
+ > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
+ > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
+ > lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c The last two rows in this grid block are a bit different,
+c since they do not have two more rows available for the
+c elimination of off-diagonal entries
+c---------------------------------------------------------------------
+
+ j = jend - 1
+ j1 = jend
+ do k = start(3,c), ksize-end(3,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ end do
+ lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
+ > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
+ > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
+ > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
+ end do
+c---------------------------------------------------------------------
+c scale the last row immediately (some of this is
+c overkill in case this is the last cell)
+c---------------------------------------------------------------------
+ fac2 = 1.d0/lhs(i,j1,k,n+3,c)
+ lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
+ lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c)
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c do the u+c and the u-c factors
+c---------------------------------------------------------------------
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = jstart, jend-2
+ do i = start(1,c), isize-end(1,c)-1
+ j1 = j + 1
+ j2 = j + 2
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
+ > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
+ > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
+ rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
+ > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
+ lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
+ > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
+ > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
+ rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
+ > lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c And again the last two rows separately
+c---------------------------------------------------------------------
+ j = jend - 1
+ j1 = jend
+ do k = start(3,c), ksize-end(3,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
+ > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
+ > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
+ rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
+ > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
+c---------------------------------------------------------------------
+c Scale the last row immediately (some of this is overkill
+c if this is the last cell)
+c---------------------------------------------------------------------
+ fac2 = 1.d0/lhs(i,j1,k,n+3,c)
+ lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
+ lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)
+ rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c)
+
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c send information to the next processor, except when this
+c is the last grid block;
+c---------------------------------------------------------------------
+
+ if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c create a running pointer for the send buffer
+c---------------------------------------------------------------------
+ p = 0
+ n = 0
+ do k = start(3,c), ksize-end(3,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ do j = jend-1, jend
+ out_buffer(p+1) = lhs(i,j,k,n+4,c)
+ out_buffer(p+2) = lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ out_buffer(p+2+m) = rhs(i,j,k,m,c)
+ end do
+ p = p+5
+ end do
+ end do
+ end do
+
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ do j = jend-1, jend
+ out_buffer(p+1) = lhs(i,j,k,n+4,c)
+ out_buffer(p+2) = lhs(i,j,k,n+5,c)
+ out_buffer(p+3) = rhs(i,j,k,m,c)
+ p = p + 3
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c pack and send the buffer
+c---------------------------------------------------------------------
+ call mpi_isend(out_buffer, 22*buffer_size,
+ > dp_type, successor(2),
+ > DEFAULT_TAG, comm_solve,
+ > requests(2), error)
+
+ endif
+ end do
+
+c---------------------------------------------------------------------
+c now go in the reverse direction
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c BACKSUBSTITUTION
+c---------------------------------------------------------------------
+ do stage = ncells, 1, -1
+ c = slice(2,stage)
+
+ jstart = 0
+ jend = cell_size(2,c)-1
+
+ isize = cell_size(1,c)
+ ksize = cell_size(3,c)
+ ip = cell_coord(1,c)-1
+ kp = cell_coord(3,c)-1
+
+ buffer_size = (isize-start(1,c)-end(1,c)) *
+ > (ksize-start(3,c)-end(3,c))
+
+ if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c if this is not the starting cell in this row of cells,
+c wait for a message to be received, containing the
+c solution of the previous two stations
+c---------------------------------------------------------------------
+
+ call mpi_irecv(in_buffer, 10*buffer_size,
+ > dp_type, successor(2),
+ > DEFAULT_TAG, comm_solve,
+ > requests(1), error)
+
+
+c---------------------------------------------------------------------
+c communication has already been started
+c while waiting, do the block-diagonal inversion for the
+c cell that was just finished
+c---------------------------------------------------------------------
+
+ call pinvr(slice(2,stage+1))
+
+c---------------------------------------------------------------------
+c wait for pending communication to complete
+c---------------------------------------------------------------------
+ call mpi_waitall(2, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c unpack the buffer for the first three factors
+c---------------------------------------------------------------------
+ n = 0
+ p = 0
+ j = jend
+ j1 = j - 1
+ do m = 1, 3
+ do k = start(3,c), ksize-end(3,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ sm1 = in_buffer(p+1)
+ sm2 = in_buffer(p+2)
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*sm1 -
+ > lhs(i,j,k,n+5,c)*sm2
+ rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
+ > lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) -
+ > lhs(i,j1,k,n+5,c) * sm1
+ p = p + 2
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c now unpack the buffer for the remaining two factors
+c---------------------------------------------------------------------
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ sm1 = in_buffer(p+1)
+ sm2 = in_buffer(p+2)
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*sm1 -
+ > lhs(i,j,k,n+5,c)*sm2
+ rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
+ > lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) -
+ > lhs(i,j1,k,n+5,c) * sm1
+ p = p + 2
+ end do
+ end do
+ end do
+
+ else
+c---------------------------------------------------------------------
+c now we know this is the first grid block on the back sweep,
+c so we don't need a message to start the substitution.
+c---------------------------------------------------------------------
+
+ j = jend - 1
+ j1 = jend
+ n = 0
+ do m = 1, 3
+ do k = start(3,c), ksize-end(3,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
+ end do
+ end do
+ end do
+
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c Whether or not this is the last processor, we always have
+c to complete the back-substitution
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c The first three factors
+c---------------------------------------------------------------------
+ n = 0
+ do m = 1, 3
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = jend-2, jstart, -1
+ do i = start(1,c), isize-end(1,c)-1
+ j1 = j + 1
+ j2 = j + 2
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
+ > lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c And the remaining two
+c---------------------------------------------------------------------
+ do m = 4, 5
+ n = (m-3)*5
+ do k = start(3,c), ksize-end(3,c)-1
+ do j = jend-2, jstart, -1
+ do i = start(1,c), isize-end(1,c)-1
+ j1 = j + 1
+ j2 = j1 + 1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
+ > lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c send on information to the previous processor, if needed
+c---------------------------------------------------------------------
+ if (stage .ne. 1) then
+ j = jstart
+ j1 = jstart + 1
+ p = 0
+ do m = 1, 5
+ do k = start(3,c), ksize-end(3,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ out_buffer(p+1) = rhs(i,j,k,m,c)
+ out_buffer(p+2) = rhs(i,j1,k,m,c)
+ p = p + 2
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c pack and send the buffer
+c---------------------------------------------------------------------
+
+ call mpi_isend(out_buffer, 10*buffer_size,
+ > dp_type, predecessor(2),
+ > DEFAULT_TAG, comm_solve,
+ > requests(2), error)
+
+ endif
+
+c---------------------------------------------------------------------
+c If this was the last stage, do the block-diagonal inversion
+c---------------------------------------------------------------------
+ if (stage .eq. 1) call pinvr(c)
+
+ end do
+
+ return
+ end
+
+
+
+
+
+
+
--- /dev/null
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine z_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c this function performs the solution of the approximate factorization
+c step in the z-direction for all five matrix components
+c simultaneously. The Thomas algorithm is employed to solve the
+c systems for the z-lines. Boundary conditions are non-periodic
+c---------------------------------------------------------------------
+
+ include 'header.h'
+ include 'mpinpb.h'
+
+ integer i, j, k, stage, ip, jp, n, isize, jsize, kend, k1, k2,
+ > buffer_size, c, m, p, kstart, error,
+ > requests(2), statuses(MPI_STATUS_SIZE, 2)
+ double precision r1, r2, d, e, s(5), sm1, sm2,
+ > fac1, fac2
+
+c---------------------------------------------------------------------
+c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
+c on this node in the direction of increasing i for the forward sweep,
+c and after that reversing the direction for the backsubstitution
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c FORWARD ELIMINATION
+c---------------------------------------------------------------------
+ do stage = 1, ncells
+ c = slice(3,stage)
+
+ kstart = 0
+ kend = cell_size(3,c)-1
+
+ isize = cell_size(1,c)
+ jsize = cell_size(2,c)
+ ip = cell_coord(1,c)-1
+ jp = cell_coord(2,c)-1
+
+ buffer_size = (isize-start(1,c)-end(1,c)) *
+ > (jsize-start(2,c)-end(2,c))
+
+ if (stage .ne. 1) then
+
+
+c---------------------------------------------------------------------
+c if this is not the first processor in this row of cells,
+c receive data from predecessor containing the right hand
+c sides and the upper diagonal elements of the previous two rows
+c---------------------------------------------------------------------
+
+ call mpi_irecv(in_buffer, 22*buffer_size,
+ > dp_type, predecessor(3),
+ > DEFAULT_TAG, comm_solve,
+ > requests(1), error)
+
+
+c---------------------------------------------------------------------
+c communication has already been started.
+c compute the left hand side while waiting for the msg
+c---------------------------------------------------------------------
+ call lhsz(c)
+
+c---------------------------------------------------------------------
+c wait for pending communication to complete
+c---------------------------------------------------------------------
+ call mpi_waitall(2, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c unpack the buffer
+c---------------------------------------------------------------------
+ k = kstart
+ k1 = kstart + 1
+ n = 0
+
+c---------------------------------------------------------------------
+c create a running pointer
+c---------------------------------------------------------------------
+ p = 0
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+ > in_buffer(p+1) * lhs(i,j,k,n+1,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+ > in_buffer(p+2) * lhs(i,j,k,n+1,c)
+ do m = 1, 3
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
+ end do
+ d = in_buffer(p+6)
+ e = in_buffer(p+7)
+ do m = 1, 3
+ s(m) = in_buffer(p+7+m)
+ end do
+ r1 = lhs(i,j,k,n+2,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+ lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+ do m = 1, 3
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
+ end do
+ r2 = lhs(i,j,k1,n+1,c)
+ lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
+ lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
+ do m = 1, 3
+ rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2
+ end do
+ p = p + 10
+ end do
+ end do
+
+ do m = 4, 5
+ n = (m-3)*5
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
+ > in_buffer(p+1) * lhs(i,j,k,n+1,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
+ > in_buffer(p+2) * lhs(i,j,k,n+1,c)
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > in_buffer(p+3) * lhs(i,j,k,n+1,c)
+ d = in_buffer(p+4)
+ e = in_buffer(p+5)
+ s(m) = in_buffer(p+6)
+ r1 = lhs(i,j,k,n+2,c)
+ lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
+ lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
+ r2 = lhs(i,j,k1,n+1,c)
+ lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
+ lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
+ rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2
+ p = p + 6
+ end do
+ end do
+ end do
+
+ else
+
+c---------------------------------------------------------------------
+c if this IS the first cell, we still compute the lhs
+c---------------------------------------------------------------------
+ call lhsz(c)
+ endif
+
+c---------------------------------------------------------------------
+c perform the Thomas algorithm; first, FORWARD ELIMINATION
+c---------------------------------------------------------------------
+ n = 0
+
+ do k = kstart, kend-2
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ k1 = k + 1
+ k2 = k + 2
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ end do
+ lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
+ > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
+ > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
+ > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
+ end do
+ lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
+ > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
+ > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
+ > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c The last two rows in this grid block are a bit different,
+c since they do not have two more rows available for the
+c elimination of off-diagonal entries
+c---------------------------------------------------------------------
+ k = kend - 1
+ k1 = kend
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ end do
+ lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
+ > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
+ > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
+ > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
+ end do
+c---------------------------------------------------------------------
+c scale the last row immediately (some of this is
+c overkill in case this is the last cell)
+c---------------------------------------------------------------------
+ fac2 = 1.d0/lhs(i,j,k1,n+3,c)
+ lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
+ lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)
+ do m = 1, 3
+ rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c)
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c do the u+c and the u-c factors
+c---------------------------------------------------------------------
+ do m = 4, 5
+ n = (m-3)*5
+ do k = kstart, kend-2
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ k1 = k + 1
+ k2 = k + 2
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
+ > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
+ > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
+ rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
+ > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
+ lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
+ > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
+ > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
+ rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
+ > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c And again the last two rows separately
+c---------------------------------------------------------------------
+ k = kend - 1
+ k1 = kend
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ fac1 = 1.d0/lhs(i,j,k,n+3,c)
+ lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
+ lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
+ rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
+ lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
+ > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
+ lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
+ > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
+ rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
+ > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
+c---------------------------------------------------------------------
+c Scale the last row immediately (some of this is overkill
+c if this is the last cell)
+c---------------------------------------------------------------------
+ fac2 = 1.d0/lhs(i,j,k1,n+3,c)
+ lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
+ lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)
+ rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c)
+
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c send information to the next processor, except when this
+c is the last grid block,
+c---------------------------------------------------------------------
+
+ if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c create a running pointer for the send buffer
+c---------------------------------------------------------------------
+ p = 0
+ n = 0
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ do k = kend-1, kend
+ out_buffer(p+1) = lhs(i,j,k,n+4,c)
+ out_buffer(p+2) = lhs(i,j,k,n+5,c)
+ do m = 1, 3
+ out_buffer(p+2+m) = rhs(i,j,k,m,c)
+ end do
+ p = p+5
+ end do
+ end do
+ end do
+
+ do m = 4, 5
+ n = (m-3)*5
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ do k = kend-1, kend
+ out_buffer(p+1) = lhs(i,j,k,n+4,c)
+ out_buffer(p+2) = lhs(i,j,k,n+5,c)
+ out_buffer(p+3) = rhs(i,j,k,m,c)
+ p = p + 3
+ end do
+ end do
+ end do
+ end do
+
+
+ call mpi_isend(out_buffer, 22*buffer_size,
+ > dp_type, successor(3),
+ > DEFAULT_TAG, comm_solve,
+ > requests(2), error)
+
+ endif
+ end do
+
+c---------------------------------------------------------------------
+c now go in the reverse direction
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c BACKSUBSTITUTION
+c---------------------------------------------------------------------
+ do stage = ncells, 1, -1
+ c = slice(3,stage)
+
+ kstart = 0
+ kend = cell_size(3,c)-1
+
+ isize = cell_size(1,c)
+ jsize = cell_size(2,c)
+ ip = cell_coord(1,c)-1
+ jp = cell_coord(2,c)-1
+
+ buffer_size = (isize-start(1,c)-end(1,c)) *
+ > (jsize-start(2,c)-end(2,c))
+
+ if (stage .ne. ncells) then
+
+c---------------------------------------------------------------------
+c if this is not the starting cell in this row of cells,
+c wait for a message to be received, containing the
+c solution of the previous two stations
+c---------------------------------------------------------------------
+
+ call mpi_irecv(in_buffer, 10*buffer_size,
+ > dp_type, successor(3),
+ > DEFAULT_TAG, comm_solve,
+ > requests(1), error)
+
+
+c---------------------------------------------------------------------
+c communication has already been started
+c while waiting, do the block-diagonal inversion for the
+c cell that was just finished
+c---------------------------------------------------------------------
+
+ call tzetar(slice(3,stage+1))
+
+c---------------------------------------------------------------------
+c wait for pending communication to complete
+c---------------------------------------------------------------------
+ call mpi_waitall(2, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c unpack the buffer for the first three factors
+c---------------------------------------------------------------------
+ n = 0
+ p = 0
+ k = kend
+ k1 = k - 1
+ do m = 1, 3
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ sm1 = in_buffer(p+1)
+ sm2 = in_buffer(p+2)
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*sm1 -
+ > lhs(i,j,k,n+5,c)*sm2
+ rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
+ > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) -
+ > lhs(i,j,k1,n+5,c) * sm1
+ p = p + 2
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c now unpack the buffer for the remaining two factors
+c---------------------------------------------------------------------
+ do m = 4, 5
+ n = (m-3)*5
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ sm1 = in_buffer(p+1)
+ sm2 = in_buffer(p+2)
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*sm1 -
+ > lhs(i,j,k,n+5,c)*sm2
+ rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
+ > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) -
+ > lhs(i,j,k1,n+5,c) * sm1
+ p = p + 2
+ end do
+ end do
+ end do
+
+ else
+
+c---------------------------------------------------------------------
+c now we know this is the first grid block on the back sweep,
+c so we don't need a message to start the substitution.
+c---------------------------------------------------------------------
+
+ k = kend - 1
+ k1 = kend
+ n = 0
+ do m = 1, 3
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
+ end do
+ end do
+ end do
+
+ do m = 4, 5
+ n = (m-3)*5
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
+ end do
+ end do
+ end do
+ endif
+
+c---------------------------------------------------------------------
+c Whether or not this is the last processor, we always have
+c to complete the back-substitution
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c The first three factors
+c---------------------------------------------------------------------
+ n = 0
+ do m = 1, 3
+ do k = kend-2, kstart, -1
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ k1 = k + 1
+ k2 = k + 2
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
+ > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c And the remaining two
+c---------------------------------------------------------------------
+ do m = 4, 5
+ n = (m-3)*5
+ do k = kend-2, kstart, -1
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ k1 = k + 1
+ k2 = k + 2
+ rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
+ > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
+ > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
+ end do
+ end do
+ end do
+ end do
+
+c---------------------------------------------------------------------
+c send on information to the previous processor, if needed
+c---------------------------------------------------------------------
+ if (stage .ne. 1) then
+ k = kstart
+ k1 = kstart + 1
+ p = 0
+ do m = 1, 5
+ do j = start(2,c), jsize-end(2,c)-1
+ do i = start(1,c), isize-end(1,c)-1
+ out_buffer(p+1) = rhs(i,j,k,m,c)
+ out_buffer(p+2) = rhs(i,j,k1,m,c)
+ p = p + 2
+ end do
+ end do
+ end do
+
+ call mpi_isend(out_buffer, 10*buffer_size,
+ > dp_type, predecessor(3),
+ > DEFAULT_TAG, comm_solve,
+ > requests(2), error)
+
+ endif
+
+c---------------------------------------------------------------------
+c If this was the last stage, do the block-diagonal inversion
+c---------------------------------------------------------------------
+ if (stage .eq. 1) call tzetar(c)
+
+ end do
+
+ return
+ end
+
+
+
+
+
+
+
--- /dev/null
+/*****************************************************************/
+/****** C _ P R I N T _ R E S U L T S ******/
+/*****************************************************************/
+#include <stdlib.h>
+#include <stdio.h>
+
+void c_print_results( char *name,
+ char class,
+ int n1,
+ int n2,
+ int n3,
+ int niter,
+ int nprocs_compiled,
+ int nprocs_total,
+ double t,
+ double mops,
+ char *optype,
+ int passed_verification,
+ char *npbversion,
+ char *compiletime,
+ char *mpicc,
+ char *clink,
+ char *cmpi_lib,
+ char *cmpi_inc,
+ char *cflags,
+ char *clinkflags )
+{
+ char *evalue="1000";
+
+ printf( "\n\n %s Benchmark Completed\n", name );
+
+ printf( " Class = %c\n", class );
+
+ if( n3 == 0 ) {
+ long nn = n1;
+ if ( n2 != 0 ) nn *= n2;
+ printf( " Size = %12ld\n", nn ); /* as in IS */
+ }
+ else
+ printf( " Size = %3dx %3dx %3d\n", n1,n2,n3 );
+
+ printf( " Iterations = %12d\n", niter );
+
+ printf( " Time in seconds = %12.2f\n", t );
+
+ printf( " Total processes = %12d\n", nprocs_total );
+
+ if ( nprocs_compiled != 0 )
+ printf( " Compiled procs = %12d\n", nprocs_compiled );
+
+ printf( " Mop/s total = %12.2f\n", mops );
+
+ printf( " Mop/s/process = %12.2f\n", mops/((float) nprocs_total) );
+
+ printf( " Operation type = %24s\n", optype);
+
+ if( passed_verification )
+ printf( " Verification = SUCCESSFUL\n" );
+ else
+ printf( " Verification = UNSUCCESSFUL\n" );
+
+ printf( " Version = %12s\n", npbversion );
+
+ printf( " Compile date = %12s\n", compiletime );
+
+ printf( "\n Compile options:\n" );
+
+ printf( " MPICC = %s\n", mpicc );
+
+ printf( " CLINK = %s\n", clink );
+
+ printf( " CMPI_LIB = %s\n", cmpi_lib );
+
+ printf( " CMPI_INC = %s\n", cmpi_inc );
+
+ printf( " CFLAGS = %s\n", cflags );
+
+ printf( " CLINKFLAGS = %s\n", clinkflags );
+#ifdef SMP
+ evalue = getenv("MP_SET_NUMTHREADS");
+ printf( " MULTICPUS = %s\n", evalue );
+#endif
+
+ printf( "\n\n" );
+ printf( " Please send the results of this run to:\n\n" );
+ printf( " NPB Development Team\n" );
+ printf( " Internet: npb@nas.nasa.gov\n \n" );
+ printf( " If email is not available, send this to:\n\n" );
+ printf( " MS T27A-1\n" );
+ printf( " NASA Ames Research Center\n" );
+ printf( " Moffett Field, CA 94035-1000\n\n" );
+ printf( " Fax: 650-604-3957\n\n" );
+}
+
--- /dev/null
+
+#include "mpi.h"
+
+double start[64], elapsed[64];
+
+/*****************************************************************/
+/****** T I M E R _ C L E A R ******/
+/*****************************************************************/
+void timer_clear( int n )
+{
+ elapsed[n] = 0.0;
+}
+
+
+/*****************************************************************/
+/****** T I M E R _ S T A R T ******/
+/*****************************************************************/
+void timer_start( int n )
+{
+ start[n] = MPI_Wtime();
+}
+
+
+/*****************************************************************/
+/****** T I M E R _ S T O P ******/
+/*****************************************************************/
+void timer_stop( int n )
+{
+ double t, now;
+
+ now = MPI_Wtime();
+ t = now - start[n];
+ elapsed[n] += t;
+
+}
+
+
+/*****************************************************************/
+/****** T I M E R _ R E A D ******/
+/*****************************************************************/
+double timer_read( int n )
+{
+ return( elapsed[n] );
+}
+
--- /dev/null
+
+ subroutine print_results(name, class, n1, n2, n3, niter,
+ > nprocs_compiled, nprocs_total,
+ > t, mops, optype, verified, npbversion,
+ > compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
+
+ implicit none
+ character*2 name
+ character*1 class
+ integer n1, n2, n3, niter, nprocs_compiled, nprocs_total, j
+ double precision t, mops
+ character optype*24, size*15
+ logical verified
+ character*(*) npbversion, compiletime,
+ > cs1, cs2, cs3, cs4, cs5, cs6, cs7
+
+ write (*, 2) name
+ 2 format(//, ' ', A2, ' Benchmark Completed.')
+
+ write (*, 3) Class
+ 3 format(' Class = ', 12x, a12)
+
+c If this is not a grid-based problem (EP, FT, CG), then
+c we only print n1, which contains some measure of the
+c problem size. In that case, n2 and n3 are both zero.
+c Otherwise, we print the grid size n1xn2xn3
+
+ if ((n2 .eq. 0) .and. (n3 .eq. 0)) then
+ if (name(1:2) .eq. 'EP') then
+ write(size, '(f15.0)' ) 2.d0**n1
+ j = 15
+ if (size(j:j) .eq. '.') j = j - 1
+ write (*,42) size(1:j)
+ 42 format(' Size = ',9x, a15)
+ else
+ write (*,44) n1
+ 44 format(' Size = ',12x, i12)
+ endif
+ else
+ write (*, 4) n1,n2,n3
+ 4 format(' Size = ',9x, i4,'x',i4,'x',i4)
+ endif
+
+ write (*, 5) niter
+ 5 format(' Iterations = ', 12x, i12)
+
+ write (*, 6) t
+ 6 format(' Time in seconds = ',12x, f12.2)
+
+ write (*,7) nprocs_total
+ 7 format(' Total processes = ', 12x, i12)
+
+ write (*,8) nprocs_compiled
+ 8 format(' Compiled procs = ', 12x, i12)
+
+ write (*,9) mops
+ 9 format(' Mop/s total = ',12x, f12.2)
+
+ write (*,10) mops/float( nprocs_total )
+ 10 format(' Mop/s/process = ', 12x, f12.2)
+
+ write(*, 11) optype
+ 11 format(' Operation type = ', a24)
+
+ if (verified) then
+ write(*,12) ' SUCCESSFUL'
+ else
+ write(*,12) 'UNSUCCESSFUL'
+ endif
+ 12 format(' Verification = ', 12x, a)
+
+ write(*,13) npbversion
+ 13 format(' Version = ', 12x, a12)
+
+ write(*,14) compiletime
+ 14 format(' Compile date = ', 12x, a12)
+
+
+ write (*,121) cs1
+ 121 format(/, ' Compile options:', /,
+ > ' MPIF77 = ', A)
+
+ write (*,122) cs2
+ 122 format(' FLINK = ', A)
+
+ write (*,123) cs3
+ 123 format(' FMPI_LIB = ', A)
+
+ write (*,124) cs4
+ 124 format(' FMPI_INC = ', A)
+
+ write (*,125) cs5
+ 125 format(' FFLAGS = ', A)
+
+ write (*,126) cs6
+ 126 format(' FLINKFLAGS = ', A)
+
+ write(*, 127) cs7
+ 127 format(' RAND = ', A)
+
+ write (*,130)
+ 130 format(//' Please send the results of this run to:'//
+ > ' NPB Development Team '/
+ > ' Internet: npb@nas.nasa.gov'/
+ > ' '/
+ > ' If email is not available, send this to:'//
+ > ' MS T27A-1'/
+ > ' NASA Ames Research Center'/
+ > ' Moffett Field, CA 94035-1000'//
+ > ' Fax: 650-604-3957'//)
+
+
+ return
+ end
+
--- /dev/null
+//---------------------------------------------------------------------
+// This function is C verson of random number generator randdp.f
+//---------------------------------------------------------------------
+
+double randlc(X, A)
+double *X;
+double *A;
+{
+ static int KS=0;
+ static double R23, R46, T23, T46;
+ double T1, T2, T3, T4;
+ double A1;
+ double A2;
+ double X1;
+ double X2;
+ double Z;
+ int i, j;
+
+ if (KS == 0)
+ {
+ R23 = 1.0;
+ R46 = 1.0;
+ T23 = 1.0;
+ T46 = 1.0;
+
+ for (i=1; i<=23; i++)
+ {
+ R23 = 0.50 * R23;
+ T23 = 2.0 * T23;
+ }
+ for (i=1; i<=46; i++)
+ {
+ R46 = 0.50 * R46;
+ T46 = 2.0 * T46;
+ }
+ KS = 1;
+ }
+
+/* Break A into two parts such that A = 2^23 * A1 + A2 and set X = N. */
+
+ T1 = R23 * *A;
+ j = T1;
+ A1 = j;
+ A2 = *A - T23 * A1;
+
+/* Break X into two parts such that X = 2^23 * X1 + X2, compute
+ Z = A1 * X2 + A2 * X1 (mod 2^23), and then
+ X = 2^23 * Z + A2 * X2 (mod 2^46). */
+
+ T1 = R23 * *X;
+ j = T1;
+ X1 = j;
+ X2 = *X - T23 * X1;
+ T1 = A1 * X2 + A2 * X1;
+
+ j = R23 * T1;
+ T2 = j;
+ Z = T1 - T23 * T2;
+ T3 = T23 * Z + A2 * X2;
+ j = R46 * T3;
+ T4 = j;
+ *X = T3 - T46 * T4;
+ return(R46 * *X);
+}
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ double precision function randlc (x, a)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c This routine returns a uniform pseudorandom double precision number in the
+c range (0, 1) by using the linear congruential generator
+c
+c x_{k+1} = a x_k (mod 2^46)
+c
+c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
+c before repeating. The argument A is the same as 'a' in the above formula,
+c and X is the same as x_0. A and X must be odd double precision integers
+c in the range (1, 2^46). The returned value RANDLC is normalized to be
+c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
+c the new seed x_1, so that subsequent calls to RANDLC using the same
+c arguments will generate a continuous sequence.
+c
+c This routine should produce the same results on any computer with at least
+c 48 mantissa bits in double precision floating point data. On 64 bit
+c systems, double precision should be disabled.
+c
+c David H. Bailey October 26, 1990
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+ double precision r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z
+ parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23,
+ > t46 = t23 ** 2)
+
+c---------------------------------------------------------------------
+c Break A into two parts such that A = 2^23 * A1 + A2.
+c---------------------------------------------------------------------
+ t1 = r23 * a
+ a1 = int (t1)
+ a2 = a - t23 * a1
+
+c---------------------------------------------------------------------
+c Break X into two parts such that X = 2^23 * X1 + X2, compute
+c Z = A1 * X2 + A2 * X1 (mod 2^23), and then
+c X = 2^23 * Z + A2 * X2 (mod 2^46).
+c---------------------------------------------------------------------
+ t1 = r23 * x
+ x1 = int (t1)
+ x2 = x - t23 * x1
+ t1 = a1 * x2 + a2 * x1
+ t2 = int (r23 * t1)
+ z = t1 - t23 * t2
+ t3 = t23 * z + a2 * x2
+ t4 = int (r46 * t3)
+ x = t3 - t46 * t4
+ randlc = r46 * x
+
+ return
+ end
+
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine vranlc (n, x, a, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c This routine generates N uniform pseudorandom double precision numbers in
+c the range (0, 1) by using the linear congruential generator
+c
+c x_{k+1} = a x_k (mod 2^46)
+c
+c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
+c before repeating. The argument A is the same as 'a' in the above formula,
+c and X is the same as x_0. A and X must be odd double precision integers
+c in the range (1, 2^46). The N results are placed in Y and are normalized
+c to be between 0 and 1. X is updated to contain the new seed, so that
+c subsequent calls to VRANLC using the same arguments will generate a
+c continuous sequence. If N is zero, only initialization is performed, and
+c the variables X, A and Y are ignored.
+c
+c This routine is the standard version designed for scalar or RISC systems.
+c However, it should produce the same results on any single processor
+c computer with at least 48 mantissa bits in double precision floating point
+c data. On 64 bit systems, double precision should be disabled.
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+ integer i,n
+ double precision y,r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z
+ dimension y(*)
+ parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23,
+ > t46 = t23 ** 2)
+
+
+c---------------------------------------------------------------------
+c Break A into two parts such that A = 2^23 * A1 + A2.
+c---------------------------------------------------------------------
+ t1 = r23 * a
+ a1 = int (t1)
+ a2 = a - t23 * a1
+
+c---------------------------------------------------------------------
+c Generate N results. This loop is not vectorizable.
+c---------------------------------------------------------------------
+ do i = 1, n
+
+c---------------------------------------------------------------------
+c Break X into two parts such that X = 2^23 * X1 + X2, compute
+c Z = A1 * X2 + A2 * X1 (mod 2^23), and then
+c X = 2^23 * Z + A2 * X2 (mod 2^46).
+c---------------------------------------------------------------------
+ t1 = r23 * x
+ x1 = int (t1)
+ x2 = x - t23 * x1
+ t1 = a1 * x2 + a2 * x1
+ t2 = int (r23 * t1)
+ z = t1 - t23 * t2
+ t3 = t23 * z + a2 * x2
+ t4 = int (r46 * t3)
+ x = t3 - t46 * t4
+ y(i) = r46 * x
+ enddo
+
+ return
+ end
--- /dev/null
+c---------------------------------------------------------------------
+ double precision function randlc (x, a)
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c
+c This routine returns a uniform pseudorandom double precision number in the
+c range (0, 1) by using the linear congruential generator
+c
+c x_{k+1} = a x_k (mod 2^46)
+c
+c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
+c before repeating. The argument A is the same as 'a' in the above formula,
+c and X is the same as x_0. A and X must be odd double precision integers
+c in the range (1, 2^46). The returned value RANDLC is normalized to be
+c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
+c the new seed x_1, so that subsequent calls to RANDLC using the same
+c arguments will generate a continuous sequence.
+c
+c This routine should produce the same results on any computer with at least
+c 48 mantissa bits in double precision floating point data. On 64 bit
+c systems, double precision should be disabled.
+c
+c David H. Bailey October 26, 1990
+c
+c---------------------------------------------------------------------
+
+ implicit none
+
+ double precision r23,r46,t23,t46,a,x,t1,t2,t3,t4,a1,a2,x1,x2,z
+ parameter (r23 = 0.5d0 ** 23, r46 = r23 ** 2, t23 = 2.d0 ** 23,
+ > t46 = t23 ** 2)
+
+c---------------------------------------------------------------------
+c Break A into two parts such that A = 2^23 * A1 + A2.
+c---------------------------------------------------------------------
+ t1 = r23 * a
+ a1 = int (t1)
+ a2 = a - t23 * a1
+
+c---------------------------------------------------------------------
+c Break X into two parts such that X = 2^23 * X1 + X2, compute
+c Z = A1 * X2 + A2 * X1 (mod 2^23), and then
+c X = 2^23 * Z + A2 * X2 (mod 2^46).
+c---------------------------------------------------------------------
+ t1 = r23 * x
+ x1 = int (t1)
+ x2 = x - t23 * x1
+
+
+ t1 = a1 * x2 + a2 * x1
+ t2 = int (r23 * t1)
+ z = t1 - t23 * t2
+ t3 = t23 * z + a2 * x2
+ t4 = int (r46 * t3)
+ x = t3 - t46 * t4
+ randlc = r46 * x
+ return
+ end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine vranlc (n, x, a, y)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c This routine generates N uniform pseudorandom double precision numbers in
+c the range (0, 1) by using the linear congruential generator
+c
+c x_{k+1} = a x_k (mod 2^46)
+c
+c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
+c before repeating. The argument A is the same as 'a' in the above formula,
+c and X is the same as x_0. A and X must be odd double precision integers
+c in the range (1, 2^46). The N results are placed in Y and are normalized
+c to be between 0 and 1. X is updated to contain the new seed, so that
+c subsequent calls to RANDLC using the same arguments will generate a
+c continuous sequence.
+c
+c This routine generates the output sequence in batches of length NV, for
+c convenience on vector computers. This routine should produce the same
+c results on any computer with at least 48 mantissa bits in double precision
+c floating point data. On Cray systems, double precision should be disabled.
+c
+c David H. Bailey August 30, 1990
+c---------------------------------------------------------------------
+
+ integer n
+ double precision x, a, y(*)
+
+ double precision r23, r46, t23, t46
+ integer nv
+ parameter (r23 = 2.d0 ** (-23), r46 = r23 * r23, t23 = 2.d0 ** 23,
+ > t46 = t23 * t23, nv = 64)
+ double precision xv(nv), t1, t2, t3, t4, an, a1, a2, x1, x2, yy
+ integer n1, i, j
+ external randlc
+ double precision randlc
+
+c---------------------------------------------------------------------
+c Compute the first NV elements of the sequence using RANDLC.
+c---------------------------------------------------------------------
+ t1 = x
+ n1 = min (n, nv)
+
+ do i = 1, n1
+ xv(i) = t46 * randlc (t1, a)
+ enddo
+
+c---------------------------------------------------------------------
+c It is not necessary to compute AN, A1 or A2 unless N is greater than NV.
+c---------------------------------------------------------------------
+ if (n .gt. nv) then
+
+c---------------------------------------------------------------------
+c Compute AN = AA ^ NV (mod 2^46) using successive calls to RANDLC.
+c---------------------------------------------------------------------
+ t1 = a
+ t2 = r46 * a
+
+ do i = 1, nv - 1
+ t2 = randlc (t1, a)
+ enddo
+
+ an = t46 * t2
+
+c---------------------------------------------------------------------
+c Break AN into two parts such that AN = 2^23 * A1 + A2.
+c---------------------------------------------------------------------
+ t1 = r23 * an
+ a1 = aint (t1)
+ a2 = an - t23 * a1
+ endif
+
+c---------------------------------------------------------------------
+c Compute N pseudorandom results in batches of size NV.
+c---------------------------------------------------------------------
+ do j = 0, n - 1, nv
+ n1 = min (nv, n - j)
+
+c---------------------------------------------------------------------
+c Compute up to NV results based on the current seed vector XV.
+c---------------------------------------------------------------------
+ do i = 1, n1
+ y(i+j) = r46 * xv(i)
+ enddo
+
+c---------------------------------------------------------------------
+c If this is the last pass through the 140 loop, it is not necessary to
+c update the XV vector.
+c---------------------------------------------------------------------
+ if (j + n1 .eq. n) goto 150
+
+c---------------------------------------------------------------------
+c Update the XV vector by multiplying each element by AN (mod 2^46).
+c---------------------------------------------------------------------
+ do i = 1, nv
+ t1 = r23 * xv(i)
+ x1 = aint (t1)
+ x2 = xv(i) - t23 * x1
+ t1 = a1 * x2 + a2 * x1
+ t2 = aint (r23 * t1)
+ yy = t1 - t23 * t2
+ t3 = t23 * yy + a2 * x2
+ t4 = aint (r46 * t3)
+ xv(i) = t3 - t46 * t4
+ enddo
+
+ enddo
+
+c---------------------------------------------------------------------
+c Save the last seed in X so that subsequent calls to VRANLC will generate
+c a continuous sequence.
+c---------------------------------------------------------------------
+ 150 x = xv(n1)
+
+ return
+ end
+
+c----- end of program ------------------------------------------------
+
--- /dev/null
+ double precision function randlc(x, a)
+
+c---------------------------------------------------------------------
+c
+c This routine returns a uniform pseudorandom double precision number in the
+c range (0, 1) by using the linear congruential generator
+c
+c x_{k+1} = a x_k (mod 2^46)
+c
+c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
+c before repeating. The argument A is the same as 'a' in the above formula,
+c and X is the same as x_0. A and X must be odd double precision integers
+c in the range (1, 2^46). The returned value RANDLC is normalized to be
+c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
+c the new seed x_1, so that subsequent calls to RANDLC using the same
+c arguments will generate a continuous sequence.
+
+ implicit none
+ double precision x, a
+ integer*8 i246m1, Lx, La
+ double precision d2m46
+
+ parameter(d2m46=0.5d0**46)
+
+ save i246m1
+ data i246m1/X'00003FFFFFFFFFFF'/
+
+ Lx = X
+ La = A
+
+ Lx = iand(Lx*La,i246m1)
+ randlc = d2m46*dble(Lx)
+ x = dble(Lx)
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+ SUBROUTINE VRANLC (N, X, A, Y)
+ implicit none
+ integer n, i
+ double precision x, a, y(*)
+ integer*8 i246m1, Lx, La
+ double precision d2m46
+
+c This doesn't work, because the compiler does the calculation in 32
+c bits and overflows. No standard way (without f90 stuff) to specify
+c that the rhs should be done in 64 bit arithmetic.
+c parameter(i246m1=2**46-1)
+
+ parameter(d2m46=0.5d0**46)
+
+ save i246m1
+ data i246m1/X'00003FFFFFFFFFFF'/
+
+c Note that the v6 compiler on an R8000 does something stupid with
+c the above. Using the following instead (or various other things)
+c makes the calculation run almost 10 times as fast.
+c
+c save d2m46
+c data d2m46/0.0d0/
+c if (d2m46 .eq. 0.0d0) then
+c d2m46 = 0.5d0**46
+c endif
+
+ Lx = X
+ La = A
+ do i = 1, N
+ Lx = iand(Lx*La,i246m1)
+ y(i) = d2m46*dble(Lx)
+ end do
+ x = dble(Lx)
+
+ return
+ end
+
--- /dev/null
+ double precision function randlc(x, a)
+
+c---------------------------------------------------------------------
+c
+c This routine returns a uniform pseudorandom double precision number in the
+c range (0, 1) by using the linear congruential generator
+c
+c x_{k+1} = a x_k (mod 2^46)
+c
+c where 0 < x_k < 2^46 and 0 < a < 2^46. This scheme generates 2^44 numbers
+c before repeating. The argument A is the same as 'a' in the above formula,
+c and X is the same as x_0. A and X must be odd double precision integers
+c in the range (1, 2^46). The returned value RANDLC is normalized to be
+c between 0 and 1, i.e. RANDLC = 2^(-46) * x_1. X is updated to contain
+c the new seed x_1, so that subsequent calls to RANDLC using the same
+c arguments will generate a continuous sequence.
+
+ implicit none
+ double precision x, a
+ integer*8 Lx, La, a1, a2, x1, x2, xa
+ double precision d2m46
+ parameter(d2m46=0.5d0**46)
+
+ Lx = x
+ La = A
+ a1 = ibits(La, 23, 23)
+ a2 = ibits(La, 0, 23)
+ x1 = ibits(Lx, 23, 23)
+ x2 = ibits(Lx, 0, 23)
+ xa = ishft(ibits(a1*x2+a2*x1, 0, 23), 23) + a2*x2
+ Lx = ibits(xa,0, 46)
+ x = dble(Lx)
+ randlc = d2m46*x
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+ SUBROUTINE VRANLC (N, X, A, Y)
+ implicit none
+ integer n, i
+ double precision x, a, y(*)
+ integer*8 Lx, La, a1, a2, x1, x2, xa
+ double precision d2m46
+ parameter(d2m46=0.5d0**46)
+
+ Lx = X
+ La = A
+ a1 = ibits(La, 23, 23)
+ a2 = ibits(La, 0, 23)
+ do i = 1, N
+ x1 = ibits(Lx, 23, 23)
+ x2 = ibits(Lx, 0, 23)
+ xa = ishft(ibits(a1*x2+a2*x1, 0, 23), 23) + a2*x2
+ Lx = ibits(xa,0, 46)
+ y(i) = d2m46*dble(Lx)
+ end do
+ x = dble(Lx)
+ return
+ end
+
--- /dev/null
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine timer_clear(n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ integer n
+
+ double precision start(64), elapsed(64)
+ common /tt/ start, elapsed
+
+ elapsed(n) = 0.0
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine timer_start(n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ integer n
+ include 'mpif.h'
+ double precision start(64), elapsed(64)
+ common /tt/ start, elapsed
+
+ start(n) = MPI_Wtime()
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ subroutine timer_stop(n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ integer n
+ include 'mpif.h'
+ double precision start(64), elapsed(64)
+ common /tt/ start, elapsed
+ double precision t, now
+ now = MPI_Wtime()
+ t = now - start(n)
+ elapsed(n) = elapsed(n) + t
+
+ return
+ end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ double precision function timer_read(n)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+ implicit none
+ integer n
+ double precision start(64), elapsed(64)
+ common /tt/ start, elapsed
+
+ timer_read = elapsed(n)
+ return
+ end
+
--- /dev/null
+This directory contains examples of make.def files that were used
+by the NPB team in testing the benchmarks on different platforms.
+They can be used as starting points for make.def files for your
+own platform, but you may need to taylor them for best performance
+on your installation. A clean template can be found in directory
+`config'.
+Some examples of suite.def files are also provided.
\ No newline at end of file
--- /dev/null
+#This is for a DEC Alpha 8400. The code will execute on a
+#single processor
+#Warning: parallel make does not work properly in general
+MPIF77 = f77
+FLINK = f77
+#Optimization -O5 breaks SP; works fine for all other codes
+FFLAGS = -O4
+
+MPICC = cc
+CLINK = cc
+CFLAGS = -O5
+
+include ../config/make.dummy
+
+CC = cc -g
+BINDIR = ../bin
+
+RAND = randi8
--- /dev/null
+#This is for a generic single-processor SGI workstation
+MPIF77 = f77
+FLINK = f77
+FFLAGS = -O3
+
+MPICC = cc
+CLINK = cc
+CFLAGS = -O3
+
+include ../config/make.dummy
+
+CC = cc -g
+BINDIR = ../bin
+
+RAND = randi8
+
--- /dev/null
+# This is for a an SGI Origin 2000 or 3000 with vendor MPI. The Fortran
+# record length is specified, so it can be used for the I/O benchmark.
+# as well
+MPIF77 = f77
+FMPI_LIB = -lmpi
+FLINK = f77 -64
+FFLAGS = -O3 -64
+
+MPICC = cc
+CMPI_LIB = -lmpi
+CLINK = cc
+CFLAGS = -O3
+
+CC = cc -g
+BINDIR = ../bin
+
+RAND = randi8
+
+CONVERTFLAG = -DFORTRAN_REC_SIZE=4
+
--- /dev/null
+# This is for the SGI PowerChallenge Array at NASA Ames. mrf77 and
+# mrcc are local scripts that invoke the proper MPI library.
+MPIF77 = mrf77
+FLINK = mrf77
+FFLAGS = -O3 -OPT:fold_arith_limit=1204
+
+MPICC = mrcc
+CLINK = mrcc
+CFLAGS = -O3 -OPT:fold_arith_limit=1204
+
+CC = cc -g
+BINDIR = ../bin
+
+RAND = randi8
+
+
--- /dev/null
+#This is for the IBM SP2 at Ames; mrf77 and mrcc are local scripts
+MPIF77 = mrf77
+FLINK = mrf77
+FFLAGS = -O3
+FLINKFLAGS = -bmaxdata:0x60000000
+
+MPICC = mrcc
+CLINK = mrcc
+CFLAGS = -O3
+CLINKFLAGS = -bmaxdata:0x60000000
+
+CC = cc -g
+
+BINDIR = ../bin
+
+RAND = randi8
+
--- /dev/null
+# This is for a Sun SparcCenter or UltraEnterprise machine
+MPIF77 = f77
+FLINK = f77
+FMPI_LIB = -L<your mpich installation tree>/lib/solaris/ch_lfshmem -lmpi
+FMPI_INC = -I<your mpich installation tree>/include
+# sparc10,20 SparcCenter{1,2}000 (uname -m returns sun4m)
+# and f77 -V returns 4.0 or greater
+# FFLAGS = -fast -xtarget=super -xO4 -depend
+# Ultra1,2, UltraEnterprise servers (uname -m returns sun4u)
+FFLAGS = -fast -xtarget=ultra -xarch=v8plus -xO4 -depend
+FLINKFLAGS = -lmopt -lcopt -lsunmath
+
+MPICC = cc
+CLINK = cc
+CMPI_LIB = -L<your mpich installation tree>/lib/solaris/ch_lfshmem -lmpi
+CMPI_INC = -I<your mpich installation tree>/include
+# sparc10,20 SparcCenter{1,2}000 (uname -m returns sun4m)
+# and cc -V returns 4.0 or greater
+#CFLAGS = -fast -xtarget=super -xO4 -xdepend
+# Ultra1,2, UltraEnterprise servers (uname -m returns sun4u)
+CFLAGS = -fast -xtarget=ultra -xarch=v8plus -xO4 -xdepend
+CLINKFLAGS = -fast
+
+CC = cc -g
+
+BINDIR = ../bin
+
+# Cannot use randi8 or randi8-safe on a 32-but machine. Use double precision
+RAND = randdp
+
--- /dev/null
+#This is for the Cray T3D at the Jet Propulsion Laboratory
+MPIF77 = cf77
+FLINK = cf77
+FMPI_LIB = -L/usr/local/mpp/lib -lmpi
+FMPI_INC = -I/usr/local/mpp/lib/include/mpp
+FFLAGS = -dp -Wf-onoieeedivide -C cray-t3d
+#The following flags provide more effective optimization, but may
+#cause the random number generator randi8(_safe) to break in EP
+#FFLAGS = -dp -Wf-oaggress -Wf-onoieeedivide -C cray-t3d
+FLINKFLAGS = -Wl-Drdahead=on -C cray-t3d
+
+MPICC = cc
+CLINK = cc
+CMPI_LIB = -L/usr/local/mpp/lib -lmpi
+CMPI_INC = -I/usr/local/mpp/lib/include/mpp
+CFLAGS = -O3 -Tcray-t3d
+CLINKFLAGS = -Tcray-t3d
+
+CC = cc -g -Tcray-ymp
+BINDIR = ../bin
+
+CONVERTFLAG= -DCONVERTDOUBLE
+
+RAND = randi8
+
--- /dev/null
+#---------------------------------------------------------------------------
+#
+# SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS.
+#
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Items in this file will need to be changed for each platform.
+# (Note these definitions are inconsistent with NPB2.1.)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Parallel Fortran:
+#
+# For CG, EP, FT, MG, LU, SP and BT, which are in Fortran, the following must
+# be defined:
+#
+# MPIF77 - Fortran compiler
+# FFLAGS - Fortran compilation arguments
+# FMPI_INC - any -I arguments required for compiling MPI/Fortran
+# FLINK - Fortran linker
+# FLINKFLAGS - Fortran linker arguments
+# FMPI_LIB - any -L and -l arguments required for linking MPI/Fortran
+#
+# compilations are done with $(MPIF77) $(FMPI_INC) $(FFLAGS) or
+# $(MPIF77) $(FFLAGS)
+# linking is done with $(FLINK) $(FMPI_LIB) $(FLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the fortran compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPIF77 = mpif77
+# This links MPI fortran programs; usually the same as ${MPIF77}
+FLINK = $(MPIF77)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+FMPI_LIB =
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpif.h'
+#---------------------------------------------------------------------------
+FMPI_INC =
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for Fortran programs
+#---------------------------------------------------------------------------
+FFLAGS = -fast
+# FFLAGS = -g
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable
+# size usually go here.
+#---------------------------------------------------------------------------
+FLINKFLAGS = -fast
+
+
+#---------------------------------------------------------------------------
+# Parallel C:
+#
+# For IS, which is in C, the following must be defined:
+#
+# MPICC - C compiler
+# CFLAGS - C compilation arguments
+# CMPI_INC - any -I arguments required for compiling MPI/C
+# CLINK - C linker
+# CLINKFLAGS - C linker flags
+# CMPI_LIB - any -L and -l arguments required for linking MPI/C
+#
+# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or
+# $(MPICC) $(CFLAGS)
+# linking is done with $(CLINK) $(CMPI_LIB) $(CLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the C compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPICC = mpicc
+# This links MPI C programs; usually the same as ${MPICC}
+CLINK = $(MPICC)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+CMPI_LIB =
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpi.h'
+#---------------------------------------------------------------------------
+CMPI_INC =
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for C programs
+#---------------------------------------------------------------------------
+CFLAGS = -fast
+# CFLAGS = -g
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable
+# size usually go here.
+#---------------------------------------------------------------------------
+CLINKFLAGS = -fast
+
+
+#---------------------------------------------------------------------------
+# MPI dummy library:
+#
+# Uncomment if you want to use the MPI dummy library supplied by NAS instead
+# of the true message-passing library. The include file redefines several of
+# the above macros. It also invokes make in subdirectory MPI_dummy. Make
+# sure that no spaces or tabs precede include.
+#---------------------------------------------------------------------------
+# include ../config/make.dummy
+
+
+#---------------------------------------------------------------------------
+# Utilities C:
+#
+# This is the C compiler used to compile C utilities. Flags required by
+# this compiler go here also; typically there are few flags required; hence
+# there are no separate macros provided for such flags.
+#---------------------------------------------------------------------------
+CC = cc -g
+
+
+#---------------------------------------------------------------------------
+# Destination of executables, relative to subdirs of the main directory. .
+#---------------------------------------------------------------------------
+BINDIR = ../bin
+
+
+#---------------------------------------------------------------------------
+# Some machines (e.g. Crays) have 128-bit DOUBLE PRECISION numbers, which
+# is twice the precision required for the NPB suite. A compiler flag
+# (e.g. -dp) can usually be used to change DOUBLE PRECISION variables to
+# 64 bits, but the MPI library may continue to send 128 bits. Short of
+# recompiling MPI, the solution is to use MPI_REAL to send these 64-bit
+# numbers, and MPI_COMPLEX to send their complex counterparts. Uncomment
+# the following line to enable this substitution.
+#
+# NOTE: IF THE I/O BENCHMARK IS BEING BUILT, WE USE CONVERTFLAG TO
+# SPECIFIY THE FORTRAN RECORD LENGTH UNIT. IT IS A SYSTEM-SPECIFIC
+# VALUE (USUALLY 1 OR 4). UNCOMMENT THE SECOND LINE AND SUBSTITUTE
+# THE CORRECT VALUE FOR "length".
+# IF BOTH 128-BIT DOUBLE PRECISION NUMBERS AND I/O ARE TO BE ENABLED,
+# UNCOMMENT THE THIRD LINE AND SUBSTITUTE THE CORRECT VALUE FOR
+# "length"
+#---------------------------------------------------------------------------
+# CONVERTFLAG = -DCONVERTDOUBLE
+CONVERTFLAG = -DFORTRAN_REC_SIZE=1
+# CONVERTFLAG = -DCONVERTDOUBLE -DFORTRAN_REC_SIZE=length
+
+
+#---------------------------------------------------------------------------
+# The variable RAND controls which random number generator
+# is used. It is described in detail in Doc/README.install.
+# Use "randi8" unless there is a reason to use another one.
+# Other allowed values are "randi8_safe", "randdp" and "randdpvec"
+#---------------------------------------------------------------------------
+RAND = randi8
+# The following is highly reliable but may be slow:
+# RAND = randdp
+
--- /dev/null
+bt S 1
+bt S 4
+bt S 9
+bt S 16
+bt A 1
+bt A 4
+bt A 9
+bt A 16
+bt A 25
+bt A 36
+bt A 49
+bt A 64
+bt A 81
+bt A 100
+bt A 121
+bt B 1
+bt B 4
+bt B 9
+bt B 16
+bt B 25
+bt B 36
+bt B 49
+bt B 64
+bt B 81
+bt B 100
+bt B 121
+bt C 1
+bt C 4
+bt C 9
+bt C 16
+bt C 25
+bt C 36
+bt C 49
+bt C 64
+bt C 81
+bt C 100
+bt C 121
--- /dev/null
+cg S 1
+cg S 2
+cg S 4
+cg S 8
+cg S 16
+cg A 1
+cg A 2
+cg A 4
+cg A 8
+cg A 16
+cg A 32
+cg A 64
+cg A 128
+cg B 1
+cg B 2
+cg B 4
+cg B 8
+cg B 16
+cg B 32
+cg B 64
+cg B 128
+cg C 1
+cg C 2
+cg C 4
+cg C 8
+cg C 16
+cg C 32
+cg C 64
+cg C 128
--- /dev/null
+ep S 1
+ep S 2
+ep S 4
+ep S 8
+ep S 16
+ep A 1
+ep A 2
+ep A 4
+ep A 8
+ep A 16
+ep A 32
+ep A 64
+ep A 128
+ep B 1
+ep B 2
+ep B 4
+ep B 8
+ep B 16
+ep B 32
+ep B 64
+ep B 128
+ep C 1
+ep C 2
+ep C 4
+ep C 8
+ep C 16
+ep C 32
+ep C 64
+ep C 128
--- /dev/null
+ft S 1
+ft S 2
+ft S 4
+ft S 8
+ft S 16
+ft A 1
+ft A 2
+ft A 4
+ft A 8
+ft A 16
+ft A 32
+ft A 64
+ft A 128
+ft B 1
+ft B 2
+ft B 4
+ft B 8
+ft B 16
+ft B 32
+ft B 64
+ft B 128
+ft C 1
+ft C 2
+ft C 4
+ft C 8
+ft C 16
+ft C 32
+ft C 64
+ft C 128
--- /dev/null
+is S 1
+is S 2
+is S 4
+is S 8
+is S 16
+is A 1
+is A 2
+is A 4
+is A 8
+is A 16
+is A 32
+is A 64
+is A 128
+is B 1
+is B 2
+is B 4
+is B 8
+is B 16
+is B 32
+is B 64
+is B 128
+is C 1
+is C 2
+is C 4
+is C 8
+is C 16
+is C 32
+is C 64
+is C 128
--- /dev/null
+lu S 1
+lu S 2
+lu S 4
+lu S 8
+lu S 16
+lu A 1
+lu A 2
+lu A 4
+lu A 8
+lu A 16
+lu A 32
+lu A 64
+lu A 128
+lu B 1
+lu B 2
+lu B 4
+lu B 8
+lu B 16
+lu B 32
+lu B 64
+lu B 128
+lu C 1
+lu C 2
+lu C 4
+lu C 8
+lu C 16
+lu C 32
+lu C 64
+lu C 128
--- /dev/null
+mg S 1
+mg S 2
+mg S 4
+mg S 8
+mg S 16
+mg A 1
+mg A 2
+mg A 4
+mg A 8
+mg A 16
+mg A 32
+mg A 64
+mg A 128
+mg B 1
+mg B 2
+mg B 4
+mg B 8
+mg B 16
+mg B 32
+mg B 64
+mg B 128
+mg C 1
+mg C 2
+mg C 4
+mg C 8
+mg C 16
+mg C 32
+mg C 64
+mg C 128
--- /dev/null
+bt S 1
+cg S 1
+ep S 1
+ft S 1
+is S 1
+lu S 1
+mg S 1
+sp S 1
--- /dev/null
+sp S 1
+sp S 4
+sp S 9
+sp S 16
+sp A 1
+sp A 4
+sp A 9
+sp A 16
+sp A 25
+sp A 36
+sp A 49
+sp A 64
+sp A 81
+sp A 100
+sp A 121
+sp B 1
+sp B 4
+sp B 9
+sp B 16
+sp B 25
+sp B 36
+sp B 49
+sp B 64
+sp B 81
+sp B 100
+sp B 121
+sp C 1
+sp C 4
+sp C 9
+sp C 16
+sp C 25
+sp C 36
+sp C 49
+sp C 64
+sp C 81
+sp C 100
+sp C 121
--- /dev/null
+#---------------------------------------------------------------------------
+#
+# SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS.
+#
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Items in this file will need to be changed for each platform.
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Parallel Fortran:
+#
+# For CG, EP, FT, MG, LU, SP and BT, which are in Fortran, the following must
+# be defined:
+#
+# MPIF77 - Fortran compiler
+# FFLAGS - Fortran compilation arguments
+# FMPI_INC - any -I arguments required for compiling MPI/Fortran
+# FLINK - Fortran linker
+# FLINKFLAGS - Fortran linker arguments
+# FMPI_LIB - any -L and -l arguments required for linking MPI/Fortran
+#
+# compilations are done with $(MPIF77) $(FMPI_INC) $(FFLAGS) or
+# $(MPIF77) $(FFLAGS)
+# linking is done with $(FLINK) $(FMPI_LIB) $(FLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the fortran compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPIF77 = smpicc
+# This links MPI fortran programs; usually the same as ${MPIF77}
+FLINK = $(MPIF77)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+FMPI_LIB = -lgfortran
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpif.h'
+#---------------------------------------------------------------------------
+FMPI_INC = -I/usr/lib/openmpi/include
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for Fortran programs
+#---------------------------------------------------------------------------
+FFLAGS = -O2
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable
+# size usually go here.
+#---------------------------------------------------------------------------
+FLINKFLAGS = -O2
+
+
+#---------------------------------------------------------------------------
+# Parallel C:
+#
+# For IS, which is in C, the following must be defined:
+#
+# MPICC - C compiler
+# CFLAGS - C compilation arguments
+# CMPI_INC - any -I arguments required for compiling MPI/C
+# CLINK - C linker
+# CLINKFLAGS - C linker flags
+# CMPI_LIB - any -L and -l arguments required for linking MPI/C
+#
+# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or
+# $(MPICC) $(CFLAGS)
+# linking is done with $(CLINK) $(CMPI_LIB) $(CLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the C compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPICC = smpicc
+# This links MPI C programs; usually the same as ${MPICC}
+CLINK = $(MPICC)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+CMPI_LIB =
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpi.h'
+#---------------------------------------------------------------------------
+CMPI_INC =
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for C programs
+#---------------------------------------------------------------------------
+CFLAGS = -O2
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable
+# size usually go here.
+#---------------------------------------------------------------------------
+CLINKFLAGS = -O2
+
+
+#---------------------------------------------------------------------------
+# MPI dummy library:
+#
+# Uncomment if you want to use the MPI dummy library supplied by NAS instead
+# of the true message-passing library. The include file redefines several of
+# the above macros. It also invokes make in subdirectory MPI_dummy. Make
+# sure that no spaces or tabs precede include.
+#---------------------------------------------------------------------------
+# include ../config/make.dummy
+
+
+#---------------------------------------------------------------------------
+# Utilities C:
+#
+# This is the C compiler used to compile C utilities. Flags required by
+# this compiler go here also; typically there are few flags required; hence
+# there are no separate macros provided for such flags.
+#---------------------------------------------------------------------------
+CC = gcc -g
+
+
+#---------------------------------------------------------------------------
+# Destination of executables, relative to subdirs of the main directory. .
+#---------------------------------------------------------------------------
+BINDIR = ../bin
+
+
+#---------------------------------------------------------------------------
+# Some machines (e.g. Crays) have 128-bit DOUBLE PRECISION numbers, which
+# is twice the precision required for the NPB suite. A compiler flag
+# (e.g. -dp) can usually be used to change DOUBLE PRECISION variables to
+# 64 bits, but the MPI library may continue to send 128 bits. Short of
+# recompiling MPI, the solution is to use MPI_REAL to send these 64-bit
+# numbers, and MPI_COMPLEX to send their complex counterparts. Uncomment
+# the following line to enable this substitution.
+#
+# NOTE: IF THE I/O BENCHMARK IS BEING BUILT, WE USE CONVERTFLAG TO
+# SPECIFIY THE FORTRAN RECORD LENGTH UNIT. IT IS A SYSTEM-SPECIFIC
+# VALUE (USUALLY 1 OR 4). UNCOMMENT THE SECOND LINE AND SUBSTITUTE
+# THE CORRECT VALUE FOR "length".
+# IF BOTH 128-BIT DOUBLE PRECISION NUMBERS AND I/O ARE TO BE ENABLED,
+# UNCOMMENT THE THIRD LINE AND SUBSTITUTE THE CORRECT VALUE FOR
+# "length"
+#---------------------------------------------------------------------------
+# CONVERTFLAG = -DCONVERTDOUBLE
+# CONVERTFLAG = -DFORTRAN_REC_SIZE=length
+# CONVERTFLAG = -DCONVERTDOUBLE -DFORTRAN_REC_SIZE=length
+
+
+#---------------------------------------------------------------------------
+# The variable RAND controls which random number generator
+# is used. It is described in detail in README.install.
+# Use "randi8" unless there is a reason to use another one.
+# Other allowed values are "randi8_safe", "randdp" and "randdpvec"
+#---------------------------------------------------------------------------
+RAND = randi8
+# The following is highly reliable but may be slow:
+# RAND = randdp
+
--- /dev/null
+#---------------------------------------------------------------------------
+#
+# SITE- AND/OR PLATFORM-SPECIFIC DEFINITIONS.
+#
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Items in this file will need to be changed for each platform.
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# Parallel Fortran:
+#
+# For CG, EP, FT, MG, LU, SP and BT, which are in Fortran, the following must
+# be defined:
+#
+# MPIF77 - Fortran compiler
+# FFLAGS - Fortran compilation arguments
+# FMPI_INC - any -I arguments required for compiling MPI/Fortran
+# FLINK - Fortran linker
+# FLINKFLAGS - Fortran linker arguments
+# FMPI_LIB - any -L and -l arguments required for linking MPI/Fortran
+#
+# compilations are done with $(MPIF77) $(FMPI_INC) $(FFLAGS) or
+# $(MPIF77) $(FFLAGS)
+# linking is done with $(FLINK) $(FMPI_LIB) $(FLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the fortran compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPIF77 = f77
+# This links MPI fortran programs; usually the same as ${MPIF77}
+FLINK = $(MPIF77)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+FMPI_LIB = -L/usr/local/lib -lmpi
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpif.h'
+#---------------------------------------------------------------------------
+FMPI_INC = -I/usr/local/include
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for Fortran programs
+#---------------------------------------------------------------------------
+FFLAGS = -O
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable
+# size usually go here.
+#---------------------------------------------------------------------------
+FLINKFLAGS = -O
+
+
+#---------------------------------------------------------------------------
+# Parallel C:
+#
+# For IS, which is in C, the following must be defined:
+#
+# MPICC - C compiler
+# CFLAGS - C compilation arguments
+# CMPI_INC - any -I arguments required for compiling MPI/C
+# CLINK - C linker
+# CLINKFLAGS - C linker flags
+# CMPI_LIB - any -L and -l arguments required for linking MPI/C
+#
+# compilations are done with $(MPICC) $(CMPI_INC) $(CFLAGS) or
+# $(MPICC) $(CFLAGS)
+# linking is done with $(CLINK) $(CMPI_LIB) $(CLINKFLAGS)
+#---------------------------------------------------------------------------
+
+#---------------------------------------------------------------------------
+# This is the C compiler used for MPI programs
+#---------------------------------------------------------------------------
+MPICC = cc
+# This links MPI C programs; usually the same as ${MPICC}
+CLINK = $(MPICC)
+
+#---------------------------------------------------------------------------
+# These macros are passed to the linker to help link with MPI correctly
+#---------------------------------------------------------------------------
+CMPI_LIB = -L/usr/local/lib -lmpi
+
+#---------------------------------------------------------------------------
+# These macros are passed to the compiler to help find 'mpi.h'
+#---------------------------------------------------------------------------
+CMPI_INC = -I/usr/local/include
+
+#---------------------------------------------------------------------------
+# Global *compile time* flags for C programs
+#---------------------------------------------------------------------------
+CFLAGS = -O
+
+#---------------------------------------------------------------------------
+# Global *link time* flags. Flags for increasing maximum executable
+# size usually go here.
+#---------------------------------------------------------------------------
+CLINKFLAGS = -O
+
+
+#---------------------------------------------------------------------------
+# MPI dummy library:
+#
+# Uncomment if you want to use the MPI dummy library supplied by NAS instead
+# of the true message-passing library. The include file redefines several of
+# the above macros. It also invokes make in subdirectory MPI_dummy. Make
+# sure that no spaces or tabs precede include.
+#---------------------------------------------------------------------------
+# include ../config/make.dummy
+
+
+#---------------------------------------------------------------------------
+# Utilities C:
+#
+# This is the C compiler used to compile C utilities. Flags required by
+# this compiler go here also; typically there are few flags required; hence
+# there are no separate macros provided for such flags.
+#---------------------------------------------------------------------------
+CC = cc -g
+
+
+#---------------------------------------------------------------------------
+# Destination of executables, relative to subdirs of the main directory. .
+#---------------------------------------------------------------------------
+BINDIR = ../bin
+
+
+#---------------------------------------------------------------------------
+# Some machines (e.g. Crays) have 128-bit DOUBLE PRECISION numbers, which
+# is twice the precision required for the NPB suite. A compiler flag
+# (e.g. -dp) can usually be used to change DOUBLE PRECISION variables to
+# 64 bits, but the MPI library may continue to send 128 bits. Short of
+# recompiling MPI, the solution is to use MPI_REAL to send these 64-bit
+# numbers, and MPI_COMPLEX to send their complex counterparts. Uncomment
+# the following line to enable this substitution.
+#
+# NOTE: IF THE I/O BENCHMARK IS BEING BUILT, WE USE CONVERTFLAG TO
+# SPECIFIY THE FORTRAN RECORD LENGTH UNIT. IT IS A SYSTEM-SPECIFIC
+# VALUE (USUALLY 1 OR 4). UNCOMMENT THE SECOND LINE AND SUBSTITUTE
+# THE CORRECT VALUE FOR "length".
+# IF BOTH 128-BIT DOUBLE PRECISION NUMBERS AND I/O ARE TO BE ENABLED,
+# UNCOMMENT THE THIRD LINE AND SUBSTITUTE THE CORRECT VALUE FOR
+# "length"
+#---------------------------------------------------------------------------
+# CONVERTFLAG = -DCONVERTDOUBLE
+# CONVERTFLAG = -DFORTRAN_REC_SIZE=length
+# CONVERTFLAG = -DCONVERTDOUBLE -DFORTRAN_REC_SIZE=length
+
+
+#---------------------------------------------------------------------------
+# The variable RAND controls which random number generator
+# is used. It is described in detail in README.install.
+# Use "randi8" unless there is a reason to use another one.
+# Other allowed values are "randi8_safe", "randdp" and "randdpvec"
+#---------------------------------------------------------------------------
+RAND = randi8
+# The following is highly reliable but may be slow:
+# RAND = randdp
+
--- /dev/null
+FMPI_LIB = -L../MPI_dummy -lmpi
+FMPI_INC = -I../MPI_dummy
+CMPI_LIB = -L../MPI_dummy -lmpi
+CMPI_INC = -I../MPI_dummy
+default:: ${PROGRAM} libmpi.a
+libmpi.a:
+ cd ../MPI_dummy; $(MAKE) F77=$(MPIF77) CC=$(MPICC)
--- /dev/null
+# config/suite.def
+# This file is used to build several benchmarks with a single command.
+# Typing "make suite" in the main directory will build all the benchmarks
+# specified in this file.
+# Each line of this file contains a benchmark name, class, and number
+# of nodes. The name is one of "cg", "is", "ep", mg", "ft", "sp", "bt",
+# "lu", and "dt".
+# The class is one of "S", "W", "A", "B", "C", "D", and "E"
+# (except that no classes C, D and E for DT, and no class E for IS).
+# The number of nodes must be a legal number for a particular
+# benchmark. The utility which parses this file is primitive, so
+# formatting is inflexible. Separate name/class/number by tabs.
+# Comments start with "#" as the first character on a line.
+# No blank lines.
+# The following example builds 1 processor sample sizes of all benchmarks.
+ft S 1
+mg S 1
+sp S 1
+lu S 1
+bt S 1
+is S 1
+ep S 1
+cg S 1
+dt S 1
--- /dev/null
+include ../config/make.def
+
+# Note that COMPILE is also defined in make.common and should
+# be the same. We can't include make.common because it has a lot
+# of other garbage. LINK is not defined in make.common because
+# ${MPI_LIB} needs to go at the end of the line.
+FCOMPILE = $(MPIF77) -c $(FMPI_INC) $(FFLAGS)
+
+all: setparams
+
+# setparams creates an npbparam.h file for each benchmark
+# configuration. npbparams.h also contains info about how a benchmark
+# was compiled and linked
+
+setparams: setparams.c ../config/make.def
+ $(CC) ${CONVERTFLAG} -o setparams setparams.c
+
+
+clean:
+ -rm -f setparams setparams.h npbparams.h
+ -rm -f *~ *.o
+
--- /dev/null
+This directory contains utilities and files used by the
+build process. You should not need to change anything
+in this directory.
+
+Original Files
+--------------
+setparams.c:
+ Source for the setparams program. This program is used internally
+ in the build process to create the file "npbparams.h" for each
+ benchmark. npbparams.h contains Fortran or C parameters to build a
+ benchmark for a specific class and number of nodes. The setparams
+ program is never run directly by a user. Its invocation syntax is
+ "setparams benchmark-name nprocs class".
+ It examines the file "npbparams.h" in the current directory. If
+ the specified parameters are the same as those in the npbparams.h
+ file, nothing it changed. If the file does not exist or corresponds
+ to a different class/number of nodes, it is (re)built.
+ One of the more complicated things in npbparams.h is that it
+ contains, in a Fortran string, the compiler flags used to build a
+ benchmark, so that a benchmark can print out how it was compiled.
+
+make.common
+ A makefile segment that is included in each individual benchmark
+ program makefile. It sets up some standard macros (COMPILE, etc)
+ and makes sure everything is configured correctly (npbparams.h)
+
+Makefile
+ Builds setparams
+
+README
+ This file.
+
+
+Created files
+-------------
+
+setparams
+ See descriptions above
+
--- /dev/null
+PROGRAM = $(BINDIR)/$(BENCHMARK).$(CLASS).$(NPROCS)
+FCOMPILE = $(MPIF77) -c $(FMPI_INC) $(FFLAGS)
+CCOMPILE = $(MPICC) -c $(CMPI_INC) $(CFLAGS)
+
+# Class "U" is used internally by the setparams program to mean
+# "unknown". This means that if you don't specify CLASS=
+# on the command line, you'll get an error. It would be nice
+# to be able to avoid this, but we'd have to get information
+# from the setparams back to the make program, which isn't easy.
+CLASS=U
+NPROCS=1
+
+default:: ${PROGRAM}
+
+# This makes sure the configuration utility setparams
+# is up to date.
+# Note that this must be run every time, which is why the
+# target does not exist and is not created.
+# If you create a file called "config" you will break things.
+config:
+ @cd ../sys; ${MAKE} all
+ ../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS} ${SUBTYPE}
+
+COMMON=../common
+${COMMON}/${RAND}.o: ${COMMON}/${RAND}.f
+ cd ${COMMON}; ${FCOMPILE} ${RAND}.f
+${COMMON}/c_randdp.o: ${COMMON}/randdp.c
+ cd ${COMMON}; ${CCOMPILE} -o c_randdp.o randdp.c
+
+${COMMON}/print_results.o: ${COMMON}/print_results.f
+ cd ${COMMON}; ${FCOMPILE} print_results.f
+
+${COMMON}/c_print_results.o: ${COMMON}/c_print_results.c
+ cd ${COMMON}; ${CCOMPILE} c_print_results.c
+
+${COMMON}/timers.o: ${COMMON}/timers.f
+ cd ${COMMON}; ${FCOMPILE} timers.f
+
+${COMMON}/c_timers.o: ${COMMON}/c_timers.c
+ cd ${COMMON}; ${CCOMPILE} c_timers.c
+
+# Normally setparams updates npbparams.h only if the settings (CLASS/NPROCS)
+# have changed. However, we also want to update if the compile options
+# may have changed (set in ../config/make.def).
+npbparams.h: ../config/make.def
+ @ echo make.def modified. Rebuilding npbparams.h just in case
+ rm -f npbparams.h
+ ../sys/setparams ${BENCHMARK} ${NPROCS} ${CLASS} ${SUBTYPE}
+
+# So that "make benchmark-name" works
+${BENCHMARK}: default
+${BENCHMARKU}: default
+
+
--- /dev/null
+echo ' ========================================='
+echo ' = NAS Parallel Benchmarks 3.3 ='
+echo ' = MPI/F77/C ='
+echo ' ========================================='
+echo ''
--- /dev/null
+echo ''
+echo ' To make a NAS benchmark type '
+echo ''
+echo ' make <benchmark-name> NPROCS=<number> CLASS=<class> [SUBTYPE=<type>]'
+echo ''
+echo ' where <benchmark-name> is "bt", "cg", "ep", "ft", "is", "lu",'
+echo ' "mg", or "sp"'
+echo ' <number> is the number of processors'
+echo ' <class> is "S", "W", "A", "B", "C", or "D"'
+echo ''
+echo ' Only when making the I/O benchmark:'
+echo ''
+echo ' <benchmark-name> is "bt"'
+echo ' <number>, <class> as above'
+echo ' <type> is "full", "simple", "fortran", or "epio"'
+echo ''
+echo ' To make a set of benchmarks, create the file config/suite.def'
+echo ' according to the instructions in config/suite.def.template and type'
+echo ''
+echo ' make suite'
+echo ''
+echo ' ***************************************************************'
+echo ' * Remember to edit the file config/make.def for site specific *'
+echo ' * information as described in the README file *'
+echo ' ***************************************************************'
+
--- /dev/null
+/*
+ * This utility configures a NPB to be built for a specific number
+ * of nodes and a specific class. It creates a file "npbparams.h"
+ * in the source directory. This file keeps state information about
+ * which size of benchmark is currently being built (so that nothing
+ * if unnecessarily rebuilt) and defines (through PARAMETER statements)
+ * the number of nodes and class for which a benchmark is being built.
+
+ * The utility takes 3 arguments:
+ * setparams benchmark-name nprocs class
+ * benchmark-name is "sp", "bt", etc
+ * nprocs is the number of processors to run on
+ * class is the size of the benchmark
+ * These parameters are checked for the current benchmark. If they
+ * are invalid, this program prints a message and aborts.
+ * If the parameters are ok, the current npbsize.h (actually just
+ * the first line) is read in. If the new parameters are the same as
+ * the old, nothing is done, but an exit code is returned to force the
+ * user to specify (otherwise the make procedure succeeds but builds a
+ * binary of the wrong name). Otherwise the file is rewritten.
+ * Errors write a message (to stdout) and abort.
+ *
+ * This program makes use of two extra benchmark "classes"
+ * class "X" means an invalid specification. It is returned if
+ * there is an error parsing the config file.
+ * class "U" is an external specification meaning "unknown class"
+ *
+ * Unfortunately everything has to be case sensitive. This is
+ * because we can always convert lower to upper or v.v. but
+ * can't feed this information back to the makefile, so typing
+ * make CLASS=a and make CLASS=A will produce different binaries.
+ *
+ *
+ */
+
+#include <sys/types.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include <time.h>
+
+/*
+ * This is the master version number for this set of
+ * NPB benchmarks. It is in an obscure place so people
+ * won't accidentally change it.
+ */
+
+#define VERSION "3.3"
+
+/* controls verbose output from setparams */
+/* #define VERBOSE */
+
+#define FILENAME "npbparams.h"
+#define DESC_LINE "c NPROCS = %d CLASS = %c\n"
+#define BT_DESC_LINE "c NPROCS = %d CLASS = %c SUBTYPE = %s\n"
+#define DEF_CLASS_LINE "#define CLASS '%c'\n"
+#define DEF_NUM_PROCS_LINE "#define NUM_PROCS %d\n"
+#define FINDENT " "
+#define CONTINUE " > "
+
+#ifdef FORTRAN_REC_SIZE
+int fortran_rec_size = FORTRAN_REC_SIZE;
+#else
+int fortran_rec_size = 4;
+#endif
+
+void get_info(int argc, char *argv[], int *typep, int *nprocsp, char *classp,
+ int* subtypep);
+void check_info(int type, int nprocs, char class);
+void read_info(int type, int *nprocsp, char *classp, int *subtypep);
+void write_info(int type, int nprocs, char class, int subtype);
+void write_sp_info(FILE *fp, int nprocs, char class);
+void write_bt_info(FILE *fp, int nprocs, char class, int io);
+void write_lu_info(FILE *fp, int nprocs, char class);
+void write_mg_info(FILE *fp, int nprocs, char class);
+void write_cg_info(FILE *fp, int nprocs, char class);
+void write_ft_info(FILE *fp, int nprocs, char class);
+void write_ep_info(FILE *fp, int nprocs, char class);
+void write_ep_info_C(FILE *fp, int nprocs, char class); /* after C translation */
+void write_is_info(FILE *fp, int nprocs, char class);
+void write_dt_info(FILE *fp, int nprocs, char class);
+void write_compiler_info(int type, FILE *fp);
+void write_convertdouble_info(int type, FILE *fp);
+void check_line(char *line, char *label, char *val);
+int check_include_line(char *line, char *filename);
+void put_string(FILE *fp, char *name, char *val);
+void put_def_string(FILE *fp, char *name, char *val);
+void put_def_variable(FILE *fp, char *name, char *val);
+int isqrt(int i);
+int ilog2(int i);
+int ipow2(int i);
+
+enum benchmark_types {SP, BT, LU, MG, FT, IS, DT, EP, CG};
+enum iotypes { NONE = 0, FULL, SIMPLE, EPIO, FORTRAN};
+
+int main(int argc, char *argv[])
+{
+ int nprocs, nprocs_old, type;
+ char class, class_old;
+ int subtype = -1, old_subtype = -1;
+
+ /* Get command line arguments. Make sure they're ok. */
+ get_info(argc, argv, &type, &nprocs, &class, &subtype);
+ if (class != 'U') {
+#ifdef VERBOSE
+ printf("setparams: For benchmark %s: number of processors = %d class = %c\n",
+ argv[1], nprocs, class);
+#endif
+ check_info(type, nprocs, class);
+ }
+
+ /* Get old information. */
+ read_info(type, &nprocs_old, &class_old, &old_subtype);
+ if (class != 'U') {
+ if (class_old != 'X') {
+#ifdef VERBOSE
+ printf("setparams: old settings: number of processors = %d class = %c\n",
+ nprocs_old, class_old);
+#endif
+ }
+ } else {
+ printf("setparams:\n\
+ *********************************************************************\n\
+ * You must specify NPROCS and CLASS to build this benchmark *\n\
+ * For example, to build a class A benchmark for 4 processors, type *\n\
+ * make {benchmark-name} NPROCS=4 CLASS=A *\n\
+ *********************************************************************\n\n");
+
+ if (class_old != 'X') {
+#ifdef VERBOSE
+ printf("setparams: Previous settings were CLASS=%c NPROCS=%d\n",
+ class_old, nprocs_old);
+#endif
+ }
+ exit(1); /* exit on class==U */
+ }
+
+ /* Write out new information if it's different. */
+ if (nprocs != nprocs_old || class != class_old || subtype != old_subtype) {
+#ifdef VERBOSE
+ printf("setparams: Writing %s\n", FILENAME);
+#endif
+ write_info(type, nprocs, class, subtype);
+ } else {
+#ifdef VERBOSE
+ printf("setparams: Settings unchanged. %s unmodified\n", FILENAME);
+#endif
+ }
+
+ return 0;
+}
+
+
+/*
+ * get_info(): Get parameters from command line
+ */
+
+void get_info(int argc, char *argv[], int *typep, int *nprocsp, char *classp,
+ int *subtypep)
+{
+
+ if (argc < 4) {
+ printf("Usage: %s (%d) benchmark-name nprocs class\n", argv[0], argc);
+ exit(1);
+ }
+
+ *nprocsp = atoi(argv[2]);
+
+ *classp = *argv[3];
+
+ if (!strcmp(argv[1], "sp") || !strcmp(argv[1], "SP")) *typep = SP;
+ else if (!strcmp(argv[1], "ft") || !strcmp(argv[1], "FT")) *typep = FT;
+ else if (!strcmp(argv[1], "lu") || !strcmp(argv[1], "LU")) *typep = LU;
+ else if (!strcmp(argv[1], "mg") || !strcmp(argv[1], "MG")) *typep = MG;
+ else if (!strcmp(argv[1], "is") || !strcmp(argv[1], "IS")) *typep = IS;
+ else if (!strcmp(argv[1], "dt") || !strcmp(argv[1], "DT")) *typep = DT;
+ else if (!strcmp(argv[1], "ep") || !strcmp(argv[1], "EP")) *typep = EP;
+ else if (!strcmp(argv[1], "cg") || !strcmp(argv[1], "CG")) *typep = CG;
+ else if (!strcmp(argv[1], "bt") || !strcmp(argv[1], "BT")) {
+ *typep = BT;
+ if (argc != 5) {
+ /* printf("Usage: %s (%d) benchmark-name nprocs class\n", argv[0], argc); */
+ /* exit(1); */
+ *subtypep = NONE;
+ } else {
+ if (!strcmp(argv[4], "full") || !strcmp(argv[4], "FULL")) {
+ *subtypep = FULL;
+ } else if (!strcmp(argv[4], "simple") || !strcmp(argv[4], "SIMPLE")) {
+ *subtypep = SIMPLE;
+ } else if (!strcmp(argv[4], "epio") || !strcmp(argv[4], "EPIO")) {
+ *subtypep = EPIO;
+ } else if (!strcmp(argv[4], "fortran") || !strcmp(argv[4], "FORTRAN")) {
+ *subtypep = FORTRAN;
+ } else if (!strcmp(argv[4], "none") || !strcmp(argv[4], "NONE")) {
+ *subtypep = NONE;
+ } else {
+ printf("setparams: Error: unknown btio type %s\n", argv[4]);
+ exit(1);
+ }
+ }
+ } else {
+ printf("setparams: Error: unknown benchmark type %s\n", argv[1]);
+ exit(1);
+ }
+}
+
+/*
+ * check_info(): Make sure command line data is ok for this benchmark
+ */
+
+void check_info(int type, int nprocs, char class)
+{
+ int rootprocs, logprocs;
+
+ /* check number of processors */
+ if (nprocs <= 0) {
+ printf("setparams: Number of processors must be greater than zero\n");
+ exit(1);
+ }
+ switch(type) {
+
+ case SP:
+ case BT:
+ rootprocs = isqrt(nprocs);
+ if (rootprocs < 0) {
+ printf("setparams: Number of processors %d must be a square (1,4,9,...) for this benchmark",
+ nprocs);
+ exit(1);
+ }
+ if (class == 'S' && nprocs > 16) {
+ printf("setparams: BT and SP sample sizes cannot be run on more\n");
+ printf(" than 16 processors because the cell size would be too small.\n");
+ exit(1);
+ }
+ break;
+
+ case CG:
+ case FT:
+ case MG:
+ case IS:
+ case LU:
+ logprocs = ilog2(nprocs);
+ if (logprocs < 0) {
+ printf("setparams: Number of processors must be a power of two (1,2,4,...) for this benchmark\n");
+ exit(1);
+ }
+
+ break;
+
+ case EP:
+ case DT:
+ break;
+
+ default:
+ /* never should have gotten this far with a bad name */
+ printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type);
+ exit(1);
+ }
+
+ /* check class */
+ if (class != 'S' &&
+ class != 'W' &&
+ class != 'A' &&
+ class != 'B' &&
+ class != 'C' &&
+ class != 'D' &&
+ class != 'E') {
+ printf("setparams: Unknown benchmark class %c\n", class);
+ printf("setparams: Allowed classes are \"S\", \"W\", and \"A\" through \"E\"\n");
+ exit(1);
+ }
+
+ if (class == 'E' && (type == IS || type == DT)) {
+ printf("setparams: Benchmark class %c not defined for IS or DT\n", class);
+ exit(1);
+ }
+
+ if (class == 'D' && type == IS && nprocs < 4) {
+ printf("setparams: IS class D size cannot be run on less than 4 processors\n");
+ exit(1);
+ }
+}
+
+
+/*
+ * read_info(): Read previous information from file.
+ * Not an error if file doesn't exist, because this
+ * may be the first time we're running.
+ * Assumes the first line of the file is in a special
+ * format that we understand (since we wrote it).
+ */
+
+void read_info(int type, int *nprocsp, char *classp, int *subtypep)
+{
+ int nread = 0;
+ FILE *fp;
+ fp = fopen(FILENAME, "r");
+ if (fp == NULL) {
+#ifdef VERBOSE
+ printf("setparams: INFO: configuration file %s does not exist (yet)\n", FILENAME);
+#endif
+ goto abort;
+ }
+
+ /* first line of file contains info (fortran), first two lines (C) */
+
+ switch(type) {
+ case BT: {
+ char subtype_str[100];
+ nread = fscanf(fp, BT_DESC_LINE, nprocsp, classp, subtype_str);
+ if (nread != 3) {
+ if (nread != 2) {
+ printf("setparams: Error parsing config file %s. Ignoring previous settings\n", FILENAME);
+ goto abort;
+ }
+ *subtypep = 0;
+ break;
+ }
+ if (!strcmp(subtype_str, "full") || !strcmp(subtype_str, "FULL")) {
+ *subtypep = FULL;
+ } else if (!strcmp(subtype_str, "simple") ||
+ !strcmp(subtype_str, "SIMPLE")) {
+ *subtypep = SIMPLE;
+ } else if (!strcmp(subtype_str, "epio") || !strcmp(subtype_str, "EPIO")) {
+ *subtypep = EPIO;
+ } else if (!strcmp(subtype_str, "fortran") ||
+ !strcmp(subtype_str, "FORTRAN")) {
+ *subtypep = FORTRAN;
+ } else {
+ *subtypep = -1;
+ }
+ break;
+ }
+
+ case SP:
+ case FT:
+ case MG:
+ case LU:
+ //case EP:
+ case CG:
+ nread = fscanf(fp, DESC_LINE, nprocsp, classp);
+ if (nread != 2) {
+ printf("setparams: Error line %d parsing config file %s. Ignoring previous settings\n", __LINE__,FILENAME);
+ goto abort;
+ }
+ break;
+ case IS:
+ case EP:
+ case DT:
+ nread = fscanf(fp, DEF_CLASS_LINE, classp);
+ nread += fscanf(fp, DEF_NUM_PROCS_LINE, nprocsp);
+ if (nread != 2) {
+ printf("setparams: Error line %d parsing config file %s. Ignoring previous settings\n", __LINE__,FILENAME);
+ goto abort;
+ }
+ break;
+ default:
+ /* never should have gotten this far with a bad name */
+ printf("setparams: (Internal Error) Benchmark type %d unknown to this program\n", type);
+ exit(1);
+ }
+
+ fclose(fp);
+
+
+ return;
+
+ abort:
+ *nprocsp = -1;
+ *classp = 'X';
+ *subtypep = -1;
+ return;
+}
+
+
+/*
+ * write_info(): Write new information to config file.
+ * First line is in a special format so we can read
+ * it in again. Then comes a warning. The rest is all
+ * specific to a particular benchmark.
+ */
+
+void write_info(int type, int nprocs, char class, int subtype)
+{
+ FILE *fp;
+ char *BT_TYPES[] = {"NONE", "FULL", "SIMPLE", "EPIO", "FORTRAN"};
+
+ fp = fopen(FILENAME, "w");
+ if (fp == NULL) {
+ printf("setparams: Can't open file %s for writing\n", FILENAME);
+ exit(1);
+ }
+
+ switch(type) {
+ case BT:
+ /* Write out the header */
+ if (subtype == -1 || subtype == 0) {
+ fprintf(fp, DESC_LINE, nprocs, class);
+ } else {
+ fprintf(fp, BT_DESC_LINE, nprocs, class, BT_TYPES[subtype]);
+ }
+ /* Print out a warning so bozos don't mess with the file */
+ fprintf(fp, "\
+c \n\
+c \n\
+c This file is generated automatically by the setparams utility.\n\
+c It sets the number of processors and the class of the NPB\n\
+c in this directory. Do not modify it by hand.\n\
+c \n");
+
+ break;
+
+ case SP:
+ case FT:
+ case MG:
+ case LU:
+ //case EP:
+ case CG:
+ /* Write out the header */
+ fprintf(fp, DESC_LINE, nprocs, class);
+ /* Print out a warning so bozos don't mess with the file */
+ fprintf(fp, "\
+c \n\
+c \n\
+c This file is generated automatically by the setparams utility.\n\
+c It sets the number of processors and the class of the NPB\n\
+c in this directory. Do not modify it by hand.\n\
+c \n");
+
+ break;
+ case EP:
+ case IS:
+ case DT:
+ fprintf(fp, DEF_CLASS_LINE, class);
+ fprintf(fp, DEF_NUM_PROCS_LINE, nprocs);
+ fprintf(fp, "\
+/*\n\
+ This file is generated automatically by the setparams utility.\n\
+ It sets the number of processors and the class of the NPB\n\
+ in this directory. Do not modify it by hand. */\n\
+ \n");
+ break;
+ default:
+ printf("setparams: (Internal error): Unknown benchmark type %d\n",
+ type);
+ exit(1);
+ }
+
+ /* Now do benchmark-specific stuff */
+ switch(type) {
+ case SP:
+ write_sp_info(fp, nprocs, class);
+ break;
+ case LU:
+ write_lu_info(fp, nprocs, class);
+ break;
+ case MG:
+ write_mg_info(fp, nprocs, class);
+ break;
+ case IS:
+ write_is_info(fp, nprocs, class);
+ break;
+ case DT:
+ write_dt_info(fp, nprocs, class);
+ break;
+ case FT:
+ write_ft_info(fp, nprocs, class);
+ break;
+ case EP:
+ //write_ep_info(fp, nprocs, class);
+ write_ep_info_C(fp, nprocs, class);
+ break;
+ case CG:
+ write_cg_info(fp, nprocs, class);
+ break;
+ case BT:
+ write_bt_info(fp, nprocs, class, subtype);
+ break;
+ default:
+ printf("setparams: (Internal error): Unknown benchmark type %d\n", type);
+ exit(1);
+ }
+ write_convertdouble_info(type, fp);
+ write_compiler_info(type, fp);
+ fclose(fp);
+ return;
+}
+
+
+/*
+ * write_sp_info(): Write SP specific info to config file
+ */
+
+void write_sp_info(FILE *fp, int nprocs, char class)
+{
+ int maxcells, problem_size, niter;
+ char *dt;
+ maxcells = isqrt(nprocs);
+ if (class == 'S') { problem_size = 12; dt = "0.015d0"; niter = 100; }
+ else if (class == 'W') { problem_size = 36; dt = "0.0015d0"; niter = 400; }
+ else if (class == 'A') { problem_size = 64; dt = "0.0015d0"; niter = 400; }
+ else if (class == 'B') { problem_size = 102; dt = "0.001d0"; niter = 400; }
+ else if (class == 'C') { problem_size = 162; dt = "0.00067d0"; niter = 400; }
+ else if (class == 'D') { problem_size = 408; dt = "0.00030d0"; niter = 500; }
+ else if (class == 'E') { problem_size = 1020; dt = "0.0001d0"; niter = 500; }
+ else {
+ printf("setparams: Internal error: invalid class %c\n", class);
+ exit(1);
+ }
+ fprintf(fp, "%sinteger maxcells, problem_size, niter_default\n", FINDENT);
+ fprintf(fp, "%sparameter (maxcells=%d, problem_size=%d, niter_default=%d)\n",
+ FINDENT, maxcells, problem_size, niter);
+ fprintf(fp, "%sdouble precision dt_default\n", FINDENT);
+ fprintf(fp, "%sparameter (dt_default = %s)\n", FINDENT, dt);
+}
+
+/*
+ * write_bt_info(): Write BT specific info to config file
+ */
+
+void write_bt_info(FILE *fp, int nprocs, char class, int io)
+{
+ int maxcells, problem_size, niter, wr_interval;
+ char *dt;
+ maxcells = isqrt(nprocs);
+ if (class == 'S') { problem_size = 12; dt = "0.010d0"; niter = 60; }
+ else if (class == 'W') { problem_size = 24; dt = "0.0008d0"; niter = 200; }
+ else if (class == 'A') { problem_size = 64; dt = "0.0008d0"; niter = 200; }
+ else if (class == 'B') { problem_size = 102; dt = "0.0003d0"; niter = 200; }
+ else if (class == 'C') { problem_size = 162; dt = "0.0001d0"; niter = 200; }
+ else if (class == 'D') { problem_size = 408; dt = "0.00002d0"; niter = 250; }
+ else if (class == 'E') { problem_size = 1020; dt = "0.4d-5"; niter = 250; }
+ else {
+ printf("setparams: Internal error: invalid class %c\n", class);
+ exit(1);
+ }
+ wr_interval = 5;
+ fprintf(fp, "%sinteger maxcells, problem_size, niter_default\n", FINDENT);
+ fprintf(fp, "%sparameter (maxcells=%d, problem_size=%d, niter_default=%d)\n",
+ FINDENT, maxcells, problem_size, niter);
+ fprintf(fp, "%sdouble precision dt_default\n", FINDENT);
+ fprintf(fp, "%sparameter (dt_default = %s)\n", FINDENT, dt);
+ fprintf(fp, "%sinteger wr_default\n", FINDENT);
+ fprintf(fp, "%sparameter (wr_default = %d)\n", FINDENT, wr_interval);
+ fprintf(fp, "%sinteger iotype\n", FINDENT);
+ fprintf(fp, "%sparameter (iotype = %d)\n", FINDENT, io);
+ if (io) {
+ fprintf(fp, "%scharacter*(*) filenm\n", FINDENT);
+ switch (io) {
+ case FULL:
+ fprintf(fp, "%sparameter (filenm = 'btio.full.out')\n", FINDENT);
+ break;
+ case SIMPLE:
+ fprintf(fp, "%sparameter (filenm = 'btio.simple.out')\n", FINDENT);
+ break;
+ case EPIO:
+ fprintf(fp, "%sparameter (filenm = 'btio.epio.out')\n", FINDENT);
+ break;
+ case FORTRAN:
+ fprintf(fp, "%sparameter (filenm = 'btio.fortran.out')\n", FINDENT);
+ fprintf(fp, "%sinteger fortran_rec_sz\n", FINDENT);
+ fprintf(fp, "%sparameter (fortran_rec_sz = %d)\n",
+ FINDENT, fortran_rec_size);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+
+
+/*
+ * write_lu_info(): Write SP specific info to config file
+ */
+
+void write_lu_info(FILE *fp, int nprocs, char class)
+{
+ int isiz1, isiz2, itmax, inorm, problem_size;
+ int xdiv, ydiv; /* number of cells in x and y direction */
+ char *dt_default;
+
+ if (class == 'S') { problem_size = 12; dt_default = "0.5d0"; itmax = 50; }
+ else if (class == 'W') { problem_size = 33; dt_default = "1.5d-3"; itmax = 300; }
+ else if (class == 'A') { problem_size = 64; dt_default = "2.0d0"; itmax = 250; }
+ else if (class == 'B') { problem_size = 102; dt_default = "2.0d0"; itmax = 250; }
+ else if (class == 'C') { problem_size = 162; dt_default = "2.0d0"; itmax = 250; }
+ else if (class == 'D') { problem_size = 408; dt_default = "1.0d0"; itmax = 300; }
+ else if (class == 'E') { problem_size = 1020; dt_default = "0.5d0"; itmax = 300; }
+ else {
+ printf("setparams: Internal error: invalid class %c\n", class);
+ exit(1);
+ }
+ inorm = itmax;
+ xdiv = ydiv = ilog2(nprocs)/2;
+ if (xdiv+ydiv != ilog2(nprocs)) xdiv += 1;
+ xdiv = ipow2(xdiv); ydiv = ipow2(ydiv);
+ isiz1 = problem_size/xdiv; if (isiz1*xdiv < problem_size) isiz1++;
+ isiz2 = problem_size/ydiv; if (isiz2*ydiv < problem_size) isiz2++;
+
+
+ fprintf(fp, "\nc number of nodes for which this version is compiled\n");
+ fprintf(fp, "%sinteger nnodes_compiled\n", FINDENT);
+ fprintf(fp, "%sparameter (nnodes_compiled = %d)\n", FINDENT, nprocs);
+
+ fprintf(fp, "\nc full problem size\n");
+ fprintf(fp, "%sinteger isiz01, isiz02, isiz03\n", FINDENT);
+ fprintf(fp, "%sparameter (isiz01=%d, isiz02=%d, isiz03=%d)\n",
+ FINDENT, problem_size, problem_size, problem_size);
+
+ fprintf(fp, "\nc sub-domain array size\n");
+ fprintf(fp, "%sinteger isiz1, isiz2, isiz3\n", FINDENT);
+ fprintf(fp, "%sparameter (isiz1=%d, isiz2=%d, isiz3=isiz03)\n",
+ FINDENT, isiz1, isiz2);
+
+ fprintf(fp, "\nc number of iterations and how often to print the norm\n");
+ fprintf(fp, "%sinteger itmax_default, inorm_default\n", FINDENT);
+ fprintf(fp, "%sparameter (itmax_default=%d, inorm_default=%d)\n",
+ FINDENT, itmax, inorm);
+
+ fprintf(fp, "%sdouble precision dt_default\n", FINDENT);
+ fprintf(fp, "%sparameter (dt_default = %s)\n", FINDENT, dt_default);
+
+}
+
+/*
+ * write_mg_info(): Write MG specific info to config file
+ */
+
+void write_mg_info(FILE *fp, int nprocs, char class)
+{
+ int problem_size, nit, log2_size, log2_nprocs, lt_default, lm;
+ int ndim1, ndim2, ndim3;
+ if (class == 'S') { problem_size = 32; nit = 4; }
+ else if (class == 'W') { problem_size = 128; nit = 4; }
+ else if (class == 'A') { problem_size = 256; nit = 4; }
+ else if (class == 'B') { problem_size = 256; nit = 20; }
+ else if (class == 'C') { problem_size = 512; nit = 20; }
+ else if (class == 'D') { problem_size = 1024; nit = 50; }
+ else if (class == 'E') { problem_size = 2048; nit = 50; }
+ else {
+ printf("setparams: Internal error: invalid class type %c\n", class);
+ exit(1);
+ }
+ log2_size = ilog2(problem_size);
+ log2_nprocs = ilog2(nprocs);
+ /* lt is log of largest total dimension */
+ lt_default = log2_size;
+ /* log of log of maximum dimension on a node */
+ lm = log2_size - log2_nprocs/3;
+ ndim1 = lm;
+ ndim3 = log2_size - (log2_nprocs+2)/3;
+ ndim2 = log2_size - (log2_nprocs+1)/3;
+
+ fprintf(fp, "%sinteger nprocs_compiled\n", FINDENT);
+ fprintf(fp, "%sparameter (nprocs_compiled = %d)\n", FINDENT, nprocs);
+ fprintf(fp, "%sinteger nx_default, ny_default, nz_default\n", FINDENT);
+ fprintf(fp, "%sparameter (nx_default=%d, ny_default=%d, nz_default=%d)\n",
+ FINDENT, problem_size, problem_size, problem_size);
+ fprintf(fp, "%sinteger nit_default, lm, lt_default\n", FINDENT);
+ fprintf(fp, "%sparameter (nit_default=%d, lm = %d, lt_default=%d)\n",
+ FINDENT, nit, lm, lt_default);
+ fprintf(fp, "%sinteger debug_default\n", FINDENT);
+ fprintf(fp, "%sparameter (debug_default=%d)\n", FINDENT, 0);
+ fprintf(fp, "%sinteger ndim1, ndim2, ndim3\n", FINDENT);
+ fprintf(fp, "%sparameter (ndim1 = %d, ndim2 = %d, ndim3 = %d)\n",
+ FINDENT, ndim1, ndim2, ndim3);
+}
+
+
+/*
+ * write_dt_info(): Write DT specific info to config file
+ */
+
+void write_dt_info(FILE *fp, int nprocs, char class)
+{
+ int num_samples,deviation,num_sources;
+ if (class == 'S') { num_samples=1728; deviation=128; num_sources=4; }
+ else if (class == 'W') { num_samples=1728*8; deviation=128*2; num_sources=4*2; }
+ else if (class == 'A') { num_samples=1728*64; deviation=128*4; num_sources=4*4; }
+ else if (class == 'B') { num_samples=1728*512; deviation=128*8; num_sources=4*8; }
+ else if (class == 'C') { num_samples=1728*4096; deviation=128*16; num_sources=4*16; }
+ else if (class == 'D') { num_samples=1728*4096*8; deviation=128*32; num_sources=4*32; }
+ else {
+ printf("setparams: Internal error: invalid class type %c\n", class);
+ exit(1);
+ }
+ fprintf(fp, "#define NUM_SAMPLES %d\n", num_samples);
+ fprintf(fp, "#define STD_DEVIATION %d\n", deviation);
+ fprintf(fp, "#define NUM_SOURCES %d\n", num_sources);
+}
+
+/*
+ * write_is_info(): Write IS specific info to config file
+ */
+
+void write_is_info(FILE *fp, int nprocs, char class)
+{
+ if( class != 'S' &&
+ class != 'W' &&
+ class != 'A' &&
+ class != 'B' &&
+ class != 'C' &&
+ class != 'D' )
+ {
+ printf("setparams: Internal error: invalid class type %c\n", class);
+ exit(1);
+ }
+}
+
+/*
+ * write_cg_info(): Write CG specific info to config file
+ */
+
+void write_cg_info(FILE *fp, int nprocs, char class)
+{
+ int na,nonzer,niter;
+ char *shift,*rcond="1.0d-1";
+ char *shiftS="10.",
+ *shiftW="12.",
+ *shiftA="20.",
+ *shiftB="60.",
+ *shiftC="110.",
+ *shiftD="500.",
+ *shiftE="1.5d3";
+
+ int num_proc_cols, num_proc_rows;
+
+
+ if( class == 'S' )
+ { na=1400; nonzer=7; niter=15; shift=shiftS; }
+ else if( class == 'W' )
+ { na=7000; nonzer=8; niter=15; shift=shiftW; }
+ else if( class == 'A' )
+ { na=14000; nonzer=11; niter=15; shift=shiftA; }
+ else if( class == 'B' )
+ { na=75000; nonzer=13; niter=75; shift=shiftB; }
+ else if( class == 'C' )
+ { na=150000; nonzer=15; niter=75; shift=shiftC; }
+ else if( class == 'D' )
+ { na=1500000; nonzer=21; niter=100; shift=shiftD; }
+ else if( class == 'E' )
+ { na=9000000; nonzer=26; niter=100; shift=shiftE; }
+ else
+ {
+ printf("setparams: Internal error: invalid class type %c\n", class);
+ exit(1);
+ }
+ fprintf( fp, "%sinteger na, nonzer, niter\n", FINDENT );
+ fprintf( fp, "%sdouble precision shift, rcond\n", FINDENT );
+ fprintf( fp, "%sparameter( na=%d,\n", FINDENT, na );
+ fprintf( fp, "%s nonzer=%d,\n", CONTINUE, nonzer );
+ fprintf( fp, "%s niter=%d,\n", CONTINUE, niter );
+ fprintf( fp, "%s shift=%s,\n", CONTINUE, shift );
+ fprintf( fp, "%s rcond=%s )\n", CONTINUE, rcond );
+
+
+ num_proc_cols = num_proc_rows = ilog2(nprocs)/2;
+ if (num_proc_cols+num_proc_rows != ilog2(nprocs)) num_proc_cols += 1;
+ num_proc_cols = ipow2(num_proc_cols); num_proc_rows = ipow2(num_proc_rows);
+
+ fprintf( fp, "\nc number of nodes for which this version is compiled\n" );
+ fprintf( fp, "%sinteger nnodes_compiled\n", FINDENT );
+ fprintf( fp, "%sparameter( nnodes_compiled = %d)\n", FINDENT, nprocs );
+ fprintf( fp, "%sinteger num_proc_cols, num_proc_rows\n", FINDENT );
+ fprintf( fp, "%sparameter( num_proc_cols=%d, num_proc_rows=%d )\n",
+ FINDENT,
+ num_proc_cols,
+ num_proc_rows );
+}
+
+
+/*
+ * write_ft_info(): Write FT specific info to config file
+ */
+
+void write_ft_info(FILE *fp, int nprocs, char class)
+{
+ /* easiest way (given the way the benchmark is written)
+ * is to specify log of number of grid points in each
+ * direction m1, m2, m3. nt is the number of iterations
+ */
+ int nx, ny, nz, maxdim, niter;
+ if (class == 'S') { nx = 64; ny = 64; nz = 64; niter = 6;}
+ else if (class == 'W') { nx = 128; ny = 128; nz = 32; niter = 6;}
+ else if (class == 'A') { nx = 256; ny = 256; nz = 128; niter = 6;}
+ else if (class == 'B') { nx = 512; ny = 256; nz = 256; niter =20;}
+ else if (class == 'C') { nx = 512; ny = 512; nz = 512; niter =20;}
+ else if (class == 'D') { nx = 2048; ny = 1024; nz = 1024; niter =25;}
+ else if (class == 'E') { nx = 4096; ny = 2048; nz = 2048; niter =25;}
+ else {
+ printf("setparams: Internal error: invalid class type %c\n", class);
+ exit(1);
+ }
+ maxdim = nx;
+ if (ny > maxdim) maxdim = ny;
+ if (nz > maxdim) maxdim = nz;
+ fprintf(fp, "%sinteger nx, ny, nz, maxdim, niter_default, ntdivnp, np_min\n", FINDENT);
+ fprintf(fp, "%sparameter (nx=%d, ny=%d, nz=%d, maxdim=%d)\n",
+ FINDENT, nx, ny, nz, maxdim);
+ fprintf(fp, "%sparameter (niter_default=%d)\n", FINDENT, niter);
+ fprintf(fp, "%sparameter (np_min = %d)\n", FINDENT, nprocs);
+ fprintf(fp, "%sparameter (ntdivnp=((nx*ny)/np_min)*nz)\n", FINDENT);
+ fprintf(fp, "%sdouble precision ntotal_f\n", FINDENT);
+ fprintf(fp, "%sparameter (ntotal_f=1.d0*nx*ny*nz)\n", FINDENT);
+}
+
+/*
+ * write_ep_info(): Write EP specific info to config file
+ */
+
+void write_ep_info(FILE *fp, int nprocs, char class)
+{
+ /* easiest way (given the way the benchmark is written)
+ * is to specify log of number of grid points in each
+ * direction m1, m2, m3. nt is the number of iterations
+ */
+ int m;
+ if (class == 'S') { m = 24; }
+ else if (class == 'W') { m = 25; }
+ else if (class == 'A') { m = 28; }
+ else if (class == 'B') { m = 30; }
+ else if (class == 'C') { m = 32; }
+ else if (class == 'D') { m = 36; }
+ else if (class == 'E') { m = 40; }
+ else {
+ printf("setparams: Internal error: invalid class type %c\n", class);
+ exit(1);
+ }
+ /* number of processors given by "npm" */
+
+
+ fprintf(fp, "%scharacter class\n",FINDENT);
+ fprintf(fp, "%sparameter (class =\'%c\')\n",
+ FINDENT, class);
+ fprintf(fp, "%sinteger m, npm\n", FINDENT);
+ fprintf(fp, "%sparameter (m=%d, npm=%d)\n",
+ FINDENT, m, nprocs);
+}
+/*
+ * write_ep_info_C(): Write EP specific info to config file
+ */
+
+
+void write_ep_info_C(FILE *fp, int nprocs, char class)
+{
+ /* easiest way (given the way the benchmark is written)
+ * is to specify log of number of grid points in each
+ * direction m1, m2, m3. nt is the number of iterations
+ */
+ int m;
+ if (class == 'S') { m = 24; }
+ else if (class == 'W') { m = 25; }
+ else if (class == 'A') { m = 28; }
+ else if (class == 'B') { m = 30; }
+ else if (class == 'C') { m = 32; }
+ else if (class == 'D') { m = 36; }
+ else if (class == 'E') { m = 40; }
+ else {
+ printf("setparams: Internal error: invalid class type %c\n", class);
+ exit(1);
+ }
+ /* number of processors given by "npm" */
+
+
+ fprintf(fp, "%schar *_class=\"%c\";\n",FINDENT,class);
+ fprintf(fp, "%sint m=%d;\n", FINDENT,m);
+ fprintf(fp, "%sint npm=%d;\n", FINDENT,nprocs);
+}
+/*
+ * This is a gross hack to allow the benchmarks to
+ * print out how they were compiled. Various other ways
+ * of doing this have been tried and they all fail on
+ * some machine - due to a broken "make" program, or
+ * F77 limitations, of whatever. Hopefully this will
+ * always work because it uses very portable C. Unfortunately
+ * it relies on parsing the make.def file - YUK.
+ * If your machine doesn't have <string.h> or <ctype.h>, happy hacking!
+ *
+ */
+
+#define VERBOSE
+#define LL 400
+#include <stdio.h>
+#define DEFFILE "../config/make.def"
+#define DEFAULT_MESSAGE "(none)"
+FILE *deffile;
+void write_compiler_info(int type, FILE *fp)
+{
+ char line[LL];
+ char mpif77[LL], flink[LL], fmpi_lib[LL], fmpi_inc[LL], fflags[LL], flinkflags[LL];
+ char compiletime[LL], randfile[LL];
+ char mpicc[LL], cflags[LL], clink[LL], clinkflags[LL],
+ cmpi_lib[LL], cmpi_inc[LL];
+ struct tm *tmp;
+ time_t t;
+ deffile = fopen(DEFFILE, "r");
+ if (deffile == NULL) {
+ printf("\n\
+setparams: File %s doesn't exist. To build the NAS benchmarks\n\
+ you need to create is according to the instructions\n\
+ in the README in the main directory and comments in \n\
+ the file config/make.def.template\n", DEFFILE);
+ exit(1);
+ }
+ strcpy(mpif77, DEFAULT_MESSAGE);
+ strcpy(flink, DEFAULT_MESSAGE);
+ strcpy(fmpi_lib, DEFAULT_MESSAGE);
+ strcpy(fmpi_inc, DEFAULT_MESSAGE);
+ strcpy(fflags, DEFAULT_MESSAGE);
+ strcpy(flinkflags, DEFAULT_MESSAGE);
+ strcpy(randfile, DEFAULT_MESSAGE);
+ strcpy(mpicc, DEFAULT_MESSAGE);
+ strcpy(cflags, DEFAULT_MESSAGE);
+ strcpy(clink, DEFAULT_MESSAGE);
+ strcpy(clinkflags, DEFAULT_MESSAGE);
+ strcpy(cmpi_lib, DEFAULT_MESSAGE);
+ strcpy(cmpi_inc, DEFAULT_MESSAGE);
+
+ while (fgets(line, LL, deffile) != NULL) {
+ if (*line == '#') continue;
+ /* yes, this is inefficient. but it's simple! */
+ check_line(line, "MPIF77", mpif77);
+ check_line(line, "FLINK", flink);
+ check_line(line, "FMPI_LIB", fmpi_lib);
+ check_line(line, "FMPI_INC", fmpi_inc);
+ check_line(line, "FFLAGS", fflags);
+ check_line(line, "FLINKFLAGS", flinkflags);
+ check_line(line, "RAND", randfile);
+ check_line(line, "MPICC", mpicc);
+ check_line(line, "CFLAGS", cflags);
+ check_line(line, "CLINK", clink);
+ check_line(line, "CLINKFLAGS", clinkflags);
+ check_line(line, "CMPI_LIB", cmpi_lib);
+ check_line(line, "CMPI_INC", cmpi_inc);
+ /* if the dummy library is used by including make.dummy, we set the
+ Fortran and C paths to libraries and headers accordingly */
+ if(check_include_line(line, "../config/make.dummy")) {
+ strcpy(fmpi_lib, "-L../MPI_dummy -lmpi");
+ strcpy(fmpi_inc, "-I../MPI_dummy");
+ strcpy(cmpi_lib, "-L../MPI_dummy -lmpi");
+ strcpy(cmpi_inc, "-I../MPI_dummy");
+ }
+ }
+
+
+ (void) time(&t);
+ tmp = localtime(&t);
+ (void) strftime(compiletime, (size_t)LL, "%d %b %Y", tmp);
+
+
+ switch(type) {
+ case FT:
+ case SP:
+ case BT:
+ case MG:
+ case LU:
+ //case EP:
+ case CG:
+ put_string(fp, "compiletime", compiletime);
+ put_string(fp, "npbversion", VERSION);
+ put_string(fp, "cs1", mpif77);
+ put_string(fp, "cs2", flink);
+ put_string(fp, "cs3", fmpi_lib);
+ put_string(fp, "cs4", fmpi_inc);
+ put_string(fp, "cs5", fflags);
+ put_string(fp, "cs6", flinkflags);
+ put_string(fp, "cs7", randfile);
+ break;
+ case IS:
+ case EP:
+ case DT:
+ put_def_string(fp, "COMPILETIME", compiletime);
+ put_def_string(fp, "NPBVERSION", VERSION);
+ put_def_string(fp, "MPICC", mpicc);
+ put_def_string(fp, "CFLAGS", cflags);
+ put_def_string(fp, "CLINK", clink);
+ put_def_string(fp, "CLINKFLAGS", clinkflags);
+ put_def_string(fp, "CMPI_LIB", cmpi_lib);
+ put_def_string(fp, "CMPI_INC", cmpi_inc);
+ break;
+ default:
+ printf("setparams: (Internal error): Unknown benchmark type %d\n",
+ type);
+ exit(1);
+ }
+
+}
+
+void check_line(char *line, char *label, char *val)
+{
+ char *original_line;
+ int n;
+ original_line = line;
+ /* compare beginning of line and label */
+ while (*label != '\0' && *line == *label) {
+ line++; label++;
+ }
+ /* if *label is not EOS, we must have had a mismatch */
+ if (*label != '\0') return;
+ /* if *line is not a space, actual label is longer than test label */
+ if (!isspace(*line) && *line != '=') return ;
+ /* skip over white space */
+ while (isspace(*line)) line++;
+ /* next char should be '=' */
+ if (*line != '=') return;
+ /* skip over white space */
+ while (isspace(*++line));
+ /* if EOS, nothing was specified */
+ if (*line == '\0') return;
+ /* finally we've come to the value */
+ strcpy(val, line);
+ /* chop off the newline at the end */
+ n = strlen(val)-1;
+ if (n >= 0 && val[n] == '\n')
+ val[n--] = '\0';
+ if (n >= 0 && val[n] == '\r')
+ val[n--] = '\0';
+ /* treat continuation */
+ while (val[n] == '\\' && fgets(original_line, LL, deffile)) {
+ line = original_line;
+ while (isspace(*line)) line++;
+ if (isspace(*original_line)) val[n++] = ' ';
+ while (*line && *line != '\n' && *line != '\r' && n < LL-1)
+ val[n++] = *line++;
+ val[n] = '\0';
+ n--;
+ }
+/* if (val[strlen(val) - 1] == '\\') {
+ printf("\n\
+setparams: Error in file make.def. Because of the way in which\n\
+ command line arguments are incorporated into the\n\
+ executable benchmark, you can't have any continued\n\
+ lines in the file make.def, that is, lines ending\n\
+ with the character \"\\\". Although it may be ugly, \n\
+ you should be able to reformat without continuation\n\
+ lines. The offending line is\n\
+ %s\n", original_line);
+ exit(1);
+ } */
+}
+
+int check_include_line(char *line, char *filename)
+{
+ char *include_string = "include";
+ /* compare beginning of line and "include" */
+ while (*include_string != '\0' && *line == *include_string) {
+ line++; include_string++;
+ }
+ /* if *include_string is not EOS, we must have had a mismatch */
+ if (*include_string != '\0') return(0);
+ /* if *line is not a space, first word is not "include" */
+ if (!isspace(*line)) return(0);
+ /* skip over white space */
+ while (isspace(*++line));
+ /* if EOS, nothing was specified */
+ if (*line == '\0') return(0);
+ /* next keyword should be name of include file in *filename */
+ while (*filename != '\0' && *line == *filename) {
+ line++; filename++;
+ }
+ if (*filename != '\0' ||
+ (*line != ' ' && *line != '\0' && *line !='\n')) return(0);
+ else return(1);
+}
+
+
+#define MAXL 46
+void put_string(FILE *fp, char *name, char *val)
+{
+ int len;
+ len = strlen(val);
+ if (len > MAXL) {
+ val[MAXL] = '\0';
+ val[MAXL-1] = '.';
+ val[MAXL-2] = '.';
+ val[MAXL-3] = '.';
+ len = MAXL;
+ }
+ fprintf(fp, "%scharacter*%d %s\n", FINDENT, len, name);
+ fprintf(fp, "%sparameter (%s=\'%s\')\n", FINDENT, name, val);
+}
+
+/* NOTE: is the ... stuff necessary in C? */
+void put_def_string(FILE *fp, char *name, char *val)
+{
+ int len;
+ len = strlen(val);
+ if (len > MAXL) {
+ val[MAXL] = '\0';
+ val[MAXL-1] = '.';
+ val[MAXL-2] = '.';
+ val[MAXL-3] = '.';
+ len = MAXL;
+ }
+ fprintf(fp, "#define %s \"%s\"\n", name, val);
+}
+
+void put_def_variable(FILE *fp, char *name, char *val)
+{
+ int len;
+ len = strlen(val);
+ if (len > MAXL) {
+ val[MAXL] = '\0';
+ val[MAXL-1] = '.';
+ val[MAXL-2] = '.';
+ val[MAXL-3] = '.';
+ len = MAXL;
+ }
+ fprintf(fp, "#define %s %s\n", name, val);
+}
+
+
+
+#if 0
+
+/* this version allows arbitrarily long lines but
+ * some compilers don't like that and they're rarely
+ * useful
+ */
+
+#define LINELEN 65
+void put_string(FILE *fp, char *name, char *val)
+{
+ int len, nlines, pos, i;
+ char line[100];
+ len = strlen(val);
+ nlines = len/LINELEN;
+ if (nlines*LINELEN < len) nlines++;
+ fprintf(fp, "%scharacter*%d %s\n", FINDENT, nlines*LINELEN, name);
+ fprintf(fp, "%sparameter (%s = \n", FINDENT, name);
+ for (i = 0; i < nlines; i++) {
+ pos = i*LINELEN;
+ if (i == 0) fprintf(fp, "%s\'", CONTINUE);
+ else fprintf(fp, "%s", CONTINUE);
+ /* number should be same as LINELEN */
+ fprintf(fp, "%.65s", val+pos);
+ if (i == nlines-1) fprintf(fp, "\')\n");
+ else fprintf(fp, "\n");
+ }
+}
+
+#endif
+
+
+/* integer square root. Return error if argument isn't
+ * a perfect square or is less than or equal to zero
+ */
+
+int isqrt(int i)
+{
+ int root, square;
+ if (i <= 0) return(-1);
+ square = 0;
+ for (root = 1; square <= i; root++) {
+ square = root*root;
+ if (square == i) return(root);
+ }
+ return(-1);
+}
+
+
+/* integer log base two. Return error is argument isn't
+ * a power of two or is less than or equal to zero
+ */
+
+int ilog2(int i)
+{
+ int log2;
+ int exp2 = 1;
+ if (i <= 0) return(-1);
+
+ for (log2 = 0; log2 < 20; log2++) {
+ if (exp2 == i) return(log2);
+ exp2 *= 2;
+ }
+ return(-1);
+}
+
+int ipow2(int i)
+{
+ int pow2 = 1;
+ if (i < 0) return(-1);
+ if (i == 0) return(1);
+ while(i--) pow2 *= 2;
+ return(pow2);
+}
+
+
+
+void write_convertdouble_info(int type, FILE *fp)
+{
+ switch(type) {
+ case SP:
+ case BT:
+ case LU:
+ case FT:
+ case MG:
+ //case EP:
+ case CG:
+ fprintf(fp, "%slogical convertdouble\n", FINDENT);
+#ifdef CONVERTDOUBLE
+ fprintf(fp, "%sparameter (convertdouble = .true.)\n", FINDENT);
+#else
+ fprintf(fp, "%sparameter (convertdouble = .false.)\n", FINDENT);
+#endif
+ break;
+ }
+}
--- /dev/null
+BEGIN { SMAKE = "make" } {
+ if ($1 !~ /^#/ && NF > 2) {
+ printf "cd `echo %s|tr '[a-z]' '[A-Z]'`; %s clean;", $1, SMAKE;
+ printf "%s CLASS=%s NPROCS=%s", SMAKE, $2, $3;
+ if ( NF > 3 ) {
+ if ( $4 ~ /^vec/ || $4 ~ /^VEC/ ) {
+ printf " VERSION=%s", $4;
+ if ( NF > 4 ) {
+ printf " SUBTYPE=%s", $5;
+ }
+ } else {
+ printf " SUBTYPE=%s", $4;
+ if ( NF > 4 ) {
+ printf " VERSION=%s", $5;
+ }
+ }
+ }
+ printf "; cd ..\n";
+ }
+}