+++ /dev/null
-SHELL=/bin/sh
-BENCHMARK=bt
-BENCHMARKU=BT
-VEC=
-
-include ../config/make.def
-
-
-OBJS = bt.o make_set.o initialize.o exact_solution.o exact_rhs.o \
- set_constants.o adi.o define.o copy_faces.o rhs.o solve_subs.o \
- x_solve$(VEC).o y_solve$(VEC).o z_solve$(VEC).o add.o error.o \
- verify.o setup_mpi.o \
- ${COMMON}/print_results.o ${COMMON}/timers.o
-
-include ../sys/make.common
-
-# npbparams.h is included by header.h
-# The following rule should do the trick but many make programs (not gmake)
-# will do the wrong thing and rebuild the world every time (because the
-# mod time on header.h is not changed. One solution would be to
-# touch header.h but this might cause confusion if someone has
-# accidentally deleted it. Instead, make the dependency on npbparams.h
-# explicit in all the lines below (even though dependence is indirect).
-
-# header.h: npbparams.h
-
-${PROGRAM}: config
- @if [ x$(VERSION) = xvec ] ; then \
- ${MAKE} VEC=_vec exec; \
- elif [ x$(VERSION) = xVEC ] ; then \
- ${MAKE} VEC=_vec exec; \
- else \
- ${MAKE} exec; \
- fi
-
-exec: $(OBJS)
- @if [ x$(SUBTYPE) = xfull ] ; then \
- ${MAKE} bt-full; \
- elif [ x$(SUBTYPE) = xFULL ] ; then \
- ${MAKE} bt-full; \
- elif [ x$(SUBTYPE) = xsimple ] ; then \
- ${MAKE} bt-simple; \
- elif [ x$(SUBTYPE) = xSIMPLE ] ; then \
- ${MAKE} bt-simple; \
- elif [ x$(SUBTYPE) = xfortran ] ; then \
- ${MAKE} bt-fortran; \
- elif [ x$(SUBTYPE) = xFORTRAN ] ; then \
- ${MAKE} bt-fortran; \
- elif [ x$(SUBTYPE) = xepio ] ; then \
- ${MAKE} bt-epio; \
- elif [ x$(SUBTYPE) = xEPIO ] ; then \
- ${MAKE} bt-epio; \
- else \
- ${MAKE} bt-bt; \
- fi
-
-bt-bt: ${OBJS} btio.o
- ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} btio.o ${FMPI_LIB}
-
-bt-full: ${OBJS} full_mpiio.o btio_common.o
- ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_full ${OBJS} btio_common.o full_mpiio.o ${FMPI_LIB}
-
-bt-simple: ${OBJS} simple_mpiio.o btio_common.o
- ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_simple ${OBJS} btio_common.o simple_mpiio.o ${FMPI_LIB}
-
-bt-fortran: ${OBJS} fortran_io.o btio_common.o
- ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.fortran_io ${OBJS} btio_common.o fortran_io.o ${FMPI_LIB}
-
-bt-epio: ${OBJS} epio.o btio_common.o
- ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.ep_io ${OBJS} btio_common.o epio.o ${FMPI_LIB}
-
-.f.o:
- ${FCOMPILE} $<
-
-.c.o:
- ${CCOMPILE} $<
-
-
-bt.o: bt.f header.h npbparams.h mpinpb.h
-make_set.o: make_set.f header.h npbparams.h mpinpb.h
-initialize.o: initialize.f header.h npbparams.h
-exact_solution.o: exact_solution.f header.h npbparams.h
-exact_rhs.o: exact_rhs.f header.h npbparams.h
-set_constants.o: set_constants.f header.h npbparams.h
-adi.o: adi.f header.h npbparams.h
-define.o: define.f header.h npbparams.h
-copy_faces.o: copy_faces.f header.h npbparams.h mpinpb.h
-rhs.o: rhs.f header.h npbparams.h
-x_solve$(VEC).o: x_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h
-y_solve$(VEC).o: y_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h
-z_solve$(VEC).o: z_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h
-solve_subs.o: solve_subs.f npbparams.h
-add.o: add.f header.h npbparams.h
-error.o: error.f header.h npbparams.h mpinpb.h
-verify.o: verify.f header.h npbparams.h mpinpb.h
-setup_mpi.o: setup_mpi.f mpinpb.h npbparams.h
-btio.o: btio.f header.h npbparams.h
-btio_common.o: btio_common.f mpinpb.h npbparams.h
-fortran_io.o: fortran_io.f mpinpb.h npbparams.h
-simple_mpiio.o: simple_mpiio.f mpinpb.h npbparams.h
-full_mpiio.o: full_mpiio.f mpinpb.h npbparams.h
-epio.o: epio.f mpinpb.h npbparams.h
-
-clean:
- - rm -f *.o *~ mputil*
- - rm -f npbparams.h core
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine add
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c addition of update to the vector u
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer c, i, j, k, m
-
- do c = 1, ncells
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- u(m,i,j,k,c) = u(m,i,j,k,c) + rhs(m,i,j,k,c)
- enddo
- enddo
- enddo
- enddo
- enddo
-
- return
- end
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine adi
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- call copy_faces
-
- call x_solve
-
- call y_solve
-
- call z_solve
-
- call add
-
- return
- end
-
+++ /dev/null
-!-------------------------------------------------------------------------!
-! !
-! N A S P A R A L L E L B E N C H M A R K S 3.3 !
-! !
-! B T !
-! !
-!-------------------------------------------------------------------------!
-! !
-! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
-! It is described in NAS Technical Reports 95-020 and 02-007. !
-! !
-! Permission to use, copy, distribute and modify this software !
-! for any purpose with or without fee is hereby granted. We !
-! request, however, that all derived work reference the NAS !
-! Parallel Benchmarks 3.3. This software is provided "as is" !
-! without express or implied warranty. !
-! !
-! Information on NPB 3.3, including the technical report, the !
-! original specifications, source code, results and information !
-! on how to submit new results, is available at: !
-! !
-! http://www.nas.nasa.gov/Software/NPB/ !
-! !
-! Send comments or suggestions to npb@nas.nasa.gov !
-! !
-! NAS Parallel Benchmarks Group !
-! NASA Ames Research Center !
-! Mail Stop: T27A-1 !
-! Moffett Field, CA 94035-1000 !
-! !
-! E-mail: npb@nas.nasa.gov !
-! Fax: (650) 604-3957 !
-! !
-!-------------------------------------------------------------------------!
-
-c---------------------------------------------------------------------
-c
-c Authors: R. F. Van der Wijngaart
-c T. Harris
-c M. Yarrow
-c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
- program MPBT
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i, niter, step, c, error, fstatus
- double precision navg, mflops, mbytes, n3
-
- external timer_read
- double precision t, tmax, tiominv, tpc, timer_read
- logical verified
- character class, cbuff*40
-
- integer wr_interval
-
- call setup_mpi
- if (.not. active) goto 999
-
-c---------------------------------------------------------------------
-c Root node reads input file (if it exists) else takes
-c defaults from parameters
-c---------------------------------------------------------------------
- if (node .eq. root) then
-
- write(*, 1000)
- open (unit=2,file='inputbt.data',status='old', iostat=fstatus)
-c
- rd_interval = 0
- if (fstatus .eq. 0) then
- write(*,233)
- 233 format(' Reading from input file inputbt.data')
- read (2,*) niter
- read (2,*) dt
- read (2,*) grid_points(1), grid_points(2), grid_points(3)
- if (iotype .ne. 0) then
- read (2,'(A)') cbuff
- read (cbuff,*,iostat=i) wr_interval, rd_interval
- if (i .ne. 0) rd_interval = 0
- if (wr_interval .le. 0) wr_interval = wr_default
- endif
- if (iotype .eq. 1) then
- read (2,*) collbuf_nodes, collbuf_size
- write(*,*) 'collbuf_nodes ', collbuf_nodes
- write(*,*) 'collbuf_size ', collbuf_size
- endif
- close(2)
- else
- write(*,234)
- niter = niter_default
- dt = dt_default
- grid_points(1) = problem_size
- grid_points(2) = problem_size
- grid_points(3) = problem_size
- wr_interval = wr_default
- if (iotype .eq. 1) then
-c set number of nodes involved in collective buffering to 4,
-c unless total number of nodes is smaller than that.
-c set buffer size for collective buffering to 1MB per node
-c collbuf_nodes = min(4,no_nodes)
-c set default to No-File-Hints with a value of 0
- collbuf_nodes = 0
- collbuf_size = 1000000
- endif
- endif
- 234 format(' No input file inputbt.data. Using compiled defaults')
-
- write(*, 1001) grid_points(1), grid_points(2), grid_points(3)
- write(*, 1002) niter, dt
- if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes
- if (no_nodes .ne. maxcells*maxcells)
- > write(*, 1005) maxcells*maxcells
- write(*, 1003) no_nodes
-
- if (iotype .eq. 1) write(*, 1006) 'FULL MPI-IO', wr_interval
- if (iotype .eq. 2) write(*, 1006) 'SIMPLE MPI-IO', wr_interval
- if (iotype .eq. 3) write(*, 1006) 'EPIO', wr_interval
- if (iotype .eq. 4) write(*, 1006) 'FORTRAN IO', wr_interval
-
- 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- BT Benchmark ',/)
- 1001 format(' Size: ', i4, 'x', i4, 'x', i4)
- 1002 format(' Iterations: ', i4, ' dt: ', F11.7)
- 1004 format(' Total number of processes: ', i5)
- 1005 format(' WARNING: compiled for ', i5, ' processes ')
- 1003 format(' Number of active processes: ', i5, /)
- 1006 format(' BTIO -- ', A, ' write interval: ', i3 /)
-
- endif
-
- call mpi_bcast(niter, 1, MPI_INTEGER,
- > root, comm_setup, error)
-
- call mpi_bcast(dt, 1, dp_type,
- > root, comm_setup, error)
-
- call mpi_bcast(grid_points(1), 3, MPI_INTEGER,
- > root, comm_setup, error)
-
- call mpi_bcast(wr_interval, 1, MPI_INTEGER,
- > root, comm_setup, error)
-
- call mpi_bcast(rd_interval, 1, MPI_INTEGER,
- > root, comm_setup, error)
-
- call make_set
-
- do c = 1, maxcells
- if ( (cell_size(1,c) .gt. IMAX) .or.
- > (cell_size(2,c) .gt. JMAX) .or.
- > (cell_size(3,c) .gt. KMAX) ) then
- print *,node, c, (cell_size(i,c),i=1,3)
- print *,' Problem size too big for compiled array sizes'
- goto 999
- endif
- end do
-
- call set_constants
-
- call initialize
-
- call setup_btio
- idump = 0
-
- call lhsinit
-
- call exact_rhs
-
- call compute_buffer_size(5)
-
-c---------------------------------------------------------------------
-c do one time step to touch all code, and reinitialize
-c---------------------------------------------------------------------
- call adi
- call initialize
-
- call timer_clear(2)
-
-c---------------------------------------------------------------------
-c Synchronize before placing time stamp
-c---------------------------------------------------------------------
- call mpi_barrier(comm_setup, error)
-
- call timer_clear(1)
- call timer_start(1)
-
- do step = 1, niter
-
- if (node .eq. root) then
- if (mod(step, 20) .eq. 0 .or. step .eq. niter .or.
- > step .eq. 1) then
- write(*, 200) step
- 200 format(' Time step ', i4)
- endif
- endif
-
- call adi
-
- if (iotype .ne. 0) then
- call timer_start(2)
- if (mod(step, wr_interval).eq.0 .or. step .eq. niter) then
- if (node .eq. root) then
- print *, 'Writing data set, time step', step
- endif
- if (step .eq. niter .and. rd_interval .gt. 1) then
- rd_interval = 1
- endif
- call output_timestep
- idump = idump + 1
- endif
- call timer_stop(2)
- endif
- end do
-
- call btio_cleanup
-
- call timer_stop(1)
- t = timer_read(1)
-
- call verify(niter, class, verified)
-
- call mpi_reduce(t, tmax, 1,
- > dp_type, MPI_MAX,
- > root, comm_setup, error)
-
- if (iotype .ne. 0) then
- t = timer_read(2)
- if (t .ne. 0.d0) t = 1.0d0 / t
- call mpi_reduce(t, tiominv, 1,
- > dp_type, MPI_SUM,
- > root, comm_setup, error)
- endif
-
- if( node .eq. root ) then
- n3 = 1.0d0*grid_points(1)*grid_points(2)*grid_points(3)
- navg = (grid_points(1)+grid_points(2)+grid_points(3))/3.0
- if( tmax .ne. 0. ) then
- mflops = 1.0e-6*float(niter)*
- > (3478.8*n3-17655.7*navg**2+28023.7*navg)
- > / tmax
- else
- mflops = 0.0
- endif
-
- if (iotype .ne. 0) then
- mbytes = n3 * 40.0 * idump * 1.0d-6
- tiominv = tiominv / no_nodes
- t = 0.0
- if (tiominv .ne. 0.) t = 1.d0 / tiominv
- tpc = 0.0
- if (tmax .ne. 0.) tpc = t * 100.0 / tmax
- write(*,1100) t, tpc, mbytes, mbytes*tiominv
- 1100 format(/' BTIO -- statistics:'/
- > ' I/O timing in seconds : ', f14.2/
- > ' I/O timing percentage : ', f14.2/
- > ' Total data written (MB) : ', f14.2/
- > ' I/O data rate (MB/sec) : ', f14.2)
- endif
-
- call print_results('BT', class, grid_points(1),
- > grid_points(2), grid_points(3), niter, maxcells*maxcells,
- > total_nodes, tmax, mflops, ' floating point',
- > verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5,
- > cs6, '(none)')
- endif
-
- 999 continue
- call mpi_barrier(MPI_COMM_WORLD, error)
- call mpi_finalize(error)
-
- end
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine setup_btio
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine output_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine btio_cleanup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine btio_verify(verified)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- logical verified
-
- verified = .true.
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine accumulate_norms(xce_acc)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- double precision xce_acc(5)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine checksum_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- return
- end
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine clear_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer cio, kio, jio, ix
-
- do cio=1,ncells
- do kio=0, cell_size(3,cio)-1
- do jio=0, cell_size(2,cio)-1
- do ix=0,cell_size(1,cio)-1
- u(1,ix, jio,kio,cio) = 0
- u(2,ix, jio,kio,cio) = 0
- u(3,ix, jio,kio,cio) = 0
- u(4,ix, jio,kio,cio) = 0
- u(5,ix, jio,kio,cio) = 0
- enddo
- enddo
- enddo
- enddo
-
- return
- end
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine copy_faces
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c This function copies the face values of a variable defined on a set
-c of cells to the overlap locations of the adjacent sets of cells.
-c Because a set of cells interfaces in each direction with exactly one
-c other set, we only need to fill six different buffers. We could try to
-c overlap communication with computation, by computing
-c some internal values while communicating boundary values, but this
-c adds so much overhead that it's not clearly useful.
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i, j, k, c, m, requests(0:11), p0, p1,
- > p2, p3, p4, p5, b_size(0:5), ss(0:5),
- > sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11)
-
-c---------------------------------------------------------------------
-c exit immediately if there are no faces to be copied
-c---------------------------------------------------------------------
- if (no_nodes .eq. 1) then
- call compute_rhs
- return
- endif
-
- ss(0) = start_send_east
- ss(1) = start_send_west
- ss(2) = start_send_north
- ss(3) = start_send_south
- ss(4) = start_send_top
- ss(5) = start_send_bottom
-
- sr(0) = start_recv_east
- sr(1) = start_recv_west
- sr(2) = start_recv_north
- sr(3) = start_recv_south
- sr(4) = start_recv_top
- sr(5) = start_recv_bottom
-
- b_size(0) = east_size
- b_size(1) = west_size
- b_size(2) = north_size
- b_size(3) = south_size
- b_size(4) = top_size
- b_size(5) = bottom_size
-
-c---------------------------------------------------------------------
-c because the difference stencil for the diagonalized scheme is
-c orthogonal, we do not have to perform the staged copying of faces,
-c but can send all face information simultaneously to the neighboring
-c cells in all directions
-c---------------------------------------------------------------------
- p0 = 0
- p1 = 0
- p2 = 0
- p3 = 0
- p4 = 0
- p5 = 0
-
- do c = 1, ncells
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to eastern neighbors (i-dir)
-c---------------------------------------------------------------------
- if (cell_coord(1,c) .ne. ncells) then
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = cell_size(1,c)-2, cell_size(1,c)-1
- do m = 1, 5
- out_buffer(ss(0)+p0) = u(m,i,j,k,c)
- p0 = p0 + 1
- end do
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to western neighbors
-c---------------------------------------------------------------------
- if (cell_coord(1,c) .ne. 1) then
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = 0, 1
- do m = 1, 5
- out_buffer(ss(1)+p1) = u(m,i,j,k,c)
- p1 = p1 + 1
- end do
- end do
- end do
- end do
-
- endif
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to northern neighbors (j_dir)
-c---------------------------------------------------------------------
- if (cell_coord(2,c) .ne. ncells) then
- do k = 0, cell_size(3,c)-1
- do j = cell_size(2,c)-2, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- do m = 1, 5
- out_buffer(ss(2)+p2) = u(m,i,j,k,c)
- p2 = p2 + 1
- end do
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to southern neighbors
-c---------------------------------------------------------------------
- if (cell_coord(2,c).ne. 1) then
- do k = 0, cell_size(3,c)-1
- do j = 0, 1
- do i = 0, cell_size(1,c)-1
- do m = 1, 5
- out_buffer(ss(3)+p3) = u(m,i,j,k,c)
- p3 = p3 + 1
- end do
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to top neighbors (k-dir)
-c---------------------------------------------------------------------
- if (cell_coord(3,c) .ne. ncells) then
- do k = cell_size(3,c)-2, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- do m = 1, 5
- out_buffer(ss(4)+p4) = u(m,i,j,k,c)
- p4 = p4 + 1
- end do
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to bottom neighbors
-c---------------------------------------------------------------------
- if (cell_coord(3,c).ne. 1) then
- do k=0, 1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- do m = 1, 5
- out_buffer(ss(5)+p5) = u(m,i,j,k,c)
- p5 = p5 + 1
- end do
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c cell loop
-c---------------------------------------------------------------------
- end do
-
- call mpi_irecv(in_buffer(sr(0)), b_size(0),
- > dp_type, successor(1), WEST,
- > comm_rhs, requests(0), error)
- call mpi_irecv(in_buffer(sr(1)), b_size(1),
- > dp_type, predecessor(1), EAST,
- > comm_rhs, requests(1), error)
- call mpi_irecv(in_buffer(sr(2)), b_size(2),
- > dp_type, successor(2), SOUTH,
- > comm_rhs, requests(2), error)
- call mpi_irecv(in_buffer(sr(3)), b_size(3),
- > dp_type, predecessor(2), NORTH,
- > comm_rhs, requests(3), error)
- call mpi_irecv(in_buffer(sr(4)), b_size(4),
- > dp_type, successor(3), BOTTOM,
- > comm_rhs, requests(4), error)
- call mpi_irecv(in_buffer(sr(5)), b_size(5),
- > dp_type, predecessor(3), TOP,
- > comm_rhs, requests(5), error)
-
- call mpi_isend(out_buffer(ss(0)), b_size(0),
- > dp_type, successor(1), EAST,
- > comm_rhs, requests(6), error)
- call mpi_isend(out_buffer(ss(1)), b_size(1),
- > dp_type, predecessor(1), WEST,
- > comm_rhs, requests(7), error)
- call mpi_isend(out_buffer(ss(2)), b_size(2),
- > dp_type,successor(2), NORTH,
- > comm_rhs, requests(8), error)
- call mpi_isend(out_buffer(ss(3)), b_size(3),
- > dp_type,predecessor(2), SOUTH,
- > comm_rhs, requests(9), error)
- call mpi_isend(out_buffer(ss(4)), b_size(4),
- > dp_type,successor(3), TOP,
- > comm_rhs, requests(10), error)
- call mpi_isend(out_buffer(ss(5)), b_size(5),
- > dp_type,predecessor(3), BOTTOM,
- > comm_rhs,requests(11), error)
-
-
- call mpi_waitall(12, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c unpack the data that has just been received;
-c---------------------------------------------------------------------
- p0 = 0
- p1 = 0
- p2 = 0
- p3 = 0
- p4 = 0
- p5 = 0
-
- do c = 1, ncells
-
- if (cell_coord(1,c) .ne. 1) then
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = -2, -1
- do m = 1, 5
- u(m,i,j,k,c) = in_buffer(sr(1)+p0)
- p0 = p0 + 1
- end do
- end do
- end do
- end do
- endif
-
- if (cell_coord(1,c) .ne. ncells) then
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = cell_size(1,c), cell_size(1,c)+1
- do m = 1, 5
- u(m,i,j,k,c) = in_buffer(sr(0)+p1)
- p1 = p1 + 1
- end do
- end do
- end do
- end do
- end if
-
- if (cell_coord(2,c) .ne. 1) then
- do k = 0, cell_size(3,c)-1
- do j = -2, -1
- do i = 0, cell_size(1,c)-1
- do m = 1, 5
- u(m,i,j,k,c) = in_buffer(sr(3)+p2)
- p2 = p2 + 1
- end do
- end do
- end do
- end do
-
- endif
-
- if (cell_coord(2,c) .ne. ncells) then
- do k = 0, cell_size(3,c)-1
- do j = cell_size(2,c), cell_size(2,c)+1
- do i = 0, cell_size(1,c)-1
- do m = 1, 5
- u(m,i,j,k,c) = in_buffer(sr(2)+p3)
- p3 = p3 + 1
- end do
- end do
- end do
- end do
- endif
-
- if (cell_coord(3,c) .ne. 1) then
- do k = -2, -1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- do m = 1, 5
- u(m,i,j,k,c) = in_buffer(sr(5)+p4)
- p4 = p4 + 1
- end do
- end do
- end do
- end do
- endif
-
- if (cell_coord(3,c) .ne. ncells) then
- do k = cell_size(3,c), cell_size(3,c)+1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- do m = 1, 5
- u(m,i,j,k,c) = in_buffer(sr(4)+p5)
- p5 = p5 + 1
- end do
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c cells loop
-c---------------------------------------------------------------------
- end do
-
-c---------------------------------------------------------------------
-c do the rest of the rhs that uses the copied face values
-c---------------------------------------------------------------------
- call compute_rhs
-
- return
- end
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine compute_buffer_size(dim)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer c, dim, face_size
-
- if (ncells .eq. 1) return
-
-c---------------------------------------------------------------------
-c compute the actual sizes of the buffers; note that there is
-c always one cell face that doesn't need buffer space, because it
-c is at the boundary of the grid
-c---------------------------------------------------------------------
- west_size = 0
- east_size = 0
-
- do c = 1, ncells
- face_size = cell_size(2,c) * cell_size(3,c) * dim * 2
- if (cell_coord(1,c).ne.1) west_size = west_size + face_size
- if (cell_coord(1,c).ne.ncells) east_size = east_size +
- > face_size
- end do
-
- north_size = 0
- south_size = 0
- do c = 1, ncells
- face_size = cell_size(1,c)*cell_size(3,c) * dim * 2
- if (cell_coord(2,c).ne.1) south_size = south_size + face_size
- if (cell_coord(2,c).ne.ncells) north_size = north_size +
- > face_size
- end do
-
- top_size = 0
- bottom_size = 0
- do c = 1, ncells
- face_size = cell_size(1,c) * cell_size(2,c) * dim * 2
- if (cell_coord(3,c).ne.1) bottom_size = bottom_size +
- > face_size
- if (cell_coord(3,c).ne.ncells) top_size = top_size +
- > face_size
- end do
-
- start_send_west = 1
- start_send_east = start_send_west + west_size
- start_send_south = start_send_east + east_size
- start_send_north = start_send_south + south_size
- start_send_bottom = start_send_north + north_size
- start_send_top = start_send_bottom + bottom_size
- start_recv_west = 1
- start_recv_east = start_recv_west + west_size
- start_recv_south = start_recv_east + east_size
- start_recv_north = start_recv_south + south_size
- start_recv_bottom = start_recv_north + north_size
- start_recv_top = start_recv_bottom + bottom_size
-
- return
- end
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine setup_btio
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- character*(128) newfilenm
- integer m
-
- if (node .lt. 10000) then
- write (newfilenm, 996) filenm,node
- else
- print *, 'error generating file names (> 10000 nodes)'
- stop
- endif
-
-996 format (a,'.',i4.4)
-
- open (unit=99, file=newfilenm, form='unformatted',
- $ status='unknown')
-
- do m = 1, 5
- xce_sub(m) = 0.d0
- end do
-
- idump_sub = 0
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine output_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- include 'header.h'
- include 'mpinpb.h'
-
- integer ix, iio, jio, kio, cio, aio
-
- do cio=1,ncells
- write(99)
- $ ((((u(aio,ix, jio,kio,cio),aio=1,5),
- $ ix=0, cell_size(1,cio)-1),
- $ jio=0, cell_size(2,cio)-1),
- $ kio=0, cell_size(3,cio)-1)
- enddo
-
- idump_sub = idump_sub + 1
- if (rd_interval .gt. 0) then
- if (idump_sub .ge. rd_interval) then
-
- rewind(99)
- call acc_sub_norms(idump+1)
-
- rewind(99)
- idump_sub = 0
- endif
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine acc_sub_norms(idump_cur)
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer idump_cur
-
- integer ix, jio, kio, cio, ii, m, ichunk
- double precision xce_single(5)
-
- ichunk = idump_cur - idump_sub + 1
- do ii=0, idump_sub-1
- do cio=1,ncells
- read(99)
- $ ((((u(m,ix, jio,kio,cio),m=1,5),
- $ ix=0, cell_size(1,cio)-1),
- $ jio=0, cell_size(2,cio)-1),
- $ kio=0, cell_size(3,cio)-1)
- enddo
-
- if (node .eq. root) print *, 'Reading data set ', ii+ichunk
-
- call error_norm(xce_single)
- do m = 1, 5
- xce_sub(m) = xce_sub(m) + xce_single(m)
- end do
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine btio_cleanup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- close(unit=99)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine accumulate_norms(xce_acc)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- double precision xce_acc(5)
-
- character*(128) newfilenm
- integer m
-
- if (rd_interval .gt. 0) goto 20
-
- if (node .lt. 10000) then
- write (newfilenm, 996) filenm,node
- else
- print *, 'error generating file names (> 10000 nodes)'
- stop
- endif
-
-996 format (a,'.',i4.4)
-
- open (unit=99, file=newfilenm,
- $ form='unformatted')
-
-c clear the last time step
-
- call clear_timestep
-
-c read back the time steps and accumulate norms
-
- call acc_sub_norms(idump)
-
- close(unit=99)
-
- 20 continue
- do m = 1, 5
- xce_acc(m) = xce_sub(m) / dble(idump)
- end do
-
- return
- end
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine error_norm(rms)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function computes the norm of the difference between the
-c computed solution and the exact solution
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer c, i, j, k, m, ii, jj, kk, d, error
- double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5),
- > add
-
- do m = 1, 5
- rms_work(m) = 0.0d0
- enddo
-
- do c = 1, ncells
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- ii = 0
- do i = cell_low(1,c), cell_high(1,c)
- xi = dble(i) * dnxm1
- call exact_solution(xi, eta, zeta, u_exact)
-
- do m = 1, 5
- add = u(m,ii,jj,kk,c)-u_exact(m)
- rms_work(m) = rms_work(m) + add*add
- enddo
- ii = ii + 1
- enddo
- jj = jj + 1
- enddo
- kk = kk + 1
- enddo
- enddo
-
- call mpi_allreduce(rms_work, rms, 5, dp_type,
- > MPI_SUM, comm_setup, error)
-
- do m = 1, 5
- do d = 1, 3
- rms(m) = rms(m) / dble(grid_points(d)-2)
- enddo
- rms(m) = dsqrt(rms(m))
- enddo
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine rhs_norm(rms)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer c, i, j, k, d, m, error
- double precision rms(5), rms_work(5), add
-
- do m = 1, 5
- rms_work(m) = 0.0d0
- enddo
-
- do c = 1, ncells
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- add = rhs(m,i,j,k,c)
- rms_work(m) = rms_work(m) + add*add
- enddo
- enddo
- enddo
- enddo
- enddo
-
- call mpi_allreduce(rms_work, rms, 5, dp_type,
- > MPI_SUM, comm_setup, error)
-
- do m = 1, 5
- do d = 1, 3
- rms(m) = rms(m) / dble(grid_points(d)-2)
- enddo
- rms(m) = dsqrt(rms(m))
- enddo
-
- return
- end
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine exact_rhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- double precision dtemp(5), xi, eta, zeta, dtpp
- integer c, m, i, j, k, ip1, im1, jp1,
- > jm1, km1, kp1
-
-
-c---------------------------------------------------------------------
-c loop over all cells owned by this node
-c---------------------------------------------------------------------
- do c = 1, ncells
-
-c---------------------------------------------------------------------
-c initialize
-c---------------------------------------------------------------------
- do k= 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- do m = 1, 5
- forcing(m,i,j,k,c) = 0.0d0
- enddo
- enddo
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c xi-direction flux differences
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- zeta = dble(k+cell_low(3,c)) * dnzm1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- eta = dble(j+cell_low(2,c)) * dnym1
-
- do i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c)
- xi = dble(i+cell_low(1,c)) * dnxm1
-
- call exact_solution(xi, eta, zeta, dtemp)
- do m = 1, 5
- ue(i,m) = dtemp(m)
- enddo
-
- dtpp = 1.0d0 / dtemp(1)
-
- do m = 2, 5
- buf(i,m) = dtpp * dtemp(m)
- enddo
-
- cuf(i) = buf(i,2) * buf(i,2)
- buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) +
- > buf(i,4) * buf(i,4)
- q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) +
- > buf(i,4)*ue(i,4))
-
- enddo
-
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- im1 = i-1
- ip1 = i+1
-
- forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
- > tx2*( ue(ip1,2)-ue(im1,2) )+
- > dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1))
-
- forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tx2 * (
- > (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))-
- > (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+
- > xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+
- > dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2))
-
- forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tx2 * (
- > ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+
- > xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+
- > dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3))
-
- forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tx2*(
- > ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+
- > xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+
- > dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4))
-
- forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tx2*(
- > buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))-
- > buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+
- > 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+
- > buf(im1,1))+
- > xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+
- > xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+
- > dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5))
- enddo
-
-c---------------------------------------------------------------------
-c Fourth-order dissipation
-c---------------------------------------------------------------------
- if (start(1,c) .gt. 0) then
- do m = 1, 5
- i = 1
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m))
- i = 2
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) -
- > 4.0d0*ue(i+1,m) + ue(i+2,m))
- enddo
- endif
-
- do i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1
- do m = 1, 5
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
- > (ue(i-2,m) - 4.0d0*ue(i-1,m) +
- > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m))
- enddo
- enddo
-
- if (end(1,c) .gt. 0) then
- do m = 1, 5
- i = cell_size(1,c)-3
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (ue(i-2,m) - 4.0d0*ue(i-1,m) +
- > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m))
- i = cell_size(1,c)-2
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m))
- enddo
- endif
-
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c eta-direction flux differences
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- zeta = dble(k+cell_low(3,c)) * dnzm1
- do i=start(1,c), cell_size(1,c)-end(1,c)-1
- xi = dble(i+cell_low(1,c)) * dnxm1
-
- do j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c)
- eta = dble(j+cell_low(2,c)) * dnym1
-
- call exact_solution(xi, eta, zeta, dtemp)
- do m = 1, 5
- ue(j,m) = dtemp(m)
- enddo
-
- dtpp = 1.0d0/dtemp(1)
-
- do m = 2, 5
- buf(j,m) = dtpp * dtemp(m)
- enddo
-
- cuf(j) = buf(j,3) * buf(j,3)
- buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) +
- > buf(j,4) * buf(j,4)
- q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) +
- > buf(j,4)*ue(j,4))
- enddo
-
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- jm1 = j-1
- jp1 = j+1
-
- forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
- > ty2*( ue(jp1,3)-ue(jm1,3) )+
- > dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1))
-
- forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - ty2*(
- > ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+
- > yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+
- > dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2))
-
- forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - ty2*(
- > (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))-
- > (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+
- > yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+
- > dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3))
-
- forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - ty2*(
- > ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+
- > yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+
- > dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4))
-
- forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - ty2*(
- > buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))-
- > buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+
- > 0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+
- > buf(jm1,1))+
- > yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+
- > yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+
- > dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5))
- enddo
-
-c---------------------------------------------------------------------
-c Fourth-order dissipation
-c---------------------------------------------------------------------
- if (start(2,c) .gt. 0) then
- do m = 1, 5
- j = 1
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m))
- j = 2
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) -
- > 4.0d0*ue(j+1,m) + ue(j+2,m))
- enddo
- endif
-
- do j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1
- do m = 1, 5
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
- > (ue(j-2,m) - 4.0d0*ue(j-1,m) +
- > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m))
- enddo
- enddo
-
- if (end(2,c) .gt. 0) then
- do m = 1, 5
- j = cell_size(2,c)-3
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (ue(j-2,m) - 4.0d0*ue(j-1,m) +
- > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m))
- j = cell_size(2,c)-2
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m))
-
- enddo
- endif
-
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c zeta-direction flux differences
-c---------------------------------------------------------------------
- do j=start(2,c), cell_size(2,c)-end(2,c)-1
- eta = dble(j+cell_low(2,c)) * dnym1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- xi = dble(i+cell_low(1,c)) * dnxm1
-
- do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c)
- zeta = dble(k+cell_low(3,c)) * dnzm1
-
- call exact_solution(xi, eta, zeta, dtemp)
- do m = 1, 5
- ue(k,m) = dtemp(m)
- enddo
-
- dtpp = 1.0d0/dtemp(1)
-
- do m = 2, 5
- buf(k,m) = dtpp * dtemp(m)
- enddo
-
- cuf(k) = buf(k,4) * buf(k,4)
- buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) +
- > buf(k,3) * buf(k,3)
- q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) +
- > buf(k,4)*ue(k,4))
- enddo
-
- do k=start(3,c), cell_size(3,c)-end(3,c)-1
- km1 = k-1
- kp1 = k+1
-
- forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
- > tz2*( ue(kp1,4)-ue(km1,4) )+
- > dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1))
-
- forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tz2 * (
- > ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+
- > zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+
- > dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2))
-
- forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tz2 * (
- > ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+
- > zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+
- > dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3))
-
- forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tz2 * (
- > (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))-
- > (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+
- > zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+
- > dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4))
-
- forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tz2 * (
- > buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))-
- > buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+
- > 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1)
- > +buf(km1,1))+
- > zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+
- > zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+
- > dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5))
- enddo
-
-c---------------------------------------------------------------------
-c Fourth-order dissipation
-c---------------------------------------------------------------------
- if (start(3,c) .gt. 0) then
- do m = 1, 5
- k = 1
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m))
- k = 2
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) -
- > 4.0d0*ue(k+1,m) + ue(k+2,m))
- enddo
- endif
-
- do k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1
- do m = 1, 5
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
- > (ue(k-2,m) - 4.0d0*ue(k-1,m) +
- > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m))
- enddo
- enddo
-
- if (end(3,c) .gt. 0) then
- do m = 1, 5
- k = cell_size(3,c)-3
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (ue(k-2,m) - 4.0d0*ue(k-1,m) +
- > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m))
- k = cell_size(3,c)-2
- forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
- > (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m))
- enddo
- endif
-
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c now change the sign of the forcing function,
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- forcing(m,i,j,k,c) = -1.d0 * forcing(m,i,j,k,c)
- enddo
- enddo
- enddo
- enddo
-
- enddo
-
- return
- end
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine exact_solution(xi,eta,zeta,dtemp)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function returns the exact solution at point xi, eta, zeta
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- double precision xi, eta, zeta, dtemp(5)
- integer m
-
- do m = 1, 5
- dtemp(m) = ce(m,1) +
- > xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) +
- > eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+
- > zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) +
- > zeta*ce(m,13))))
- enddo
-
- return
- end
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine setup_btio
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- character*(128) newfilenm
- integer m, ierr
-
- if (node.eq.root) record_length = 40/fortran_rec_sz
- call mpi_bcast(record_length, 1, MPI_INTEGER,
- > root, comm_setup, ierr)
-
- open (unit=99, file=filenm,
- $ form='unformatted', access='direct',
- $ recl=record_length)
-
- do m = 1, 5
- xce_sub(m) = 0.d0
- end do
-
- idump_sub = 0
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine output_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- include 'header.h'
- include 'mpinpb.h'
-
- integer ix, jio, kio, cio
-
- do cio=1,ncells
- do kio=0, cell_size(3,cio)-1
- do jio=0, cell_size(2,cio)-1
- iseek=(cell_low(1,cio) +
- $ PROBLEM_SIZE*((cell_low(2,cio)+jio) +
- $ PROBLEM_SIZE*((cell_low(3,cio)+kio) +
- $ PROBLEM_SIZE*idump_sub)))
-
- do ix=0,cell_size(1,cio)-1
- write(99, rec=iseek+ix+1)
- $ u(1,ix, jio,kio,cio),
- $ u(2,ix, jio,kio,cio),
- $ u(3,ix, jio,kio,cio),
- $ u(4,ix, jio,kio,cio),
- $ u(5,ix, jio,kio,cio)
- enddo
- enddo
- enddo
- enddo
-
- idump_sub = idump_sub + 1
- if (rd_interval .gt. 0) then
- if (idump_sub .ge. rd_interval) then
-
- call acc_sub_norms(idump+1)
-
- idump_sub = 0
- endif
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine acc_sub_norms(idump_cur)
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer idump_cur
-
- integer ix, jio, kio, cio, ii, m, ichunk
- double precision xce_single(5)
-
- ichunk = idump_cur - idump_sub + 1
- do ii=0, idump_sub-1
- do cio=1,ncells
- do kio=0, cell_size(3,cio)-1
- do jio=0, cell_size(2,cio)-1
- iseek=(cell_low(1,cio) +
- $ PROBLEM_SIZE*((cell_low(2,cio)+jio) +
- $ PROBLEM_SIZE*((cell_low(3,cio)+kio) +
- $ PROBLEM_SIZE*ii)))
-
-
- do ix=0,cell_size(1,cio)-1
- read(99, rec=iseek+ix+1)
- $ u(1,ix, jio,kio,cio),
- $ u(2,ix, jio,kio,cio),
- $ u(3,ix, jio,kio,cio),
- $ u(4,ix, jio,kio,cio),
- $ u(5,ix, jio,kio,cio)
- enddo
- enddo
- enddo
- enddo
-
- if (node .eq. root) print *, 'Reading data set ', ii+ichunk
-
- call error_norm(xce_single)
- do m = 1, 5
- xce_sub(m) = xce_sub(m) + xce_single(m)
- end do
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine btio_cleanup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- close(unit=99)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine accumulate_norms(xce_acc)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- include 'header.h'
- include 'mpinpb.h'
-
- double precision xce_acc(5)
- integer m
-
- if (rd_interval .gt. 0) goto 20
-
- open (unit=99, file=filenm,
- $ form='unformatted', access='direct',
- $ recl=record_length)
-
-c clear the last time step
-
- call clear_timestep
-
-c read back the time steps and accumulate norms
-
- call acc_sub_norms(idump)
-
- close(unit=99)
-
- 20 continue
- do m = 1, 5
- xce_acc(m) = xce_sub(m) / dble(idump)
- end do
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine setup_btio
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer ierr
- integer mstatus(MPI_STATUS_SIZE)
- integer sizes(4), starts(4), subsizes(4)
- integer cell_btype(maxcells), cell_ftype(maxcells)
- integer cell_blength(maxcells)
- integer info
- character*20 cb_nodes, cb_size
- integer c, m
- integer cell_disp(maxcells)
-
- call mpi_bcast(collbuf_nodes, 1, MPI_INTEGER,
- > root, comm_setup, ierr)
-
- call mpi_bcast(collbuf_size, 1, MPI_INTEGER,
- > root, comm_setup, ierr)
-
- if (collbuf_nodes .eq. 0) then
- info = MPI_INFO_NULL
- else
- write (cb_nodes,*) collbuf_nodes
- write (cb_size,*) collbuf_size
- call MPI_Info_create(info, ierr)
- call MPI_Info_set(info, 'cb_nodes', cb_nodes, ierr)
- call MPI_Info_set(info, 'cb_buffer_size', cb_size, ierr)
- call MPI_Info_set(info, 'collective_buffering', 'true', ierr)
- endif
-
- call MPI_Type_contiguous(5, MPI_DOUBLE_PRECISION,
- $ element, ierr)
- call MPI_Type_commit(element, ierr)
- call MPI_Type_extent(element, eltext, ierr)
-
- do c = 1, ncells
-c
-c Outer array dimensions ar same for every cell
-c
- sizes(1) = IMAX+4
- sizes(2) = JMAX+4
- sizes(3) = KMAX+4
-c
-c 4th dimension is cell number, total of maxcells cells
-c
- sizes(4) = maxcells
-c
-c Internal dimensions of cells can differ slightly between cells
-c
- subsizes(1) = cell_size(1, c)
- subsizes(2) = cell_size(2, c)
- subsizes(3) = cell_size(3, c)
-c
-c Cell is 4th dimension, 1 cell per cell type to handle varying
-c cell sub-array sizes
-c
- subsizes(4) = 1
-
-c
-c type constructors use 0-based start addresses
-c
- starts(1) = 2
- starts(2) = 2
- starts(3) = 2
- starts(4) = c-1
-
-c
-c Create buftype for a cell
-c
- call MPI_Type_create_subarray(4, sizes, subsizes,
- $ starts, MPI_ORDER_FORTRAN, element,
- $ cell_btype(c), ierr)
-c
-c block length and displacement for joining cells -
-c 1 cell buftype per block, cell buftypes have own displacment
-c generated from cell number (4th array dimension)
-c
- cell_blength(c) = 1
- cell_disp(c) = 0
-
- enddo
-c
-c Create combined buftype for all cells
-c
- call MPI_Type_struct(ncells, cell_blength, cell_disp,
- $ cell_btype, combined_btype, ierr)
- call MPI_Type_commit(combined_btype, ierr)
-
- do c = 1, ncells
-c
-c Entire array size
-c
- sizes(1) = PROBLEM_SIZE
- sizes(2) = PROBLEM_SIZE
- sizes(3) = PROBLEM_SIZE
-
-c
-c Size of c'th cell
-c
- subsizes(1) = cell_size(1, c)
- subsizes(2) = cell_size(2, c)
- subsizes(3) = cell_size(3, c)
-
-c
-c Starting point in full array of c'th cell
-c
- starts(1) = cell_low(1,c)
- starts(2) = cell_low(2,c)
- starts(3) = cell_low(3,c)
-
- call MPI_Type_create_subarray(3, sizes, subsizes,
- $ starts, MPI_ORDER_FORTRAN,
- $ element, cell_ftype(c), ierr)
- cell_blength(c) = 1
- cell_disp(c) = 0
- enddo
-
- call MPI_Type_struct(ncells, cell_blength, cell_disp,
- $ cell_ftype, combined_ftype, ierr)
- call MPI_Type_commit(combined_ftype, ierr)
-
- iseek=0
- if (node .eq. root) then
- call MPI_File_delete(filenm, MPI_INFO_NULL, ierr)
- endif
-
-
- call MPI_Barrier(comm_solve, ierr)
-
- call MPI_File_open(comm_solve,
- $ filenm,
- $ MPI_MODE_RDWR+MPI_MODE_CREATE,
- $ MPI_INFO_NULL, fp, ierr)
-
- if (ierr .ne. MPI_SUCCESS) then
- print *, 'Error opening file'
- stop
- endif
-
- call MPI_File_set_view(fp, iseek, element,
- $ combined_ftype, 'native', info, ierr)
-
- if (ierr .ne. MPI_SUCCESS) then
- print *, 'Error setting file view'
- stop
- endif
-
- do m = 1, 5
- xce_sub(m) = 0.d0
- end do
-
- idump_sub = 0
-
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine output_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- include 'header.h'
- include 'mpinpb.h'
-
- integer mstatus(MPI_STATUS_SIZE)
- integer ierr
-
- call MPI_File_write_at_all(fp, iseek, u,
- $ 1, combined_btype, mstatus, ierr)
- if (ierr .ne. MPI_SUCCESS) then
- print *, 'Error writing to file'
- stop
- endif
-
- call MPI_Type_size(combined_btype, iosize, ierr)
- iseek = iseek + iosize/eltext
-
- idump_sub = idump_sub + 1
- if (rd_interval .gt. 0) then
- if (idump_sub .ge. rd_interval) then
-
- iseek = 0
- call acc_sub_norms(idump+1)
-
- iseek = 0
- idump_sub = 0
- endif
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine acc_sub_norms(idump_cur)
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer idump_cur
-
- integer ii, m, ichunk
- integer ierr
- integer mstatus(MPI_STATUS_SIZE)
- double precision xce_single(5)
-
- ichunk = idump_cur - idump_sub + 1
- do ii=0, idump_sub-1
-
- call MPI_File_read_at_all(fp, iseek, u,
- $ 1, combined_btype, mstatus, ierr)
- if (ierr .ne. MPI_SUCCESS) then
- print *, 'Error reading back file'
- call MPI_File_close(fp, ierr)
- stop
- endif
-
- call MPI_Type_size(combined_btype, iosize, ierr)
- iseek = iseek + iosize/eltext
-
- if (node .eq. root) print *, 'Reading data set ', ii+ichunk
-
- call error_norm(xce_single)
- do m = 1, 5
- xce_sub(m) = xce_sub(m) + xce_single(m)
- end do
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine btio_cleanup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- include 'header.h'
- include 'mpinpb.h'
-
- integer ierr
-
- call MPI_File_close(fp, ierr)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
- subroutine accumulate_norms(xce_acc)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- double precision xce_acc(5)
- integer m, ierr
-
- if (rd_interval .gt. 0) goto 20
-
- call MPI_File_open(comm_solve,
- $ filenm,
- $ MPI_MODE_RDONLY,
- $ MPI_INFO_NULL,
- $ fp,
- $ ierr)
-
- iseek = 0
- call MPI_File_set_view(fp, iseek, element, combined_ftype,
- $ 'native', MPI_INFO_NULL, ierr)
-
-c clear the last time step
-
- call clear_timestep
-
-c read back the time steps and accumulate norms
-
- call acc_sub_norms(idump)
-
- call MPI_File_close(fp, ierr)
-
- 20 continue
- do m = 1, 5
- xce_acc(m) = xce_sub(m) / dble(idump)
- end do
-
- return
- end
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-c
-c header.h
-c
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
-c---------------------------------------------------------------------
-c The following include file is generated automatically by the
-c "setparams" utility. It defines
-c maxcells: the square root of the maximum number of processors
-c problem_size: 12, 64, 102, 162 (for class T, A, B, C)
-c dt_default: default time step for this problem size if no
-c config file
-c niter_default: default number of iterations for this problem size
-c---------------------------------------------------------------------
-
- include 'npbparams.h'
-
- integer aa, bb, cc, BLOCK_SIZE
- parameter (aa=1, bb=2, cc=3, BLOCK_SIZE=5)
-
- integer ncells, grid_points(3)
- double precision elapsed_time
- common /global/ elapsed_time, ncells, grid_points
-
- double precision tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
- > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4,
- > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt,
- > ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2,
- > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
- > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
- > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
- > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1,
- > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1,
- > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2,
- > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
- > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1,
- > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6,
- > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
-
- common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
- > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4,
- > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt,
- > ce, dxmax, dymax, dzmax, xxcon1, xxcon2,
- > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
- > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
- > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
- > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1,
- > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1,
- > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2,
- > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
- > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1,
- > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6,
- > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
-
- integer EAST, WEST, NORTH, SOUTH,
- > BOTTOM, TOP
-
- parameter (EAST=2000, WEST=3000, NORTH=4000, SOUTH=5000,
- > BOTTOM=6000, TOP=7000)
-
- integer cell_coord (3,maxcells), cell_low (3,maxcells),
- > cell_high (3,maxcells), cell_size(3,maxcells),
- > predecessor(3), slice (3,maxcells),
- > grid_size (3), successor(3) ,
- > start (3,maxcells), end (3,maxcells)
- common /partition/ cell_coord, cell_low, cell_high, cell_size,
- > grid_size, successor, predecessor, slice,
- > start, end
-
- integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE
-
- parameter (MAX_CELL_DIM = (problem_size/maxcells)+1)
-
- parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM)
-
- parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60+1)
-
- double precision
- > us ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > vs ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > ws ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > qs ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > rho_i ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > square ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > forcing (5, 0:IMAX-1, 0:JMAX-1, 0:KMAX-1, maxcells),
- > u (5, -2:IMAX+1,-2:JMAX+1,-2:KMAX+1, maxcells),
- > rhs (5, -1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells),
- > lhsc (5,5,-1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells),
- > backsub_info (5, 0:MAX_CELL_DIM, 0:MAX_CELL_DIM, maxcells),
- > in_buffer(BUF_SIZE), out_buffer(BUF_SIZE)
- common /fields/ u, us, vs, ws, qs, rho_i, square,
- > rhs, forcing, lhsc, in_buffer, out_buffer,
- > backsub_info
-
- double precision cv(-2:MAX_CELL_DIM+1), rhon(-2:MAX_CELL_DIM+1),
- > rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1),
- > cuf(-2:MAX_CELL_DIM+1), q(-2:MAX_CELL_DIM+1),
- > ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5)
- common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf
-
- integer west_size, east_size, bottom_size, top_size,
- > north_size, south_size, start_send_west,
- > start_send_east, start_send_south, start_send_north,
- > start_send_bottom, start_send_top, start_recv_west,
- > start_recv_east, start_recv_south, start_recv_north,
- > start_recv_bottom, start_recv_top
- common /box/ west_size, east_size, bottom_size,
- > top_size, north_size, south_size,
- > start_send_west, start_send_east, start_send_south,
- > start_send_north, start_send_bottom, start_send_top,
- > start_recv_west, start_recv_east, start_recv_south,
- > start_recv_north, start_recv_bottom, start_recv_top
-
- double precision tmp_block(5,5), b_inverse(5,5), tmp_vec(5)
- common /work_solve/ tmp_block, b_inverse, tmp_vec
-
-c
-c These are used by btio
-c
- integer collbuf_nodes, collbuf_size, iosize, eltext,
- $ combined_btype, fp, idump, record_length, element,
- $ combined_ftype, idump_sub, rd_interval
- common /btio/ collbuf_nodes, collbuf_size, iosize, eltext,
- $ combined_btype, fp, idump, record_length,
- $ idump_sub, rd_interval
- double precision sum(niter_default), xce_sub(5)
- common /btio/ sum, xce_sub
- integer*8 iseek
- common /btio/ iseek, element, combined_ftype
-
-
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine initialize
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This subroutine initializes the field variable u using
-c tri-linear transfinite interpolation of the boundary values
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer c, i, j, k, m, ii, jj, kk, ix, iy, iz
- double precision xi, eta, zeta, Pface(5,3,2), Pxi, Peta,
- > Pzeta, temp(5)
-
-c---------------------------------------------------------------------
-c Later (in compute_rhs) we compute 1/u for every element. A few of
-c the corner elements are not used, but it convenient (and faster)
-c to compute the whole thing with a simple loop. Make sure those
-c values are nonzero by initializing the whole thing here.
-c---------------------------------------------------------------------
- do c = 1, ncells
- do kk = -1, KMAX
- do jj = -1, JMAX
- do ii = -1, IMAX
- do m = 1, 5
- u(m, ii, jj, kk, c) = 1.0
- end do
- end do
- end do
- end do
- end do
-c---------------------------------------------------------------------
-
-
-
-c---------------------------------------------------------------------
-c first store the "interpolated" values everywhere on the grid
-c---------------------------------------------------------------------
- do c=1, ncells
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- ii = 0
- do i = cell_low(1,c), cell_high(1,c)
- xi = dble(i) * dnxm1
-
- do ix = 1, 2
- call exact_solution(dble(ix-1), eta, zeta,
- > Pface(1,1,ix))
- enddo
-
- do iy = 1, 2
- call exact_solution(xi, dble(iy-1) , zeta,
- > Pface(1,2,iy))
- enddo
-
- do iz = 1, 2
- call exact_solution(xi, eta, dble(iz-1),
- > Pface(1,3,iz))
- enddo
-
- do m = 1, 5
- Pxi = xi * Pface(m,1,2) +
- > (1.0d0-xi) * Pface(m,1,1)
- Peta = eta * Pface(m,2,2) +
- > (1.0d0-eta) * Pface(m,2,1)
- Pzeta = zeta * Pface(m,3,2) +
- > (1.0d0-zeta) * Pface(m,3,1)
-
- u(m,ii,jj,kk,c) = Pxi + Peta + Pzeta -
- > Pxi*Peta - Pxi*Pzeta - Peta*Pzeta +
- > Pxi*Peta*Pzeta
-
- enddo
- ii = ii + 1
- enddo
- jj = jj + 1
- enddo
- kk = kk+1
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c now store the exact values on the boundaries
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c west face
-c---------------------------------------------------------------------
- c = slice(1,1)
- ii = 0
- xi = 0.0d0
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(m,ii,jj,kk,c) = temp(m)
- enddo
- jj = jj + 1
- enddo
- kk = kk + 1
- enddo
-
-c---------------------------------------------------------------------
-c east face
-c---------------------------------------------------------------------
- c = slice(1,ncells)
- ii = cell_size(1,c)-1
- xi = 1.0d0
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(m,ii,jj,kk,c) = temp(m)
- enddo
- jj = jj + 1
- enddo
- kk = kk + 1
- enddo
-
-c---------------------------------------------------------------------
-c south face
-c---------------------------------------------------------------------
- c = slice(2,1)
- jj = 0
- eta = 0.0d0
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- ii = 0
- do i = cell_low(1,c), cell_high(1,c)
- xi = dble(i) * dnxm1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(m,ii,jj,kk,c) = temp(m)
- enddo
- ii = ii + 1
- enddo
- kk = kk + 1
- enddo
-
-
-c---------------------------------------------------------------------
-c north face
-c---------------------------------------------------------------------
- c = slice(2,ncells)
- jj = cell_size(2,c)-1
- eta = 1.0d0
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- ii = 0
- do i = cell_low(1,c), cell_high(1,c)
- xi = dble(i) * dnxm1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(m,ii,jj,kk,c) = temp(m)
- enddo
- ii = ii + 1
- enddo
- kk = kk + 1
- enddo
-
-c---------------------------------------------------------------------
-c bottom face
-c---------------------------------------------------------------------
- c = slice(3,1)
- kk = 0
- zeta = 0.0d0
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- ii = 0
- do i =cell_low(1,c), cell_high(1,c)
- xi = dble(i) *dnxm1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(m,ii,jj,kk,c) = temp(m)
- enddo
- ii = ii + 1
- enddo
- jj = jj + 1
- enddo
-
-c---------------------------------------------------------------------
-c top face
-c---------------------------------------------------------------------
- c = slice(3,ncells)
- kk = cell_size(3,c)-1
- zeta = 1.0d0
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- ii = 0
- do i =cell_low(1,c), cell_high(1,c)
- xi = dble(i) * dnxm1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(m,ii,jj,kk,c) = temp(m)
- enddo
- ii = ii + 1
- enddo
- jj = jj + 1
- enddo
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine lhsinit
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer i, j, k, d, c, m, n
-
-c---------------------------------------------------------------------
-c loop over all cells
-c---------------------------------------------------------------------
- do c = 1, ncells
-
-c---------------------------------------------------------------------
-c first, initialize the start and end arrays
-c---------------------------------------------------------------------
- do d = 1, 3
- if (cell_coord(d,c) .eq. 1) then
- start(d,c) = 1
- else
- start(d,c) = 0
- endif
- if (cell_coord(d,c) .eq. ncells) then
- end(d,c) = 1
- else
- end(d,c) = 0
- endif
- enddo
-
-c---------------------------------------------------------------------
-c zero the whole left hand side for starters
-c---------------------------------------------------------------------
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- do m = 1,5
- do n = 1, 5
- lhsc(m,n,i,j,k,c) = 0.0d0
- enddo
- enddo
- enddo
- enddo
- enddo
-
- enddo
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine lhsabinit(lhsa, lhsb, size)
- implicit none
-
- integer size
- double precision lhsa(5, 5, -1:size), lhsb(5, 5, -1:size)
-
- integer i, m, n
-
-c---------------------------------------------------------------------
-c next, set all diagonal values to 1. This is overkill, but convenient
-c---------------------------------------------------------------------
- do i = 0, size
- do m = 1, 5
- do n = 1, 5
- lhsa(m,n,i) = 0.0d0
- lhsb(m,n,i) = 0.0d0
- enddo
- lhsb(m,m,i) = 1.0d0
- enddo
- enddo
-
- return
- end
-
-
-
+++ /dev/null
-200 number of time steps
-0.0008d0 dt for class A = 0.0008d0. class B = 0.0003d0 class C = 0.0001d0
-64 64 64
-5 0 write interval (optional read interval) for BTIO
-0 1000000 number of nodes in collective buffering and buffer size for BTIO
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine make_set
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This function allocates space for a set of cells and fills the set
-c such that communication between cells on different nodes is only
-c nearest neighbor
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
-
- integer p, i, j, c, dir, size, excess, ierr,ierrcode
-
-c---------------------------------------------------------------------
-c compute square root; add small number to allow for roundoff
-c (note: this is computed in setup_mpi.f also, but prefer to do
-c it twice because of some include file problems).
-c---------------------------------------------------------------------
- ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0))
-
-c---------------------------------------------------------------------
-c this makes coding easier
-c---------------------------------------------------------------------
- p = ncells
-
-c---------------------------------------------------------------------
-c determine the location of the cell at the bottom of the 3D
-c array of cells
-c---------------------------------------------------------------------
- cell_coord(1,1) = mod(node,p)
- cell_coord(2,1) = node/p
- cell_coord(3,1) = 0
-
-c---------------------------------------------------------------------
-c set the cell_coords for cells in the rest of the z-layers;
-c this comes down to a simple linear numbering in the z-direct-
-c ion, and to the doubly-cyclic numbering in the other dirs
-c---------------------------------------------------------------------
- do c=2, p
- cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p)
- cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p)
- cell_coord(3,c) = c-1
- end do
-
-c---------------------------------------------------------------------
-c offset all the coordinates by 1 to adjust for Fortran arrays
-c---------------------------------------------------------------------
- do dir = 1, 3
- do c = 1, p
- cell_coord(dir,c) = cell_coord(dir,c) + 1
- end do
- end do
-
-c---------------------------------------------------------------------
-c slice(dir,n) contains the sequence number of the cell that is in
-c coordinate plane n in the dir direction
-c---------------------------------------------------------------------
- do dir = 1, 3
- do c = 1, p
- slice(dir,cell_coord(dir,c)) = c
- end do
- end do
-
-
-c---------------------------------------------------------------------
-c fill the predecessor and successor entries, using the indices
-c of the bottom cells (they are the same at each level of k
-c anyway) acting as if full periodicity pertains; note that p is
-c added to those arguments to the mod functions that might
-c otherwise return wrong values when using the modulo function
-c---------------------------------------------------------------------
- i = cell_coord(1,1)-1
- j = cell_coord(2,1)-1
-
- predecessor(1) = mod(i-1+p,p) + p*j
- predecessor(2) = i + p*mod(j-1+p,p)
- predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p)
- successor(1) = mod(i+1,p) + p*j
- successor(2) = i + p*mod(j+1,p)
- successor(3) = mod(i-1+p,p) + p*mod(j+1,p)
-
-c---------------------------------------------------------------------
-c now compute the sizes of the cells
-c---------------------------------------------------------------------
- do dir= 1, 3
-c---------------------------------------------------------------------
-c set cell_coord range for each direction
-c---------------------------------------------------------------------
- size = grid_points(dir)/p
- excess = mod(grid_points(dir),p)
- do c=1, ncells
- if (cell_coord(dir,c) .le. excess) then
- cell_size(dir,c) = size+1
- cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1)
- cell_high(dir,c) = cell_low(dir,c)+size
- else
- cell_size(dir,c) = size
- cell_low(dir,c) = excess*(size+1)+
- > (cell_coord(dir,c)-excess-1)*size
- cell_high(dir,c) = cell_low(dir,c)+size-1
- endif
- if (cell_size(dir, c) .le. 2) then
- write(*,50)
- 50 format(' Error: Cell size too small. Min size is 3')
- call MPI_Abort(mpi_comm_world,ierrcode,ierr)
- stop
- endif
- end do
- end do
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'mpif.h'
-
- integer node, no_nodes, total_nodes, root, comm_setup,
- > comm_solve, comm_rhs, dp_type
- logical active
- common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup,
- > comm_solve, comm_rhs, dp_type, active
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine compute_rhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer c, i, j, k, m
- double precision rho_inv, uijk, up1, um1, vijk, vp1, vm1,
- > wijk, wp1, wm1
-
-
-c---------------------------------------------------------------------
-c loop over all cells owned by this node
-c---------------------------------------------------------------------
- do c = 1, ncells
-
-c---------------------------------------------------------------------
-c compute the reciprocal of density, and the kinetic energy,
-c and the speed of sound.
-c---------------------------------------------------------------------
- do k = -1, cell_size(3,c)
- do j = -1, cell_size(2,c)
- do i = -1, cell_size(1,c)
- rho_inv = 1.0d0/u(1,i,j,k,c)
- rho_i(i,j,k,c) = rho_inv
- us(i,j,k,c) = u(2,i,j,k,c) * rho_inv
- vs(i,j,k,c) = u(3,i,j,k,c) * rho_inv
- ws(i,j,k,c) = u(4,i,j,k,c) * rho_inv
- square(i,j,k,c) = 0.5d0* (
- > u(2,i,j,k,c)*u(2,i,j,k,c) +
- > u(3,i,j,k,c)*u(3,i,j,k,c) +
- > u(4,i,j,k,c)*u(4,i,j,k,c) ) * rho_inv
- qs(i,j,k,c) = square(i,j,k,c) * rho_inv
- enddo
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c copy the exact forcing term to the right hand side; because
-c this forcing term is known, we can store it on the whole of every
-c cell, including the boundary
-c---------------------------------------------------------------------
-
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = forcing(m,i,j,k,c)
- enddo
- enddo
- enddo
- enddo
-
-
-c---------------------------------------------------------------------
-c compute xi-direction fluxes
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- uijk = us(i,j,k,c)
- up1 = us(i+1,j,k,c)
- um1 = us(i-1,j,k,c)
-
- rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dx1tx1 *
- > (u(1,i+1,j,k,c) - 2.0d0*u(1,i,j,k,c) +
- > u(1,i-1,j,k,c)) -
- > tx2 * (u(2,i+1,j,k,c) - u(2,i-1,j,k,c))
-
- rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dx2tx1 *
- > (u(2,i+1,j,k,c) - 2.0d0*u(2,i,j,k,c) +
- > u(2,i-1,j,k,c)) +
- > xxcon2*con43 * (up1 - 2.0d0*uijk + um1) -
- > tx2 * (u(2,i+1,j,k,c)*up1 -
- > u(2,i-1,j,k,c)*um1 +
- > (u(5,i+1,j,k,c)- square(i+1,j,k,c)-
- > u(5,i-1,j,k,c)+ square(i-1,j,k,c))*
- > c2)
-
- rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dx3tx1 *
- > (u(3,i+1,j,k,c) - 2.0d0*u(3,i,j,k,c) +
- > u(3,i-1,j,k,c)) +
- > xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) +
- > vs(i-1,j,k,c)) -
- > tx2 * (u(3,i+1,j,k,c)*up1 -
- > u(3,i-1,j,k,c)*um1)
-
- rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dx4tx1 *
- > (u(4,i+1,j,k,c) - 2.0d0*u(4,i,j,k,c) +
- > u(4,i-1,j,k,c)) +
- > xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) +
- > ws(i-1,j,k,c)) -
- > tx2 * (u(4,i+1,j,k,c)*up1 -
- > u(4,i-1,j,k,c)*um1)
-
- rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dx5tx1 *
- > (u(5,i+1,j,k,c) - 2.0d0*u(5,i,j,k,c) +
- > u(5,i-1,j,k,c)) +
- > xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) +
- > qs(i-1,j,k,c)) +
- > xxcon4 * (up1*up1 - 2.0d0*uijk*uijk +
- > um1*um1) +
- > xxcon5 * (u(5,i+1,j,k,c)*rho_i(i+1,j,k,c) -
- > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
- > u(5,i-1,j,k,c)*rho_i(i-1,j,k,c)) -
- > tx2 * ( (c1*u(5,i+1,j,k,c) -
- > c2*square(i+1,j,k,c))*up1 -
- > (c1*u(5,i-1,j,k,c) -
- > c2*square(i-1,j,k,c))*um1 )
- enddo
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c add fourth order xi-direction dissipation
-c---------------------------------------------------------------------
- if (start(1,c) .gt. 0) then
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- i = 1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp *
- > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) +
- > u(m,i+2,j,k,c))
- enddo
-
- i = 2
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > (-4.0d0*u(m,i-1,j,k,c) + 6.0d0*u(m,i,j,k,c) -
- > 4.0d0*u(m,i+1,j,k,c) + u(m,i+2,j,k,c))
- enddo
- enddo
- enddo
- endif
-
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > ( u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) +
- > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) +
- > u(m,i+2,j,k,c) )
- enddo
- enddo
- enddo
- enddo
-
-
- if (end(1,c) .gt. 0) then
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- i = cell_size(1,c)-3
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > ( u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) +
- > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) )
- enddo
-
- i = cell_size(1,c)-2
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > ( u(m,i-2,j,k,c) - 4.d0*u(m,i-1,j,k,c) +
- > 5.d0*u(m,i,j,k,c) )
- enddo
- enddo
- enddo
- endif
-
-c---------------------------------------------------------------------
-c compute eta-direction fluxes
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- vijk = vs(i,j,k,c)
- vp1 = vs(i,j+1,k,c)
- vm1 = vs(i,j-1,k,c)
- rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dy1ty1 *
- > (u(1,i,j+1,k,c) - 2.0d0*u(1,i,j,k,c) +
- > u(1,i,j-1,k,c)) -
- > ty2 * (u(3,i,j+1,k,c) - u(3,i,j-1,k,c))
- rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dy2ty1 *
- > (u(2,i,j+1,k,c) - 2.0d0*u(2,i,j,k,c) +
- > u(2,i,j-1,k,c)) +
- > yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) +
- > us(i,j-1,k,c)) -
- > ty2 * (u(2,i,j+1,k,c)*vp1 -
- > u(2,i,j-1,k,c)*vm1)
- rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dy3ty1 *
- > (u(3,i,j+1,k,c) - 2.0d0*u(3,i,j,k,c) +
- > u(3,i,j-1,k,c)) +
- > yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) -
- > ty2 * (u(3,i,j+1,k,c)*vp1 -
- > u(3,i,j-1,k,c)*vm1 +
- > (u(5,i,j+1,k,c) - square(i,j+1,k,c) -
- > u(5,i,j-1,k,c) + square(i,j-1,k,c))
- > *c2)
- rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dy4ty1 *
- > (u(4,i,j+1,k,c) - 2.0d0*u(4,i,j,k,c) +
- > u(4,i,j-1,k,c)) +
- > yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) +
- > ws(i,j-1,k,c)) -
- > ty2 * (u(4,i,j+1,k,c)*vp1 -
- > u(4,i,j-1,k,c)*vm1)
- rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dy5ty1 *
- > (u(5,i,j+1,k,c) - 2.0d0*u(5,i,j,k,c) +
- > u(5,i,j-1,k,c)) +
- > yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) +
- > qs(i,j-1,k,c)) +
- > yycon4 * (vp1*vp1 - 2.0d0*vijk*vijk +
- > vm1*vm1) +
- > yycon5 * (u(5,i,j+1,k,c)*rho_i(i,j+1,k,c) -
- > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
- > u(5,i,j-1,k,c)*rho_i(i,j-1,k,c)) -
- > ty2 * ((c1*u(5,i,j+1,k,c) -
- > c2*square(i,j+1,k,c)) * vp1 -
- > (c1*u(5,i,j-1,k,c) -
- > c2*square(i,j-1,k,c)) * vm1)
- enddo
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c add fourth order eta-direction dissipation
-c---------------------------------------------------------------------
- if (start(2,c) .gt. 0) then
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- j = 1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp *
- > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) +
- > u(m,i,j+2,k,c))
- enddo
- enddo
-
- j = 2
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > (-4.0d0*u(m,i,j-1,k,c) + 6.0d0*u(m,i,j,k,c) -
- > 4.0d0*u(m,i,j+1,k,c) + u(m,i,j+2,k,c))
- enddo
- enddo
- enddo
- endif
-
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1
- do i = start(1,c),cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > ( u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) +
- > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) +
- > u(m,i,j+2,k,c) )
- enddo
- enddo
- enddo
- enddo
-
- if (end(2,c) .gt. 0) then
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- j = cell_size(2,c)-3
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > ( u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) +
- > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) )
- enddo
- enddo
-
- j = cell_size(2,c)-2
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > ( u(m,i,j-2,k,c) - 4.d0*u(m,i,j-1,k,c) +
- > 5.d0*u(m,i,j,k,c) )
- enddo
- enddo
- enddo
- endif
-
-c---------------------------------------------------------------------
-c compute zeta-direction fluxes
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- wijk = ws(i,j,k,c)
- wp1 = ws(i,j,k+1,c)
- wm1 = ws(i,j,k-1,c)
-
- rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dz1tz1 *
- > (u(1,i,j,k+1,c) - 2.0d0*u(1,i,j,k,c) +
- > u(1,i,j,k-1,c)) -
- > tz2 * (u(4,i,j,k+1,c) - u(4,i,j,k-1,c))
- rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dz2tz1 *
- > (u(2,i,j,k+1,c) - 2.0d0*u(2,i,j,k,c) +
- > u(2,i,j,k-1,c)) +
- > zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) +
- > us(i,j,k-1,c)) -
- > tz2 * (u(2,i,j,k+1,c)*wp1 -
- > u(2,i,j,k-1,c)*wm1)
- rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dz3tz1 *
- > (u(3,i,j,k+1,c) - 2.0d0*u(3,i,j,k,c) +
- > u(3,i,j,k-1,c)) +
- > zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) +
- > vs(i,j,k-1,c)) -
- > tz2 * (u(3,i,j,k+1,c)*wp1 -
- > u(3,i,j,k-1,c)*wm1)
- rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dz4tz1 *
- > (u(4,i,j,k+1,c) - 2.0d0*u(4,i,j,k,c) +
- > u(4,i,j,k-1,c)) +
- > zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) -
- > tz2 * (u(4,i,j,k+1,c)*wp1 -
- > u(4,i,j,k-1,c)*wm1 +
- > (u(5,i,j,k+1,c) - square(i,j,k+1,c) -
- > u(5,i,j,k-1,c) + square(i,j,k-1,c))
- > *c2)
- rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dz5tz1 *
- > (u(5,i,j,k+1,c) - 2.0d0*u(5,i,j,k,c) +
- > u(5,i,j,k-1,c)) +
- > zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) +
- > qs(i,j,k-1,c)) +
- > zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk +
- > wm1*wm1) +
- > zzcon5 * (u(5,i,j,k+1,c)*rho_i(i,j,k+1,c) -
- > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
- > u(5,i,j,k-1,c)*rho_i(i,j,k-1,c)) -
- > tz2 * ( (c1*u(5,i,j,k+1,c) -
- > c2*square(i,j,k+1,c))*wp1 -
- > (c1*u(5,i,j,k-1,c) -
- > c2*square(i,j,k-1,c))*wm1)
- enddo
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c add fourth order zeta-direction dissipation
-c---------------------------------------------------------------------
- if (start(3,c) .gt. 0) then
- k = 1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp *
- > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) +
- > u(m,i,j,k+2,c))
- enddo
- enddo
- enddo
-
- k = 2
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > (-4.0d0*u(m,i,j,k-1,c) + 6.0d0*u(m,i,j,k,c) -
- > 4.0d0*u(m,i,j,k+1,c) + u(m,i,j,k+2,c))
- enddo
- enddo
- enddo
- endif
-
- do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c),cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > ( u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) +
- > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) +
- > u(m,i,j,k+2,c) )
- enddo
- enddo
- enddo
- enddo
-
- if (end(3,c) .gt. 0) then
- k = cell_size(3,c)-3
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > ( u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) +
- > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) )
- enddo
- enddo
- enddo
-
- k = cell_size(3,c)-2
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
- > ( u(m,i,j,k-2,c) - 4.d0*u(m,i,j,k-1,c) +
- > 5.d0*u(m,i,j,k,c) )
- enddo
- enddo
- enddo
- endif
-
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c) * dt
- enddo
- enddo
- enddo
- enddo
-
- enddo
-
- return
- end
-
-
-
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine set_constants
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- ce(1,1) = 2.0d0
- ce(1,2) = 0.0d0
- ce(1,3) = 0.0d0
- ce(1,4) = 4.0d0
- ce(1,5) = 5.0d0
- ce(1,6) = 3.0d0
- ce(1,7) = 0.5d0
- ce(1,8) = 0.02d0
- ce(1,9) = 0.01d0
- ce(1,10) = 0.03d0
- ce(1,11) = 0.5d0
- ce(1,12) = 0.4d0
- ce(1,13) = 0.3d0
-
- ce(2,1) = 1.0d0
- ce(2,2) = 0.0d0
- ce(2,3) = 0.0d0
- ce(2,4) = 0.0d0
- ce(2,5) = 1.0d0
- ce(2,6) = 2.0d0
- ce(2,7) = 3.0d0
- ce(2,8) = 0.01d0
- ce(2,9) = 0.03d0
- ce(2,10) = 0.02d0
- ce(2,11) = 0.4d0
- ce(2,12) = 0.3d0
- ce(2,13) = 0.5d0
-
- ce(3,1) = 2.0d0
- ce(3,2) = 2.0d0
- ce(3,3) = 0.0d0
- ce(3,4) = 0.0d0
- ce(3,5) = 0.0d0
- ce(3,6) = 2.0d0
- ce(3,7) = 3.0d0
- ce(3,8) = 0.04d0
- ce(3,9) = 0.03d0
- ce(3,10) = 0.05d0
- ce(3,11) = 0.3d0
- ce(3,12) = 0.5d0
- ce(3,13) = 0.4d0
-
- ce(4,1) = 2.0d0
- ce(4,2) = 2.0d0
- ce(4,3) = 0.0d0
- ce(4,4) = 0.0d0
- ce(4,5) = 0.0d0
- ce(4,6) = 2.0d0
- ce(4,7) = 3.0d0
- ce(4,8) = 0.03d0
- ce(4,9) = 0.05d0
- ce(4,10) = 0.04d0
- ce(4,11) = 0.2d0
- ce(4,12) = 0.1d0
- ce(4,13) = 0.3d0
-
- ce(5,1) = 5.0d0
- ce(5,2) = 4.0d0
- ce(5,3) = 3.0d0
- ce(5,4) = 2.0d0
- ce(5,5) = 0.1d0
- ce(5,6) = 0.4d0
- ce(5,7) = 0.3d0
- ce(5,8) = 0.05d0
- ce(5,9) = 0.04d0
- ce(5,10) = 0.03d0
- ce(5,11) = 0.1d0
- ce(5,12) = 0.3d0
- ce(5,13) = 0.2d0
-
- c1 = 1.4d0
- c2 = 0.4d0
- c3 = 0.1d0
- c4 = 1.0d0
- c5 = 1.4d0
-
- bt = dsqrt(0.5d0)
-
- dnxm1 = 1.0d0 / dble(grid_points(1)-1)
- dnym1 = 1.0d0 / dble(grid_points(2)-1)
- dnzm1 = 1.0d0 / dble(grid_points(3)-1)
-
- c1c2 = c1 * c2
- c1c5 = c1 * c5
- c3c4 = c3 * c4
- c1345 = c1c5 * c3c4
-
- conz1 = (1.0d0-c1c5)
-
- tx1 = 1.0d0 / (dnxm1 * dnxm1)
- tx2 = 1.0d0 / (2.0d0 * dnxm1)
- tx3 = 1.0d0 / dnxm1
-
- ty1 = 1.0d0 / (dnym1 * dnym1)
- ty2 = 1.0d0 / (2.0d0 * dnym1)
- ty3 = 1.0d0 / dnym1
-
- tz1 = 1.0d0 / (dnzm1 * dnzm1)
- tz2 = 1.0d0 / (2.0d0 * dnzm1)
- tz3 = 1.0d0 / dnzm1
-
- dx1 = 0.75d0
- dx2 = 0.75d0
- dx3 = 0.75d0
- dx4 = 0.75d0
- dx5 = 0.75d0
-
- dy1 = 0.75d0
- dy2 = 0.75d0
- dy3 = 0.75d0
- dy4 = 0.75d0
- dy5 = 0.75d0
-
- dz1 = 1.0d0
- dz2 = 1.0d0
- dz3 = 1.0d0
- dz4 = 1.0d0
- dz5 = 1.0d0
-
- dxmax = dmax1(dx3, dx4)
- dymax = dmax1(dy2, dy4)
- dzmax = dmax1(dz2, dz3)
-
- dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) )
-
- c4dssp = 4.0d0 * dssp
- c5dssp = 5.0d0 * dssp
-
- dttx1 = dt*tx1
- dttx2 = dt*tx2
- dtty1 = dt*ty1
- dtty2 = dt*ty2
- dttz1 = dt*tz1
- dttz2 = dt*tz2
-
- c2dttx1 = 2.0d0*dttx1
- c2dtty1 = 2.0d0*dtty1
- c2dttz1 = 2.0d0*dttz1
-
- dtdssp = dt*dssp
-
- comz1 = dtdssp
- comz4 = 4.0d0*dtdssp
- comz5 = 5.0d0*dtdssp
- comz6 = 6.0d0*dtdssp
-
- c3c4tx3 = c3c4*tx3
- c3c4ty3 = c3c4*ty3
- c3c4tz3 = c3c4*tz3
-
- dx1tx1 = dx1*tx1
- dx2tx1 = dx2*tx1
- dx3tx1 = dx3*tx1
- dx4tx1 = dx4*tx1
- dx5tx1 = dx5*tx1
-
- dy1ty1 = dy1*ty1
- dy2ty1 = dy2*ty1
- dy3ty1 = dy3*ty1
- dy4ty1 = dy4*ty1
- dy5ty1 = dy5*ty1
-
- dz1tz1 = dz1*tz1
- dz2tz1 = dz2*tz1
- dz3tz1 = dz3*tz1
- dz4tz1 = dz4*tz1
- dz5tz1 = dz5*tz1
-
- c2iv = 2.5d0
- con43 = 4.0d0/3.0d0
- con16 = 1.0d0/6.0d0
-
- xxcon1 = c3c4tx3*con43*tx3
- xxcon2 = c3c4tx3*tx3
- xxcon3 = c3c4tx3*conz1*tx3
- xxcon4 = c3c4tx3*con16*tx3
- xxcon5 = c3c4tx3*c1c5*tx3
-
- yycon1 = c3c4ty3*con43*ty3
- yycon2 = c3c4ty3*ty3
- yycon3 = c3c4ty3*conz1*ty3
- yycon4 = c3c4ty3*con16*ty3
- yycon5 = c3c4ty3*c1c5*ty3
-
- zzcon1 = c3c4tz3*con43*tz3
- zzcon2 = c3c4tz3*tz3
- zzcon3 = c3c4tz3*conz1*tz3
- zzcon4 = c3c4tz3*con16*tz3
- zzcon5 = c3c4tz3*c1c5*tz3
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine setup_mpi
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c set up MPI stuff
-c---------------------------------------------------------------------
-
- implicit none
- include 'mpinpb.h'
- include 'npbparams.h'
- integer error, color, nc
-
- call mpi_init(error)
-
- call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error)
- call mpi_comm_rank(MPI_COMM_WORLD, node, error)
-
- if (.not. convertdouble) then
- dp_type = MPI_DOUBLE_PRECISION
- else
- dp_type = MPI_REAL
- endif
-
-c---------------------------------------------------------------------
-c compute square root; add small number to allow for roundoff
-c---------------------------------------------------------------------
- nc = dint(dsqrt(dble(total_nodes) + 0.00001d0))
-
-c---------------------------------------------------------------------
-c We handle a non-square number of nodes by making the excess nodes
-c inactive. However, we can never handle more cells than were compiled
-c in.
-c---------------------------------------------------------------------
-
- if (nc .gt. maxcells) nc = maxcells
- if (node .ge. nc*nc) then
- active = .false.
- color = 1
- else
- active = .true.
- color = 0
- end if
-
- call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error)
- if (.not. active) return
-
- call mpi_comm_size(comm_setup, no_nodes, error)
- call mpi_comm_dup(comm_setup, comm_solve, error)
- call mpi_comm_dup(comm_setup, comm_rhs, error)
-
-c---------------------------------------------------------------------
-c let node 0 be the root for the group (there is only one)
-c---------------------------------------------------------------------
- root = 0
-
- return
- end
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine setup_btio
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer m, ierr
-
- iseek=0
-
- if (node .eq. root) then
- call MPI_File_delete(filenm, MPI_INFO_NULL, ierr)
- endif
-
- call MPI_Barrier(comm_solve, ierr)
-
- call MPI_File_open(comm_solve,
- $ filenm,
- $ MPI_MODE_RDWR + MPI_MODE_CREATE,
- $ MPI_INFO_NULL,
- $ fp,
- $ ierr)
-
- call MPI_File_set_view(fp,
- $ iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION,
- $ 'native', MPI_INFO_NULL, ierr)
-
- if (ierr .ne. MPI_SUCCESS) then
- print *, 'Error opening file'
- stop
- endif
-
- do m = 1, 5
- xce_sub(m) = 0.d0
- end do
-
- idump_sub = 0
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine output_timestep
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- include 'header.h'
- include 'mpinpb.h'
-
- integer count, jio, kio, cio, aio
- integer ierr
- integer mstatus(MPI_STATUS_SIZE)
-
- do cio=1,ncells
- do kio=0, cell_size(3,cio)-1
- do jio=0, cell_size(2,cio)-1
- iseek=5*(cell_low(1,cio) +
- $ PROBLEM_SIZE*((cell_low(2,cio)+jio) +
- $ PROBLEM_SIZE*((cell_low(3,cio)+kio) +
- $ PROBLEM_SIZE*idump_sub)))
-
- count=5*cell_size(1,cio)
-
- call MPI_File_write_at(fp, iseek,
- $ u(1,0,jio,kio,cio),
- $ count, MPI_DOUBLE_PRECISION,
- $ mstatus, ierr)
-
- if (ierr .ne. MPI_SUCCESS) then
- print *, 'Error writing to file'
- stop
- endif
- enddo
- enddo
- enddo
-
- idump_sub = idump_sub + 1
- if (rd_interval .gt. 0) then
- if (idump_sub .ge. rd_interval) then
-
- call acc_sub_norms(idump+1)
-
- idump_sub = 0
- endif
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine acc_sub_norms(idump_cur)
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer idump_cur
-
- integer count, jio, kio, cio, ii, m, ichunk
- integer ierr
- integer mstatus(MPI_STATUS_SIZE)
- double precision xce_single(5)
-
- ichunk = idump_cur - idump_sub + 1
- do ii=0, idump_sub-1
- do cio=1,ncells
- do kio=0, cell_size(3,cio)-1
- do jio=0, cell_size(2,cio)-1
- iseek=5*(cell_low(1,cio) +
- $ PROBLEM_SIZE*((cell_low(2,cio)+jio) +
- $ PROBLEM_SIZE*((cell_low(3,cio)+kio) +
- $ PROBLEM_SIZE*ii)))
-
- count=5*cell_size(1,cio)
-
- call MPI_File_read_at(fp, iseek,
- $ u(1,0,jio,kio,cio),
- $ count, MPI_DOUBLE_PRECISION,
- $ mstatus, ierr)
-
- if (ierr .ne. MPI_SUCCESS) then
- print *, 'Error reading back file'
- call MPI_File_close(fp, ierr)
- stop
- endif
- enddo
- enddo
- enddo
-
- if (node .eq. root) print *, 'Reading data set ', ii+ichunk
-
- call error_norm(xce_single)
- do m = 1, 5
- xce_sub(m) = xce_sub(m) + xce_single(m)
- end do
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine btio_cleanup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer ierr
-
- call MPI_File_close(fp, ierr)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine accumulate_norms(xce_acc)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- double precision xce_acc(5)
- integer m, ierr
-
- if (rd_interval .gt. 0) goto 20
-
- call MPI_File_open(comm_solve,
- $ filenm,
- $ MPI_MODE_RDONLY,
- $ MPI_INFO_NULL,
- $ fp,
- $ ierr)
-
- iseek = 0
- call MPI_File_set_view(fp,
- $ iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION,
- $ 'native', MPI_INFO_NULL, ierr)
-
-c clear the last time step
-
- call clear_timestep
-
-c read back the time steps and accumulate norms
-
- call acc_sub_norms(idump)
-
- call MPI_File_close(fp, ierr)
-
- 20 continue
- do m = 1, 5
- xce_acc(m) = xce_sub(m) / dble(idump)
- end do
-
- return
- end
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine matvec_sub(ablock,avec,bvec)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c subtracts bvec=bvec - ablock*avec
-c---------------------------------------------------------------------
-
- implicit none
-
- double precision ablock,avec,bvec
- dimension ablock(5,5),avec(5),bvec(5)
-
-c---------------------------------------------------------------------
-c rhs(i,ic,jc,kc,ccell) = rhs(i,ic,jc,kc,ccell)
-c $ - lhs(i,1,ablock,ia,ja,ka,acell)*
-c---------------------------------------------------------------------
- bvec(1) = bvec(1) - ablock(1,1)*avec(1)
- > - ablock(1,2)*avec(2)
- > - ablock(1,3)*avec(3)
- > - ablock(1,4)*avec(4)
- > - ablock(1,5)*avec(5)
- bvec(2) = bvec(2) - ablock(2,1)*avec(1)
- > - ablock(2,2)*avec(2)
- > - ablock(2,3)*avec(3)
- > - ablock(2,4)*avec(4)
- > - ablock(2,5)*avec(5)
- bvec(3) = bvec(3) - ablock(3,1)*avec(1)
- > - ablock(3,2)*avec(2)
- > - ablock(3,3)*avec(3)
- > - ablock(3,4)*avec(4)
- > - ablock(3,5)*avec(5)
- bvec(4) = bvec(4) - ablock(4,1)*avec(1)
- > - ablock(4,2)*avec(2)
- > - ablock(4,3)*avec(3)
- > - ablock(4,4)*avec(4)
- > - ablock(4,5)*avec(5)
- bvec(5) = bvec(5) - ablock(5,1)*avec(1)
- > - ablock(5,2)*avec(2)
- > - ablock(5,3)*avec(3)
- > - ablock(5,4)*avec(4)
- > - ablock(5,5)*avec(5)
-
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine matmul_sub(ablock, bblock, cblock)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c subtracts a(i,j,k) X b(i,j,k) from c(i,j,k)
-c---------------------------------------------------------------------
-
- implicit none
-
- double precision ablock, bblock, cblock
- dimension ablock(5,5), bblock(5,5), cblock(5,5)
-
-
- cblock(1,1) = cblock(1,1) - ablock(1,1)*bblock(1,1)
- > - ablock(1,2)*bblock(2,1)
- > - ablock(1,3)*bblock(3,1)
- > - ablock(1,4)*bblock(4,1)
- > - ablock(1,5)*bblock(5,1)
- cblock(2,1) = cblock(2,1) - ablock(2,1)*bblock(1,1)
- > - ablock(2,2)*bblock(2,1)
- > - ablock(2,3)*bblock(3,1)
- > - ablock(2,4)*bblock(4,1)
- > - ablock(2,5)*bblock(5,1)
- cblock(3,1) = cblock(3,1) - ablock(3,1)*bblock(1,1)
- > - ablock(3,2)*bblock(2,1)
- > - ablock(3,3)*bblock(3,1)
- > - ablock(3,4)*bblock(4,1)
- > - ablock(3,5)*bblock(5,1)
- cblock(4,1) = cblock(4,1) - ablock(4,1)*bblock(1,1)
- > - ablock(4,2)*bblock(2,1)
- > - ablock(4,3)*bblock(3,1)
- > - ablock(4,4)*bblock(4,1)
- > - ablock(4,5)*bblock(5,1)
- cblock(5,1) = cblock(5,1) - ablock(5,1)*bblock(1,1)
- > - ablock(5,2)*bblock(2,1)
- > - ablock(5,3)*bblock(3,1)
- > - ablock(5,4)*bblock(4,1)
- > - ablock(5,5)*bblock(5,1)
- cblock(1,2) = cblock(1,2) - ablock(1,1)*bblock(1,2)
- > - ablock(1,2)*bblock(2,2)
- > - ablock(1,3)*bblock(3,2)
- > - ablock(1,4)*bblock(4,2)
- > - ablock(1,5)*bblock(5,2)
- cblock(2,2) = cblock(2,2) - ablock(2,1)*bblock(1,2)
- > - ablock(2,2)*bblock(2,2)
- > - ablock(2,3)*bblock(3,2)
- > - ablock(2,4)*bblock(4,2)
- > - ablock(2,5)*bblock(5,2)
- cblock(3,2) = cblock(3,2) - ablock(3,1)*bblock(1,2)
- > - ablock(3,2)*bblock(2,2)
- > - ablock(3,3)*bblock(3,2)
- > - ablock(3,4)*bblock(4,2)
- > - ablock(3,5)*bblock(5,2)
- cblock(4,2) = cblock(4,2) - ablock(4,1)*bblock(1,2)
- > - ablock(4,2)*bblock(2,2)
- > - ablock(4,3)*bblock(3,2)
- > - ablock(4,4)*bblock(4,2)
- > - ablock(4,5)*bblock(5,2)
- cblock(5,2) = cblock(5,2) - ablock(5,1)*bblock(1,2)
- > - ablock(5,2)*bblock(2,2)
- > - ablock(5,3)*bblock(3,2)
- > - ablock(5,4)*bblock(4,2)
- > - ablock(5,5)*bblock(5,2)
- cblock(1,3) = cblock(1,3) - ablock(1,1)*bblock(1,3)
- > - ablock(1,2)*bblock(2,3)
- > - ablock(1,3)*bblock(3,3)
- > - ablock(1,4)*bblock(4,3)
- > - ablock(1,5)*bblock(5,3)
- cblock(2,3) = cblock(2,3) - ablock(2,1)*bblock(1,3)
- > - ablock(2,2)*bblock(2,3)
- > - ablock(2,3)*bblock(3,3)
- > - ablock(2,4)*bblock(4,3)
- > - ablock(2,5)*bblock(5,3)
- cblock(3,3) = cblock(3,3) - ablock(3,1)*bblock(1,3)
- > - ablock(3,2)*bblock(2,3)
- > - ablock(3,3)*bblock(3,3)
- > - ablock(3,4)*bblock(4,3)
- > - ablock(3,5)*bblock(5,3)
- cblock(4,3) = cblock(4,3) - ablock(4,1)*bblock(1,3)
- > - ablock(4,2)*bblock(2,3)
- > - ablock(4,3)*bblock(3,3)
- > - ablock(4,4)*bblock(4,3)
- > - ablock(4,5)*bblock(5,3)
- cblock(5,3) = cblock(5,3) - ablock(5,1)*bblock(1,3)
- > - ablock(5,2)*bblock(2,3)
- > - ablock(5,3)*bblock(3,3)
- > - ablock(5,4)*bblock(4,3)
- > - ablock(5,5)*bblock(5,3)
- cblock(1,4) = cblock(1,4) - ablock(1,1)*bblock(1,4)
- > - ablock(1,2)*bblock(2,4)
- > - ablock(1,3)*bblock(3,4)
- > - ablock(1,4)*bblock(4,4)
- > - ablock(1,5)*bblock(5,4)
- cblock(2,4) = cblock(2,4) - ablock(2,1)*bblock(1,4)
- > - ablock(2,2)*bblock(2,4)
- > - ablock(2,3)*bblock(3,4)
- > - ablock(2,4)*bblock(4,4)
- > - ablock(2,5)*bblock(5,4)
- cblock(3,4) = cblock(3,4) - ablock(3,1)*bblock(1,4)
- > - ablock(3,2)*bblock(2,4)
- > - ablock(3,3)*bblock(3,4)
- > - ablock(3,4)*bblock(4,4)
- > - ablock(3,5)*bblock(5,4)
- cblock(4,4) = cblock(4,4) - ablock(4,1)*bblock(1,4)
- > - ablock(4,2)*bblock(2,4)
- > - ablock(4,3)*bblock(3,4)
- > - ablock(4,4)*bblock(4,4)
- > - ablock(4,5)*bblock(5,4)
- cblock(5,4) = cblock(5,4) - ablock(5,1)*bblock(1,4)
- > - ablock(5,2)*bblock(2,4)
- > - ablock(5,3)*bblock(3,4)
- > - ablock(5,4)*bblock(4,4)
- > - ablock(5,5)*bblock(5,4)
- cblock(1,5) = cblock(1,5) - ablock(1,1)*bblock(1,5)
- > - ablock(1,2)*bblock(2,5)
- > - ablock(1,3)*bblock(3,5)
- > - ablock(1,4)*bblock(4,5)
- > - ablock(1,5)*bblock(5,5)
- cblock(2,5) = cblock(2,5) - ablock(2,1)*bblock(1,5)
- > - ablock(2,2)*bblock(2,5)
- > - ablock(2,3)*bblock(3,5)
- > - ablock(2,4)*bblock(4,5)
- > - ablock(2,5)*bblock(5,5)
- cblock(3,5) = cblock(3,5) - ablock(3,1)*bblock(1,5)
- > - ablock(3,2)*bblock(2,5)
- > - ablock(3,3)*bblock(3,5)
- > - ablock(3,4)*bblock(4,5)
- > - ablock(3,5)*bblock(5,5)
- cblock(4,5) = cblock(4,5) - ablock(4,1)*bblock(1,5)
- > - ablock(4,2)*bblock(2,5)
- > - ablock(4,3)*bblock(3,5)
- > - ablock(4,4)*bblock(4,5)
- > - ablock(4,5)*bblock(5,5)
- cblock(5,5) = cblock(5,5) - ablock(5,1)*bblock(1,5)
- > - ablock(5,2)*bblock(2,5)
- > - ablock(5,3)*bblock(3,5)
- > - ablock(5,4)*bblock(4,5)
- > - ablock(5,5)*bblock(5,5)
-
-
- return
- end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine binvcrhs( lhs,c,r )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c---------------------------------------------------------------------
-
- implicit none
-
- double precision pivot, coeff, lhs
- dimension lhs(5,5)
- double precision c(5,5), r(5)
-
-c---------------------------------------------------------------------
-c
-c---------------------------------------------------------------------
-
- pivot = 1.00d0/lhs(1,1)
- lhs(1,2) = lhs(1,2)*pivot
- lhs(1,3) = lhs(1,3)*pivot
- lhs(1,4) = lhs(1,4)*pivot
- lhs(1,5) = lhs(1,5)*pivot
- c(1,1) = c(1,1)*pivot
- c(1,2) = c(1,2)*pivot
- c(1,3) = c(1,3)*pivot
- c(1,4) = c(1,4)*pivot
- c(1,5) = c(1,5)*pivot
- r(1) = r(1) *pivot
-
- coeff = lhs(2,1)
- lhs(2,2)= lhs(2,2) - coeff*lhs(1,2)
- lhs(2,3)= lhs(2,3) - coeff*lhs(1,3)
- lhs(2,4)= lhs(2,4) - coeff*lhs(1,4)
- lhs(2,5)= lhs(2,5) - coeff*lhs(1,5)
- c(2,1) = c(2,1) - coeff*c(1,1)
- c(2,2) = c(2,2) - coeff*c(1,2)
- c(2,3) = c(2,3) - coeff*c(1,3)
- c(2,4) = c(2,4) - coeff*c(1,4)
- c(2,5) = c(2,5) - coeff*c(1,5)
- r(2) = r(2) - coeff*r(1)
-
- coeff = lhs(3,1)
- lhs(3,2)= lhs(3,2) - coeff*lhs(1,2)
- lhs(3,3)= lhs(3,3) - coeff*lhs(1,3)
- lhs(3,4)= lhs(3,4) - coeff*lhs(1,4)
- lhs(3,5)= lhs(3,5) - coeff*lhs(1,5)
- c(3,1) = c(3,1) - coeff*c(1,1)
- c(3,2) = c(3,2) - coeff*c(1,2)
- c(3,3) = c(3,3) - coeff*c(1,3)
- c(3,4) = c(3,4) - coeff*c(1,4)
- c(3,5) = c(3,5) - coeff*c(1,5)
- r(3) = r(3) - coeff*r(1)
-
- coeff = lhs(4,1)
- lhs(4,2)= lhs(4,2) - coeff*lhs(1,2)
- lhs(4,3)= lhs(4,3) - coeff*lhs(1,3)
- lhs(4,4)= lhs(4,4) - coeff*lhs(1,4)
- lhs(4,5)= lhs(4,5) - coeff*lhs(1,5)
- c(4,1) = c(4,1) - coeff*c(1,1)
- c(4,2) = c(4,2) - coeff*c(1,2)
- c(4,3) = c(4,3) - coeff*c(1,3)
- c(4,4) = c(4,4) - coeff*c(1,4)
- c(4,5) = c(4,5) - coeff*c(1,5)
- r(4) = r(4) - coeff*r(1)
-
- coeff = lhs(5,1)
- lhs(5,2)= lhs(5,2) - coeff*lhs(1,2)
- lhs(5,3)= lhs(5,3) - coeff*lhs(1,3)
- lhs(5,4)= lhs(5,4) - coeff*lhs(1,4)
- lhs(5,5)= lhs(5,5) - coeff*lhs(1,5)
- c(5,1) = c(5,1) - coeff*c(1,1)
- c(5,2) = c(5,2) - coeff*c(1,2)
- c(5,3) = c(5,3) - coeff*c(1,3)
- c(5,4) = c(5,4) - coeff*c(1,4)
- c(5,5) = c(5,5) - coeff*c(1,5)
- r(5) = r(5) - coeff*r(1)
-
-
- pivot = 1.00d0/lhs(2,2)
- lhs(2,3) = lhs(2,3)*pivot
- lhs(2,4) = lhs(2,4)*pivot
- lhs(2,5) = lhs(2,5)*pivot
- c(2,1) = c(2,1)*pivot
- c(2,2) = c(2,2)*pivot
- c(2,3) = c(2,3)*pivot
- c(2,4) = c(2,4)*pivot
- c(2,5) = c(2,5)*pivot
- r(2) = r(2) *pivot
-
- coeff = lhs(1,2)
- lhs(1,3)= lhs(1,3) - coeff*lhs(2,3)
- lhs(1,4)= lhs(1,4) - coeff*lhs(2,4)
- lhs(1,5)= lhs(1,5) - coeff*lhs(2,5)
- c(1,1) = c(1,1) - coeff*c(2,1)
- c(1,2) = c(1,2) - coeff*c(2,2)
- c(1,3) = c(1,3) - coeff*c(2,3)
- c(1,4) = c(1,4) - coeff*c(2,4)
- c(1,5) = c(1,5) - coeff*c(2,5)
- r(1) = r(1) - coeff*r(2)
-
- coeff = lhs(3,2)
- lhs(3,3)= lhs(3,3) - coeff*lhs(2,3)
- lhs(3,4)= lhs(3,4) - coeff*lhs(2,4)
- lhs(3,5)= lhs(3,5) - coeff*lhs(2,5)
- c(3,1) = c(3,1) - coeff*c(2,1)
- c(3,2) = c(3,2) - coeff*c(2,2)
- c(3,3) = c(3,3) - coeff*c(2,3)
- c(3,4) = c(3,4) - coeff*c(2,4)
- c(3,5) = c(3,5) - coeff*c(2,5)
- r(3) = r(3) - coeff*r(2)
-
- coeff = lhs(4,2)
- lhs(4,3)= lhs(4,3) - coeff*lhs(2,3)
- lhs(4,4)= lhs(4,4) - coeff*lhs(2,4)
- lhs(4,5)= lhs(4,5) - coeff*lhs(2,5)
- c(4,1) = c(4,1) - coeff*c(2,1)
- c(4,2) = c(4,2) - coeff*c(2,2)
- c(4,3) = c(4,3) - coeff*c(2,3)
- c(4,4) = c(4,4) - coeff*c(2,4)
- c(4,5) = c(4,5) - coeff*c(2,5)
- r(4) = r(4) - coeff*r(2)
-
- coeff = lhs(5,2)
- lhs(5,3)= lhs(5,3) - coeff*lhs(2,3)
- lhs(5,4)= lhs(5,4) - coeff*lhs(2,4)
- lhs(5,5)= lhs(5,5) - coeff*lhs(2,5)
- c(5,1) = c(5,1) - coeff*c(2,1)
- c(5,2) = c(5,2) - coeff*c(2,2)
- c(5,3) = c(5,3) - coeff*c(2,3)
- c(5,4) = c(5,4) - coeff*c(2,4)
- c(5,5) = c(5,5) - coeff*c(2,5)
- r(5) = r(5) - coeff*r(2)
-
-
- pivot = 1.00d0/lhs(3,3)
- lhs(3,4) = lhs(3,4)*pivot
- lhs(3,5) = lhs(3,5)*pivot
- c(3,1) = c(3,1)*pivot
- c(3,2) = c(3,2)*pivot
- c(3,3) = c(3,3)*pivot
- c(3,4) = c(3,4)*pivot
- c(3,5) = c(3,5)*pivot
- r(3) = r(3) *pivot
-
- coeff = lhs(1,3)
- lhs(1,4)= lhs(1,4) - coeff*lhs(3,4)
- lhs(1,5)= lhs(1,5) - coeff*lhs(3,5)
- c(1,1) = c(1,1) - coeff*c(3,1)
- c(1,2) = c(1,2) - coeff*c(3,2)
- c(1,3) = c(1,3) - coeff*c(3,3)
- c(1,4) = c(1,4) - coeff*c(3,4)
- c(1,5) = c(1,5) - coeff*c(3,5)
- r(1) = r(1) - coeff*r(3)
-
- coeff = lhs(2,3)
- lhs(2,4)= lhs(2,4) - coeff*lhs(3,4)
- lhs(2,5)= lhs(2,5) - coeff*lhs(3,5)
- c(2,1) = c(2,1) - coeff*c(3,1)
- c(2,2) = c(2,2) - coeff*c(3,2)
- c(2,3) = c(2,3) - coeff*c(3,3)
- c(2,4) = c(2,4) - coeff*c(3,4)
- c(2,5) = c(2,5) - coeff*c(3,5)
- r(2) = r(2) - coeff*r(3)
-
- coeff = lhs(4,3)
- lhs(4,4)= lhs(4,4) - coeff*lhs(3,4)
- lhs(4,5)= lhs(4,5) - coeff*lhs(3,5)
- c(4,1) = c(4,1) - coeff*c(3,1)
- c(4,2) = c(4,2) - coeff*c(3,2)
- c(4,3) = c(4,3) - coeff*c(3,3)
- c(4,4) = c(4,4) - coeff*c(3,4)
- c(4,5) = c(4,5) - coeff*c(3,5)
- r(4) = r(4) - coeff*r(3)
-
- coeff = lhs(5,3)
- lhs(5,4)= lhs(5,4) - coeff*lhs(3,4)
- lhs(5,5)= lhs(5,5) - coeff*lhs(3,5)
- c(5,1) = c(5,1) - coeff*c(3,1)
- c(5,2) = c(5,2) - coeff*c(3,2)
- c(5,3) = c(5,3) - coeff*c(3,3)
- c(5,4) = c(5,4) - coeff*c(3,4)
- c(5,5) = c(5,5) - coeff*c(3,5)
- r(5) = r(5) - coeff*r(3)
-
-
- pivot = 1.00d0/lhs(4,4)
- lhs(4,5) = lhs(4,5)*pivot
- c(4,1) = c(4,1)*pivot
- c(4,2) = c(4,2)*pivot
- c(4,3) = c(4,3)*pivot
- c(4,4) = c(4,4)*pivot
- c(4,5) = c(4,5)*pivot
- r(4) = r(4) *pivot
-
- coeff = lhs(1,4)
- lhs(1,5)= lhs(1,5) - coeff*lhs(4,5)
- c(1,1) = c(1,1) - coeff*c(4,1)
- c(1,2) = c(1,2) - coeff*c(4,2)
- c(1,3) = c(1,3) - coeff*c(4,3)
- c(1,4) = c(1,4) - coeff*c(4,4)
- c(1,5) = c(1,5) - coeff*c(4,5)
- r(1) = r(1) - coeff*r(4)
-
- coeff = lhs(2,4)
- lhs(2,5)= lhs(2,5) - coeff*lhs(4,5)
- c(2,1) = c(2,1) - coeff*c(4,1)
- c(2,2) = c(2,2) - coeff*c(4,2)
- c(2,3) = c(2,3) - coeff*c(4,3)
- c(2,4) = c(2,4) - coeff*c(4,4)
- c(2,5) = c(2,5) - coeff*c(4,5)
- r(2) = r(2) - coeff*r(4)
-
- coeff = lhs(3,4)
- lhs(3,5)= lhs(3,5) - coeff*lhs(4,5)
- c(3,1) = c(3,1) - coeff*c(4,1)
- c(3,2) = c(3,2) - coeff*c(4,2)
- c(3,3) = c(3,3) - coeff*c(4,3)
- c(3,4) = c(3,4) - coeff*c(4,4)
- c(3,5) = c(3,5) - coeff*c(4,5)
- r(3) = r(3) - coeff*r(4)
-
- coeff = lhs(5,4)
- lhs(5,5)= lhs(5,5) - coeff*lhs(4,5)
- c(5,1) = c(5,1) - coeff*c(4,1)
- c(5,2) = c(5,2) - coeff*c(4,2)
- c(5,3) = c(5,3) - coeff*c(4,3)
- c(5,4) = c(5,4) - coeff*c(4,4)
- c(5,5) = c(5,5) - coeff*c(4,5)
- r(5) = r(5) - coeff*r(4)
-
-
- pivot = 1.00d0/lhs(5,5)
- c(5,1) = c(5,1)*pivot
- c(5,2) = c(5,2)*pivot
- c(5,3) = c(5,3)*pivot
- c(5,4) = c(5,4)*pivot
- c(5,5) = c(5,5)*pivot
- r(5) = r(5) *pivot
-
- coeff = lhs(1,5)
- c(1,1) = c(1,1) - coeff*c(5,1)
- c(1,2) = c(1,2) - coeff*c(5,2)
- c(1,3) = c(1,3) - coeff*c(5,3)
- c(1,4) = c(1,4) - coeff*c(5,4)
- c(1,5) = c(1,5) - coeff*c(5,5)
- r(1) = r(1) - coeff*r(5)
-
- coeff = lhs(2,5)
- c(2,1) = c(2,1) - coeff*c(5,1)
- c(2,2) = c(2,2) - coeff*c(5,2)
- c(2,3) = c(2,3) - coeff*c(5,3)
- c(2,4) = c(2,4) - coeff*c(5,4)
- c(2,5) = c(2,5) - coeff*c(5,5)
- r(2) = r(2) - coeff*r(5)
-
- coeff = lhs(3,5)
- c(3,1) = c(3,1) - coeff*c(5,1)
- c(3,2) = c(3,2) - coeff*c(5,2)
- c(3,3) = c(3,3) - coeff*c(5,3)
- c(3,4) = c(3,4) - coeff*c(5,4)
- c(3,5) = c(3,5) - coeff*c(5,5)
- r(3) = r(3) - coeff*r(5)
-
- coeff = lhs(4,5)
- c(4,1) = c(4,1) - coeff*c(5,1)
- c(4,2) = c(4,2) - coeff*c(5,2)
- c(4,3) = c(4,3) - coeff*c(5,3)
- c(4,4) = c(4,4) - coeff*c(5,4)
- c(4,5) = c(4,5) - coeff*c(5,5)
- r(4) = r(4) - coeff*r(5)
-
-
- return
- end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine binvrhs( lhs,r )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c---------------------------------------------------------------------
-
- implicit none
-
- double precision pivot, coeff, lhs
- dimension lhs(5,5)
- double precision r(5)
-
-c---------------------------------------------------------------------
-c
-c---------------------------------------------------------------------
-
-
- pivot = 1.00d0/lhs(1,1)
- lhs(1,2) = lhs(1,2)*pivot
- lhs(1,3) = lhs(1,3)*pivot
- lhs(1,4) = lhs(1,4)*pivot
- lhs(1,5) = lhs(1,5)*pivot
- r(1) = r(1) *pivot
-
- coeff = lhs(2,1)
- lhs(2,2)= lhs(2,2) - coeff*lhs(1,2)
- lhs(2,3)= lhs(2,3) - coeff*lhs(1,3)
- lhs(2,4)= lhs(2,4) - coeff*lhs(1,4)
- lhs(2,5)= lhs(2,5) - coeff*lhs(1,5)
- r(2) = r(2) - coeff*r(1)
-
- coeff = lhs(3,1)
- lhs(3,2)= lhs(3,2) - coeff*lhs(1,2)
- lhs(3,3)= lhs(3,3) - coeff*lhs(1,3)
- lhs(3,4)= lhs(3,4) - coeff*lhs(1,4)
- lhs(3,5)= lhs(3,5) - coeff*lhs(1,5)
- r(3) = r(3) - coeff*r(1)
-
- coeff = lhs(4,1)
- lhs(4,2)= lhs(4,2) - coeff*lhs(1,2)
- lhs(4,3)= lhs(4,3) - coeff*lhs(1,3)
- lhs(4,4)= lhs(4,4) - coeff*lhs(1,4)
- lhs(4,5)= lhs(4,5) - coeff*lhs(1,5)
- r(4) = r(4) - coeff*r(1)
-
- coeff = lhs(5,1)
- lhs(5,2)= lhs(5,2) - coeff*lhs(1,2)
- lhs(5,3)= lhs(5,3) - coeff*lhs(1,3)
- lhs(5,4)= lhs(5,4) - coeff*lhs(1,4)
- lhs(5,5)= lhs(5,5) - coeff*lhs(1,5)
- r(5) = r(5) - coeff*r(1)
-
-
- pivot = 1.00d0/lhs(2,2)
- lhs(2,3) = lhs(2,3)*pivot
- lhs(2,4) = lhs(2,4)*pivot
- lhs(2,5) = lhs(2,5)*pivot
- r(2) = r(2) *pivot
-
- coeff = lhs(1,2)
- lhs(1,3)= lhs(1,3) - coeff*lhs(2,3)
- lhs(1,4)= lhs(1,4) - coeff*lhs(2,4)
- lhs(1,5)= lhs(1,5) - coeff*lhs(2,5)
- r(1) = r(1) - coeff*r(2)
-
- coeff = lhs(3,2)
- lhs(3,3)= lhs(3,3) - coeff*lhs(2,3)
- lhs(3,4)= lhs(3,4) - coeff*lhs(2,4)
- lhs(3,5)= lhs(3,5) - coeff*lhs(2,5)
- r(3) = r(3) - coeff*r(2)
-
- coeff = lhs(4,2)
- lhs(4,3)= lhs(4,3) - coeff*lhs(2,3)
- lhs(4,4)= lhs(4,4) - coeff*lhs(2,4)
- lhs(4,5)= lhs(4,5) - coeff*lhs(2,5)
- r(4) = r(4) - coeff*r(2)
-
- coeff = lhs(5,2)
- lhs(5,3)= lhs(5,3) - coeff*lhs(2,3)
- lhs(5,4)= lhs(5,4) - coeff*lhs(2,4)
- lhs(5,5)= lhs(5,5) - coeff*lhs(2,5)
- r(5) = r(5) - coeff*r(2)
-
-
- pivot = 1.00d0/lhs(3,3)
- lhs(3,4) = lhs(3,4)*pivot
- lhs(3,5) = lhs(3,5)*pivot
- r(3) = r(3) *pivot
-
- coeff = lhs(1,3)
- lhs(1,4)= lhs(1,4) - coeff*lhs(3,4)
- lhs(1,5)= lhs(1,5) - coeff*lhs(3,5)
- r(1) = r(1) - coeff*r(3)
-
- coeff = lhs(2,3)
- lhs(2,4)= lhs(2,4) - coeff*lhs(3,4)
- lhs(2,5)= lhs(2,5) - coeff*lhs(3,5)
- r(2) = r(2) - coeff*r(3)
-
- coeff = lhs(4,3)
- lhs(4,4)= lhs(4,4) - coeff*lhs(3,4)
- lhs(4,5)= lhs(4,5) - coeff*lhs(3,5)
- r(4) = r(4) - coeff*r(3)
-
- coeff = lhs(5,3)
- lhs(5,4)= lhs(5,4) - coeff*lhs(3,4)
- lhs(5,5)= lhs(5,5) - coeff*lhs(3,5)
- r(5) = r(5) - coeff*r(3)
-
-
- pivot = 1.00d0/lhs(4,4)
- lhs(4,5) = lhs(4,5)*pivot
- r(4) = r(4) *pivot
-
- coeff = lhs(1,4)
- lhs(1,5)= lhs(1,5) - coeff*lhs(4,5)
- r(1) = r(1) - coeff*r(4)
-
- coeff = lhs(2,4)
- lhs(2,5)= lhs(2,5) - coeff*lhs(4,5)
- r(2) = r(2) - coeff*r(4)
-
- coeff = lhs(3,4)
- lhs(3,5)= lhs(3,5) - coeff*lhs(4,5)
- r(3) = r(3) - coeff*r(4)
-
- coeff = lhs(5,4)
- lhs(5,5)= lhs(5,5) - coeff*lhs(4,5)
- r(5) = r(5) - coeff*r(4)
-
-
- pivot = 1.00d0/lhs(5,5)
- r(5) = r(5) *pivot
-
- coeff = lhs(1,5)
- r(1) = r(1) - coeff*r(5)
-
- coeff = lhs(2,5)
- r(2) = r(2) - coeff*r(5)
-
- coeff = lhs(3,5)
- r(3) = r(3) - coeff*r(5)
-
- coeff = lhs(4,5)
- r(4) = r(4) - coeff*r(5)
-
-
- return
- end
-
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine verify(no_time_steps, class, verified)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c verification routine
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5),
- > epsilon, xce(5), xcr(5), dtref
- integer m, no_time_steps
- character class
- logical verified
-
-c---------------------------------------------------------------------
-c tolerance level
-c---------------------------------------------------------------------
- epsilon = 1.0d-08
- verified = .true.
-
-c---------------------------------------------------------------------
-c compute the error norm and the residual norm, and exit if not printing
-c---------------------------------------------------------------------
-
- if (iotype .ne. 0) then
- call accumulate_norms(xce)
- else
- call error_norm(xce)
- endif
-
- call copy_faces
-
- call rhs_norm(xcr)
-
- do m = 1, 5
- xcr(m) = xcr(m) / dt
- enddo
-
- if (node .ne. 0) return
-
- class = 'U'
-
- do m = 1,5
- xcrref(m) = 1.0
- xceref(m) = 1.0
- end do
-
-c---------------------------------------------------------------------
-c reference data for 12X12X12 grids after 60 time steps, with DT = 1.0d-02
-c---------------------------------------------------------------------
- if ( (grid_points(1) .eq. 12 ) .and.
- > (grid_points(2) .eq. 12 ) .and.
- > (grid_points(3) .eq. 12 ) .and.
- > (no_time_steps .eq. 60 )) then
-
- class = 'S'
- dtref = 1.0d-2
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 1.7034283709541311d-01
- xcrref(2) = 1.2975252070034097d-02
- xcrref(3) = 3.2527926989486055d-02
- xcrref(4) = 2.6436421275166801d-02
- xcrref(5) = 1.9211784131744430d-01
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
- if (iotype .eq. 0) then
- xceref(1) = 4.9976913345811579d-04
- xceref(2) = 4.5195666782961927d-05
- xceref(3) = 7.3973765172921357d-05
- xceref(4) = 7.3821238632439731d-05
- xceref(5) = 8.9269630987491446d-04
- else
- xceref(1) = 0.1149036328945d+02
- xceref(2) = 0.9156788904727d+00
- xceref(3) = 0.2857899428614d+01
- xceref(4) = 0.2598273346734d+01
- xceref(5) = 0.2652795397547d+02
- endif
-
-c---------------------------------------------------------------------
-c reference data for 24X24X24 grids after 200 time steps, with DT = 0.8d-3
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 24) .and.
- > (grid_points(2) .eq. 24) .and.
- > (grid_points(3) .eq. 24) .and.
- > (no_time_steps . eq. 200) ) then
-
- class = 'W'
- dtref = 0.8d-3
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 0.1125590409344d+03
- xcrref(2) = 0.1180007595731d+02
- xcrref(3) = 0.2710329767846d+02
- xcrref(4) = 0.2469174937669d+02
- xcrref(5) = 0.2638427874317d+03
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
- if (iotype .eq. 0) then
- xceref(1) = 0.4419655736008d+01
- xceref(2) = 0.4638531260002d+00
- xceref(3) = 0.1011551749967d+01
- xceref(4) = 0.9235878729944d+00
- xceref(5) = 0.1018045837718d+02
- else
- xceref(1) = 0.6729594398612d+02
- xceref(2) = 0.5264523081690d+01
- xceref(3) = 0.1677107142637d+02
- xceref(4) = 0.1508721463436d+02
- xceref(5) = 0.1477018363393d+03
- endif
-
-
-c---------------------------------------------------------------------
-c reference data for 64X64X64 grids after 200 time steps, with DT = 0.8d-3
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 64) .and.
- > (grid_points(2) .eq. 64) .and.
- > (grid_points(3) .eq. 64) .and.
- > (no_time_steps . eq. 200) ) then
-
- class = 'A'
- dtref = 0.8d-3
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 1.0806346714637264d+02
- xcrref(2) = 1.1319730901220813d+01
- xcrref(3) = 2.5974354511582465d+01
- xcrref(4) = 2.3665622544678910d+01
- xcrref(5) = 2.5278963211748344d+02
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
- if (iotype .eq. 0) then
- xceref(1) = 4.2348416040525025d+00
- xceref(2) = 4.4390282496995698d-01
- xceref(3) = 9.6692480136345650d-01
- xceref(4) = 8.8302063039765474d-01
- xceref(5) = 9.7379901770829278d+00
- else
- xceref(1) = 0.6482218724961d+02
- xceref(2) = 0.5066461714527d+01
- xceref(3) = 0.1613931961359d+02
- xceref(4) = 0.1452010201481d+02
- xceref(5) = 0.1420099377681d+03
- endif
-
-c---------------------------------------------------------------------
-c reference data for 102X102X102 grids after 200 time steps,
-c with DT = 3.0d-04
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 102) .and.
- > (grid_points(2) .eq. 102) .and.
- > (grid_points(3) .eq. 102) .and.
- > (no_time_steps . eq. 200) ) then
-
- class = 'B'
- dtref = 3.0d-4
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 1.4233597229287254d+03
- xcrref(2) = 9.9330522590150238d+01
- xcrref(3) = 3.5646025644535285d+02
- xcrref(4) = 3.2485447959084092d+02
- xcrref(5) = 3.2707541254659363d+03
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
- if (iotype .eq. 0) then
- xceref(1) = 5.2969847140936856d+01
- xceref(2) = 4.4632896115670668d+00
- xceref(3) = 1.3122573342210174d+01
- xceref(4) = 1.2006925323559144d+01
- xceref(5) = 1.2459576151035986d+02
- else
- xceref(1) = 0.1477545106464d+03
- xceref(2) = 0.1108895555053d+02
- xceref(3) = 0.3698065590331d+02
- xceref(4) = 0.3310505581440d+02
- xceref(5) = 0.3157928282563d+03
- endif
-
-c---------------------------------------------------------------------
-c reference data for 162X162X162 grids after 200 time steps,
-c with DT = 1.0d-04
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 162) .and.
- > (grid_points(2) .eq. 162) .and.
- > (grid_points(3) .eq. 162) .and.
- > (no_time_steps . eq. 200) ) then
-
- class = 'C'
- dtref = 1.0d-4
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 0.62398116551764615d+04
- xcrref(2) = 0.50793239190423964d+03
- xcrref(3) = 0.15423530093013596d+04
- xcrref(4) = 0.13302387929291190d+04
- xcrref(5) = 0.11604087428436455d+05
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
- if (iotype .eq. 0) then
- xceref(1) = 0.16462008369091265d+03
- xceref(2) = 0.11497107903824313d+02
- xceref(3) = 0.41207446207461508d+02
- xceref(4) = 0.37087651059694167d+02
- xceref(5) = 0.36211053051841265d+03
- else
- xceref(1) = 0.2597156483475d+03
- xceref(2) = 0.1985384289495d+02
- xceref(3) = 0.6517950485788d+02
- xceref(4) = 0.5757235541520d+02
- xceref(5) = 0.5215668188726d+03
- endif
-
-
-c---------------------------------------------------------------------
-c reference data for 408x408x408 grids after 250 time steps,
-c with DT = 0.2d-04
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 408) .and.
- > (grid_points(2) .eq. 408) .and.
- > (grid_points(3) .eq. 408) .and.
- > (no_time_steps . eq. 250) ) then
-
- class = 'D'
- dtref = 0.2d-4
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 0.2533188551738d+05
- xcrref(2) = 0.2346393716980d+04
- xcrref(3) = 0.6294554366904d+04
- xcrref(4) = 0.5352565376030d+04
- xcrref(5) = 0.3905864038618d+05
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
- if (iotype .eq. 0) then
- xceref(1) = 0.3100009377557d+03
- xceref(2) = 0.2424086324913d+02
- xceref(3) = 0.7782212022645d+02
- xceref(4) = 0.6835623860116d+02
- xceref(5) = 0.6065737200368d+03
- else
- xceref(1) = 0.3813781566713d+03
- xceref(2) = 0.3160872966198d+02
- xceref(3) = 0.9593576357290d+02
- xceref(4) = 0.8363391989815d+02
- xceref(5) = 0.7063466087423d+03
- endif
-
-
-c---------------------------------------------------------------------
-c reference data for 1020x1020x1020 grids after 250 time steps,
-c with DT = 0.4d-05
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 1020) .and.
- > (grid_points(2) .eq. 1020) .and.
- > (grid_points(3) .eq. 1020) .and.
- > (no_time_steps . eq. 250) ) then
-
- class = 'E'
- dtref = 0.4d-5
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 0.9795372484517d+05
- xcrref(2) = 0.9739814511521d+04
- xcrref(3) = 0.2467606342965d+05
- xcrref(4) = 0.2092419572860d+05
- xcrref(5) = 0.1392138856939d+06
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
-
- if (iotype .eq. 0) then
- xceref(1) = 0.4327562208414d+03
- xceref(2) = 0.3699051964887d+02
- xceref(3) = 0.1089845040954d+03
- xceref(4) = 0.9462517622043d+02
- xceref(5) = 0.7765512765309d+03
- else
-c wr_interval = 5
- xceref(1) = 0.4729898413058d+03
- xceref(2) = 0.4145899331704d+02
- xceref(3) = 0.1192850917138d+03
- xceref(4) = 0.1032746026932d+03
- xceref(5) = 0.8270322177634d+03
-c wr_interval = 10
-c xceref(1) = 0.4718135916251d+03
-c xceref(2) = 0.4132620259096d+02
-c xceref(3) = 0.1189831133503d+03
-c xceref(4) = 0.1030212798803d+03
-c xceref(5) = 0.8255924078458d+03
- endif
-
- else
- verified = .false.
- endif
-
-c---------------------------------------------------------------------
-c verification test for residuals if gridsize is one of
-c the defined grid sizes above (class .ne. 'U')
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Compute the difference of solution values and the known reference
-c values.
-c---------------------------------------------------------------------
- do m = 1, 5
-
- xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m))
- xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
-
- enddo
-
-c---------------------------------------------------------------------
-c Output the comparison of computed results to known cases.
-c---------------------------------------------------------------------
-
- if (class .ne. 'U') then
- write(*, 1990) class
- 1990 format(' Verification being performed for class ', a)
- write (*,2000) epsilon
- 2000 format(' accuracy setting for epsilon = ', E20.13)
- verified = (dabs(dt-dtref) .le. epsilon)
- if (.not.verified) then
- verified = .false.
- class = 'U'
- write (*,1000) dtref
- 1000 format(' DT does not match the reference value of ',
- > E15.8)
- endif
- else
- write(*, 1995)
- 1995 format(' Unknown class')
- endif
-
-
- if (class .ne. 'U') then
- write (*,2001)
- else
- write (*, 2005)
- endif
-
- 2001 format(' Comparison of RMS-norms of residual')
- 2005 format(' RMS-norms of residual')
- do m = 1, 5
- if (class .eq. 'U') then
- write(*, 2015) m, xcr(m)
- else if (xcrdif(m) .le. epsilon) then
- write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
- else
- verified = .false.
- write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
- endif
- enddo
-
- if (class .ne. 'U') then
- write (*,2002)
- else
- write (*,2006)
- endif
- 2002 format(' Comparison of RMS-norms of solution error')
- 2006 format(' RMS-norms of solution error')
-
- do m = 1, 5
- if (class .eq. 'U') then
- write(*, 2015) m, xce(m)
- else if (xcedif(m) .le. epsilon) then
- write (*,2011) m,xce(m),xceref(m),xcedif(m)
- else
- verified = .false.
- write (*,2010) m,xce(m),xceref(m),xcedif(m)
- endif
- enddo
-
- 2010 format(' FAILURE: ', i2, E20.13, E20.13, E20.13)
- 2011 format(' ', i2, E20.13, E20.13, E20.13)
- 2015 format(' ', i2, E20.13)
-
- if (class .eq. 'U') then
- write(*, 2022)
- write(*, 2023)
- 2022 format(' No reference values provided')
- 2023 format(' No verification performed')
- else if (verified) then
- write(*, 2020)
- 2020 format(' Verification Successful')
- else
- write(*, 2021)
- 2021 format(' Verification failed')
- endif
-
- return
-
-
- end
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-c
-c work_lhs.h
-c
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- double precision fjac(5, 5, -2:MAX_CELL_DIM+1),
- > njac(5, 5, -2:MAX_CELL_DIM+1),
- > lhsa(5, 5, -1:MAX_CELL_DIM),
- > lhsb(5, 5, -1:MAX_CELL_DIM),
- > tmp1, tmp2, tmp3
- common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-c
-c work_lhs_vec.h
-c
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- double precision fjac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1),
- > njac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1),
- > lhsa(5, 5, -1:MAX_CELL_DIM, -1:MAX_CELL_DIM),
- > lhsb(5, 5, -1:MAX_CELL_DIM, -1:MAX_CELL_DIM),
- > tmp1, tmp2, tmp3
- common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c Performs line solves in X direction by first factoring
-c the block-tridiagonal matrix into an upper triangular matrix,
-c and then performing back substitution to solve for the unknow
-c vectors of each line.
-c
-c Make sure we treat elements zero to cell_size in the direction
-c of the sweep.
-c
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
- integer c, istart, stage,
- > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
- > isize,jsize,ksize,send_id
-
- istart = 0
-
-c---------------------------------------------------------------------
-c in our terminology stage is the number of the cell in the x-direction
-c i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
- do stage = 1,ncells
- c = slice(1,stage)
- isize = cell_size(1,c) - 1
- jsize = cell_size(2,c) - 1
- ksize = cell_size(3,c) - 1
-
-c---------------------------------------------------------------------
-c set last-cell flag
-c---------------------------------------------------------------------
- if (stage .eq. ncells) then
- last = 1
- else
- last = 0
- endif
-
- if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
- first = 1
-c call lhsx(c)
- call x_solve_cell(first,last,c)
- else
-c---------------------------------------------------------------------
-c Not the first cell of this line, so receive info from
-c processor working on preceeding cell
-c---------------------------------------------------------------------
- first = 0
- call x_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c overlap computations and communications
-c---------------------------------------------------------------------
-c call lhsx(c)
-c---------------------------------------------------------------------
-c wait for completion
-c---------------------------------------------------------------------
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c install C'(istart) and rhs'(istart) to be used in this cell
-c---------------------------------------------------------------------
- call x_unpack_solve_info(c)
- call x_solve_cell(first,last,c)
- endif
-
- if (last .eq. 0) call x_send_solve_info(send_id,c)
- enddo
-
-c---------------------------------------------------------------------
-c now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
- do stage = ncells, 1, -1
- c = slice(1,stage)
- first = 0
- last = 0
- if (stage .eq. 1) first = 1
- if (stage .eq. ncells) then
- last = 1
-c---------------------------------------------------------------------
-c last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
- call x_backsubstitute(first, last,c)
- else
- call x_receive_backsub_info(recv_id,c)
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
- call x_unpack_backsub_info(c)
- call x_backsubstitute(first,last,c)
- endif
- if (first .eq. 0) call x_send_backsub_info(send_id,c)
- enddo
-
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_unpack_solve_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack C'(-1) and rhs'(-1) for
-c all j and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- integer j,k,m,n,ptr,c,istart
-
- istart = 0
- ptr = 0
- do k=0,KMAX-1
- do j=0,JMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- rhs(n,istart-1,j,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send C'(iend) and rhs'(iend) for
-c all j and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer j,k,m,n,isize,ptr,c,jp,kp
- integer error,send_id,buffer_size
-
- isize = cell_size(1,c)-1
- jp = cell_coord(2,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c pack up buffer
-c---------------------------------------------------------------------
- ptr = 0
- do k=0,KMAX-1
- do j=0,JMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,isize,j,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c send buffer
-c---------------------------------------------------------------------
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, successor(1),
- > WEST+jp+kp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send U(istart) for all j and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer j,k,n,ptr,c,istart,jp,kp
- integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c Send element 0 to previous processor
-c---------------------------------------------------------------------
- istart = 0
- jp = cell_coord(2,c)-1
- kp = cell_coord(3,c)-1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- ptr = 0
- do k=0,KMAX-1
- do j=0,JMAX-1
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,istart,j,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, predecessor(1),
- > EAST+jp+kp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack U(isize) for all j and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- integer j,k,n,ptr,c
-
- ptr = 0
- do k=0,KMAX-1
- do j=0,JMAX-1
- do n=1,BLOCK_SIZE
- backsub_info(n,j,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer error,recv_id,jp,kp,c,buffer_size
- jp = cell_coord(2,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, successor(1),
- > EAST+jp+kp*NCELLS, comm_solve,
- > recv_id, error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer jp,kp,recv_id,error,c,buffer_size
- jp = cell_coord(2,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, predecessor(1),
- > WEST+jp+kp*NCELLS, comm_solve,
- > recv_id, error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c back solve: if last cell, then generate U(isize)=rhs(isize)
-c else assume U(isize) is loaded in un pack backsub_info
-c so just use it
-c after call u(istart) will be sent to next cell
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer first, last, c, i, j, k
- integer m,n,isize,jsize,ksize,istart
-
- istart = 0
- isize = cell_size(1,c)-1
- jsize = cell_size(2,c)-end(2,c)-1
- ksize = cell_size(3,c)-end(3,c)-1
- if (last .eq. 0) then
- do k=start(3,c),ksize
- do j=start(2,c),jsize
-c---------------------------------------------------------------------
-c U(isize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c)
- > - lhsc(m,n,isize,j,k,c)*
- > backsub_info(n,j,k,c)
-c---------------------------------------------------------------------
-c rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c)
-c $ - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c)
-c---------------------------------------------------------------------
- enddo
- enddo
- enddo
- enddo
- endif
- do k=start(3,c),ksize
- do j=start(2,c),jsize
- do i=isize-1,istart,-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
- > - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c)
- enddo
- enddo
- enddo
- enddo
- enddo
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c performs guaussian elimination on this cell.
-c
-c assumes that unpacking routines for non-first cells
-c preload C' and rhs' from previous cell.
-c
-c assumed send happens outside this routine, but that
-c c'(IMAX) and rhs'(IMAX) will be sent to next cell
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'work_lhs.h'
-
- integer first,last,c
- integer i,j,k,isize,ksize,jsize,istart
-
- istart = 0
- isize = cell_size(1,c)-1
- jsize = cell_size(2,c)-end(2,c)-1
- ksize = cell_size(3,c)-end(3,c)-1
-
- call lhsabinit(lhsa, lhsb, isize)
-
- do k=start(3,c),ksize
- do j=start(2,c),jsize
-
-c---------------------------------------------------------------------
-c This function computes the left hand side in the xi-direction
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c determine a (labeled f) and n jacobians for cell c
-c---------------------------------------------------------------------
- do i = start(1,c)-1, cell_size(1,c) - end(1,c)
-
- tmp1 = rho_i(i,j,k,c)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-c---------------------------------------------------------------------
-c
-c---------------------------------------------------------------------
- fjac(1,1,i) = 0.0d+00
- fjac(1,2,i) = 1.0d+00
- fjac(1,3,i) = 0.0d+00
- fjac(1,4,i) = 0.0d+00
- fjac(1,5,i) = 0.0d+00
-
- fjac(2,1,i) = -(u(2,i,j,k,c) * tmp2 *
- > u(2,i,j,k,c))
- > + c2 * qs(i,j,k,c)
- fjac(2,2,i) = ( 2.0d+00 - c2 )
- > * ( u(2,i,j,k,c) * tmp1 )
- fjac(2,3,i) = - c2 * ( u(3,i,j,k,c) * tmp1 )
- fjac(2,4,i) = - c2 * ( u(4,i,j,k,c) * tmp1 )
- fjac(2,5,i) = c2
-
- fjac(3,1,i) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2
- fjac(3,2,i) = u(3,i,j,k,c) * tmp1
- fjac(3,3,i) = u(2,i,j,k,c) * tmp1
- fjac(3,4,i) = 0.0d+00
- fjac(3,5,i) = 0.0d+00
-
- fjac(4,1,i) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2
- fjac(4,2,i) = u(4,i,j,k,c) * tmp1
- fjac(4,3,i) = 0.0d+00
- fjac(4,4,i) = u(2,i,j,k,c) * tmp1
- fjac(4,5,i) = 0.0d+00
-
- fjac(5,1,i) = ( c2 * 2.0d0 * qs(i,j,k,c)
- > - c1 * ( u(5,i,j,k,c) * tmp1 ) )
- > * ( u(2,i,j,k,c) * tmp1 )
- fjac(5,2,i) = c1 * u(5,i,j,k,c) * tmp1
- > - c2
- > * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2
- > + qs(i,j,k,c) )
- fjac(5,3,i) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) )
- > * tmp2
- fjac(5,4,i) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) )
- > * tmp2
- fjac(5,5,i) = c1 * ( u(2,i,j,k,c) * tmp1 )
-
- njac(1,1,i) = 0.0d+00
- njac(1,2,i) = 0.0d+00
- njac(1,3,i) = 0.0d+00
- njac(1,4,i) = 0.0d+00
- njac(1,5,i) = 0.0d+00
-
- njac(2,1,i) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c)
- njac(2,2,i) = con43 * c3c4 * tmp1
- njac(2,3,i) = 0.0d+00
- njac(2,4,i) = 0.0d+00
- njac(2,5,i) = 0.0d+00
-
- njac(3,1,i) = - c3c4 * tmp2 * u(3,i,j,k,c)
- njac(3,2,i) = 0.0d+00
- njac(3,3,i) = c3c4 * tmp1
- njac(3,4,i) = 0.0d+00
- njac(3,5,i) = 0.0d+00
-
- njac(4,1,i) = - c3c4 * tmp2 * u(4,i,j,k,c)
- njac(4,2,i) = 0.0d+00
- njac(4,3,i) = 0.0d+00
- njac(4,4,i) = c3c4 * tmp1
- njac(4,5,i) = 0.0d+00
-
- njac(5,1,i) = - ( con43 * c3c4
- > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
- > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
- > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
- > - c1345 * tmp2 * u(5,i,j,k,c)
-
- njac(5,2,i) = ( con43 * c3c4
- > - c1345 ) * tmp2 * u(2,i,j,k,c)
- njac(5,3,i) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
- njac(5,4,i) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
- njac(5,5,i) = ( c1345 ) * tmp1
-
- enddo
-c---------------------------------------------------------------------
-c now jacobians set, so form left hand side in x direction
-c---------------------------------------------------------------------
- do i = start(1,c), isize - end(1,c)
-
- tmp1 = dt * tx1
- tmp2 = dt * tx2
-
- lhsa(1,1,i) = - tmp2 * fjac(1,1,i-1)
- > - tmp1 * njac(1,1,i-1)
- > - tmp1 * dx1
- lhsa(1,2,i) = - tmp2 * fjac(1,2,i-1)
- > - tmp1 * njac(1,2,i-1)
- lhsa(1,3,i) = - tmp2 * fjac(1,3,i-1)
- > - tmp1 * njac(1,3,i-1)
- lhsa(1,4,i) = - tmp2 * fjac(1,4,i-1)
- > - tmp1 * njac(1,4,i-1)
- lhsa(1,5,i) = - tmp2 * fjac(1,5,i-1)
- > - tmp1 * njac(1,5,i-1)
-
- lhsa(2,1,i) = - tmp2 * fjac(2,1,i-1)
- > - tmp1 * njac(2,1,i-1)
- lhsa(2,2,i) = - tmp2 * fjac(2,2,i-1)
- > - tmp1 * njac(2,2,i-1)
- > - tmp1 * dx2
- lhsa(2,3,i) = - tmp2 * fjac(2,3,i-1)
- > - tmp1 * njac(2,3,i-1)
- lhsa(2,4,i) = - tmp2 * fjac(2,4,i-1)
- > - tmp1 * njac(2,4,i-1)
- lhsa(2,5,i) = - tmp2 * fjac(2,5,i-1)
- > - tmp1 * njac(2,5,i-1)
-
- lhsa(3,1,i) = - tmp2 * fjac(3,1,i-1)
- > - tmp1 * njac(3,1,i-1)
- lhsa(3,2,i) = - tmp2 * fjac(3,2,i-1)
- > - tmp1 * njac(3,2,i-1)
- lhsa(3,3,i) = - tmp2 * fjac(3,3,i-1)
- > - tmp1 * njac(3,3,i-1)
- > - tmp1 * dx3
- lhsa(3,4,i) = - tmp2 * fjac(3,4,i-1)
- > - tmp1 * njac(3,4,i-1)
- lhsa(3,5,i) = - tmp2 * fjac(3,5,i-1)
- > - tmp1 * njac(3,5,i-1)
-
- lhsa(4,1,i) = - tmp2 * fjac(4,1,i-1)
- > - tmp1 * njac(4,1,i-1)
- lhsa(4,2,i) = - tmp2 * fjac(4,2,i-1)
- > - tmp1 * njac(4,2,i-1)
- lhsa(4,3,i) = - tmp2 * fjac(4,3,i-1)
- > - tmp1 * njac(4,3,i-1)
- lhsa(4,4,i) = - tmp2 * fjac(4,4,i-1)
- > - tmp1 * njac(4,4,i-1)
- > - tmp1 * dx4
- lhsa(4,5,i) = - tmp2 * fjac(4,5,i-1)
- > - tmp1 * njac(4,5,i-1)
-
- lhsa(5,1,i) = - tmp2 * fjac(5,1,i-1)
- > - tmp1 * njac(5,1,i-1)
- lhsa(5,2,i) = - tmp2 * fjac(5,2,i-1)
- > - tmp1 * njac(5,2,i-1)
- lhsa(5,3,i) = - tmp2 * fjac(5,3,i-1)
- > - tmp1 * njac(5,3,i-1)
- lhsa(5,4,i) = - tmp2 * fjac(5,4,i-1)
- > - tmp1 * njac(5,4,i-1)
- lhsa(5,5,i) = - tmp2 * fjac(5,5,i-1)
- > - tmp1 * njac(5,5,i-1)
- > - tmp1 * dx5
-
- lhsb(1,1,i) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(1,1,i)
- > + tmp1 * 2.0d+00 * dx1
- lhsb(1,2,i) = tmp1 * 2.0d+00 * njac(1,2,i)
- lhsb(1,3,i) = tmp1 * 2.0d+00 * njac(1,3,i)
- lhsb(1,4,i) = tmp1 * 2.0d+00 * njac(1,4,i)
- lhsb(1,5,i) = tmp1 * 2.0d+00 * njac(1,5,i)
-
- lhsb(2,1,i) = tmp1 * 2.0d+00 * njac(2,1,i)
- lhsb(2,2,i) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(2,2,i)
- > + tmp1 * 2.0d+00 * dx2
- lhsb(2,3,i) = tmp1 * 2.0d+00 * njac(2,3,i)
- lhsb(2,4,i) = tmp1 * 2.0d+00 * njac(2,4,i)
- lhsb(2,5,i) = tmp1 * 2.0d+00 * njac(2,5,i)
-
- lhsb(3,1,i) = tmp1 * 2.0d+00 * njac(3,1,i)
- lhsb(3,2,i) = tmp1 * 2.0d+00 * njac(3,2,i)
- lhsb(3,3,i) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(3,3,i)
- > + tmp1 * 2.0d+00 * dx3
- lhsb(3,4,i) = tmp1 * 2.0d+00 * njac(3,4,i)
- lhsb(3,5,i) = tmp1 * 2.0d+00 * njac(3,5,i)
-
- lhsb(4,1,i) = tmp1 * 2.0d+00 * njac(4,1,i)
- lhsb(4,2,i) = tmp1 * 2.0d+00 * njac(4,2,i)
- lhsb(4,3,i) = tmp1 * 2.0d+00 * njac(4,3,i)
- lhsb(4,4,i) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(4,4,i)
- > + tmp1 * 2.0d+00 * dx4
- lhsb(4,5,i) = tmp1 * 2.0d+00 * njac(4,5,i)
-
- lhsb(5,1,i) = tmp1 * 2.0d+00 * njac(5,1,i)
- lhsb(5,2,i) = tmp1 * 2.0d+00 * njac(5,2,i)
- lhsb(5,3,i) = tmp1 * 2.0d+00 * njac(5,3,i)
- lhsb(5,4,i) = tmp1 * 2.0d+00 * njac(5,4,i)
- lhsb(5,5,i) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(5,5,i)
- > + tmp1 * 2.0d+00 * dx5
-
- lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i+1)
- > - tmp1 * njac(1,1,i+1)
- > - tmp1 * dx1
- lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i+1)
- > - tmp1 * njac(1,2,i+1)
- lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i+1)
- > - tmp1 * njac(1,3,i+1)
- lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i+1)
- > - tmp1 * njac(1,4,i+1)
- lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i+1)
- > - tmp1 * njac(1,5,i+1)
-
- lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i+1)
- > - tmp1 * njac(2,1,i+1)
- lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i+1)
- > - tmp1 * njac(2,2,i+1)
- > - tmp1 * dx2
- lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i+1)
- > - tmp1 * njac(2,3,i+1)
- lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i+1)
- > - tmp1 * njac(2,4,i+1)
- lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i+1)
- > - tmp1 * njac(2,5,i+1)
-
- lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i+1)
- > - tmp1 * njac(3,1,i+1)
- lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i+1)
- > - tmp1 * njac(3,2,i+1)
- lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i+1)
- > - tmp1 * njac(3,3,i+1)
- > - tmp1 * dx3
- lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i+1)
- > - tmp1 * njac(3,4,i+1)
- lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i+1)
- > - tmp1 * njac(3,5,i+1)
-
- lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i+1)
- > - tmp1 * njac(4,1,i+1)
- lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i+1)
- > - tmp1 * njac(4,2,i+1)
- lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i+1)
- > - tmp1 * njac(4,3,i+1)
- lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i+1)
- > - tmp1 * njac(4,4,i+1)
- > - tmp1 * dx4
- lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i+1)
- > - tmp1 * njac(4,5,i+1)
-
- lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i+1)
- > - tmp1 * njac(5,1,i+1)
- lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i+1)
- > - tmp1 * njac(5,2,i+1)
- lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i+1)
- > - tmp1 * njac(5,3,i+1)
- lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i+1)
- > - tmp1 * njac(5,4,i+1)
- lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i+1)
- > - tmp1 * njac(5,5,i+1)
- > - tmp1 * dx5
-
- enddo
-
-
-c---------------------------------------------------------------------
-c outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
- if (first .eq. 1) then
-
-c---------------------------------------------------------------------
-c multiply c(istart,j,k) by b_inverse and copy back to c
-c multiply rhs(istart) by b_inverse(istart) and copy to rhs
-c---------------------------------------------------------------------
- call binvcrhs( lhsb(1,1,istart),
- > lhsc(1,1,istart,j,k,c),
- > rhs(1,istart,j,k,c) )
-
- endif
-
-c---------------------------------------------------------------------
-c begin inner most do loop
-c do all the elements of the cell unless last
-c---------------------------------------------------------------------
- do i=istart+first,isize-last
-
-c---------------------------------------------------------------------
-c rhs(i) = rhs(i) - A*rhs(i-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,i),
- > rhs(1,i-1,j,k,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c B(i) = B(i) - C(i-1)*A(i)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,i),
- > lhsc(1,1,i-1,j,k,c),
- > lhsb(1,1,i))
-
-
-c---------------------------------------------------------------------
-c multiply c(i,j,k) by b_inverse and copy back to c
-c multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs
-c---------------------------------------------------------------------
- call binvcrhs( lhsb(1,1,i),
- > lhsc(1,1,i,j,k,c),
- > rhs(1,i,j,k,c) )
-
- enddo
-
-c---------------------------------------------------------------------
-c Now finish up special cases for last cell
-c---------------------------------------------------------------------
- if (last .eq. 1) then
-
-c---------------------------------------------------------------------
-c rhs(isize) = rhs(isize) - A*rhs(isize-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,isize),
- > rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c))
-
-c---------------------------------------------------------------------
-c B(isize) = B(isize) - C(isize-1)*A(isize)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,isize),
- > lhsc(1,1,isize-1,j,k,c),
- > lhsb(1,1,isize))
-
-c---------------------------------------------------------------------
-c multiply rhs() by b_inverse() and copy to rhs
-c---------------------------------------------------------------------
- call binvrhs( lhsb(1,1,isize),
- > rhs(1,isize,j,k,c) )
-
- endif
- enddo
- enddo
-
-
- return
- end
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c Performs line solves in X direction by first factoring
-c the block-tridiagonal matrix into an upper triangular matrix,
-c and then performing back substitution to solve for the unknow
-c vectors of each line.
-c
-c Make sure we treat elements zero to cell_size in the direction
-c of the sweep.
-c
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
- integer c, istart, stage,
- > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
- > isize,jsize,ksize,send_id
-
- istart = 0
-
-c---------------------------------------------------------------------
-c in our terminology stage is the number of the cell in the x-direct
-c i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
- do stage = 1,ncells
- c = slice(1,stage)
- isize = cell_size(1,c) - 1
- jsize = cell_size(2,c) - 1
- ksize = cell_size(3,c) - 1
-
-c---------------------------------------------------------------------
-c set last-cell flag
-c---------------------------------------------------------------------
- if (stage .eq. ncells) then
- last = 1
- else
- last = 0
- endif
-
- if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
- first = 1
-c call lhsx(c)
- call x_solve_cell(first,last,c)
- else
-c---------------------------------------------------------------------
-c Not the first cell of this line, so receive info from
-c processor working on preceeding cell
-c---------------------------------------------------------------------
- first = 0
- call x_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c overlap computations and communications
-c---------------------------------------------------------------------
-c call lhsx(c)
-c---------------------------------------------------------------------
-c wait for completion
-c---------------------------------------------------------------------
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c install C'(istart) and rhs'(istart) to be used in this cell
-c---------------------------------------------------------------------
- call x_unpack_solve_info(c)
- call x_solve_cell(first,last,c)
- endif
-
- if (last .eq. 0) call x_send_solve_info(send_id,c)
- enddo
-
-c---------------------------------------------------------------------
-c now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
- do stage = ncells, 1, -1
- c = slice(1,stage)
- first = 0
- last = 0
- if (stage .eq. 1) first = 1
- if (stage .eq. ncells) then
- last = 1
-c---------------------------------------------------------------------
-c last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
- call x_backsubstitute(first, last,c)
- else
- call x_receive_backsub_info(recv_id,c)
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
- call x_unpack_backsub_info(c)
- call x_backsubstitute(first,last,c)
- endif
- if (first .eq. 0) call x_send_backsub_info(send_id,c)
- enddo
-
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_unpack_solve_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack C'(-1) and rhs'(-1) for
-c all j and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- integer j,k,m,n,ptr,c,istart
-
- istart = 0
- ptr = 0
- do k=0,KMAX-1
- do j=0,JMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- rhs(n,istart-1,j,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send C'(iend) and rhs'(iend) for
-c all j and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer j,k,m,n,isize,ptr,c,jp,kp
- integer error,send_id,buffer_size
-
- isize = cell_size(1,c)-1
- jp = cell_coord(2,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c pack up buffer
-c---------------------------------------------------------------------
- ptr = 0
- do k=0,KMAX-1
- do j=0,JMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,isize,j,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c send buffer
-c---------------------------------------------------------------------
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, successor(1),
- > WEST+jp+kp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send U(istart) for all j and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer j,k,n,ptr,c,istart,jp,kp
- integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c Send element 0 to previous processor
-c---------------------------------------------------------------------
- istart = 0
- jp = cell_coord(2,c)-1
- kp = cell_coord(3,c)-1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- ptr = 0
- do k=0,KMAX-1
- do j=0,JMAX-1
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,istart,j,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, predecessor(1),
- > EAST+jp+kp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack U(isize) for all j and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- integer j,k,n,ptr,c
-
- ptr = 0
- do k=0,KMAX-1
- do j=0,JMAX-1
- do n=1,BLOCK_SIZE
- backsub_info(n,j,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer error,recv_id,jp,kp,c,buffer_size
- jp = cell_coord(2,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, successor(1),
- > EAST+jp+kp*NCELLS, comm_solve,
- > recv_id, error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer jp,kp,recv_id,error,c,buffer_size
- jp = cell_coord(2,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, predecessor(1),
- > WEST+jp+kp*NCELLS, comm_solve,
- > recv_id, error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c back solve: if last cell, then generate U(isize)=rhs(isize)
-c else assume U(isize) is loaded in un pack backsub_info
-c so just use it
-c after call u(istart) will be sent to next cell
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer first, last, c, i, j, k
- integer m,n,isize,jsize,ksize,istart
-
- istart = 0
- isize = cell_size(1,c)-1
- jsize = cell_size(2,c)-end(2,c)-1
- ksize = cell_size(3,c)-end(3,c)-1
- if (last .eq. 0) then
- do k=start(3,c),ksize
- do j=start(2,c),jsize
-c---------------------------------------------------------------------
-c U(isize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c)
- > - lhsc(m,n,isize,j,k,c)*
- > backsub_info(n,j,k,c)
-c---------------------------------------------------------------------
-c rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c)
-c $ - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c)
-c---------------------------------------------------------------------
- enddo
- enddo
- enddo
- enddo
- endif
- do k=start(3,c),ksize
- do j=start(2,c),jsize
- do i=isize-1,istart,-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
- > - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c)
- enddo
- enddo
- enddo
- enddo
- enddo
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c performs guaussian elimination on this cell.
-c
-c assumes that unpacking routines for non-first cells
-c preload C' and rhs' from previous cell.
-c
-c assumed send happens outside this routine, but that
-c c'(IMAX) and rhs'(IMAX) will be sent to next cell
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'work_lhs_vec.h'
-
- integer first,last,c
- integer i,j,k,m,n,isize,ksize,jsize,istart
-
- istart = 0
- isize = cell_size(1,c)-1
- jsize = cell_size(2,c)-end(2,c)-1
- ksize = cell_size(3,c)-end(3,c)-1
-
-c---------------------------------------------------------------------
-c zero the left hand side for starters
-c set diagonal values to 1. This is overkill, but convenient
-c---------------------------------------------------------------------
- do j = 0, jsize
- do m = 1, 5
- do n = 1, 5
- lhsa(m,n,0,j) = 0.0d0
- lhsb(m,n,0,j) = 0.0d0
- lhsa(m,n,isize,j) = 0.0d0
- lhsb(m,n,isize,j) = 0.0d0
- enddo
- lhsb(m,m,0,j) = 1.0d0
- lhsb(m,m,isize,j) = 1.0d0
- enddo
- enddo
-
- do k=start(3,c),ksize
-
-c---------------------------------------------------------------------
-c This function computes the left hand side in the xi-direction
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c determine a (labeled f) and n jacobians for cell c
-c---------------------------------------------------------------------
- do j=start(2,c),jsize
- do i = start(1,c)-1, cell_size(1,c) - end(1,c)
-
- tmp1 = rho_i(i,j,k,c)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-c---------------------------------------------------------------------
-c
-c---------------------------------------------------------------------
- fjac(1,1,i,j) = 0.0d+00
- fjac(1,2,i,j) = 1.0d+00
- fjac(1,3,i,j) = 0.0d+00
- fjac(1,4,i,j) = 0.0d+00
- fjac(1,5,i,j) = 0.0d+00
-
- fjac(2,1,i,j) = -(u(2,i,j,k,c) * tmp2 *
- > u(2,i,j,k,c))
- > + c2 * qs(i,j,k,c)
- fjac(2,2,i,j) = ( 2.0d+00 - c2 )
- > * ( u(2,i,j,k,c) * tmp1 )
- fjac(2,3,i,j) = - c2 * ( u(3,i,j,k,c) * tmp1 )
- fjac(2,4,i,j) = - c2 * ( u(4,i,j,k,c) * tmp1 )
- fjac(2,5,i,j) = c2
-
- fjac(3,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2
- fjac(3,2,i,j) = u(3,i,j,k,c) * tmp1
- fjac(3,3,i,j) = u(2,i,j,k,c) * tmp1
- fjac(3,4,i,j) = 0.0d+00
- fjac(3,5,i,j) = 0.0d+00
-
- fjac(4,1,i,j) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2
- fjac(4,2,i,j) = u(4,i,j,k,c) * tmp1
- fjac(4,3,i,j) = 0.0d+00
- fjac(4,4,i,j) = u(2,i,j,k,c) * tmp1
- fjac(4,5,i,j) = 0.0d+00
-
- fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c)
- > - c1 * ( u(5,i,j,k,c) * tmp1 ) )
- > * ( u(2,i,j,k,c) * tmp1 )
- fjac(5,2,i,j) = c1 * u(5,i,j,k,c) * tmp1
- > - c2
- > * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2
- > + qs(i,j,k,c) )
- fjac(5,3,i,j) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) )
- > * tmp2
- fjac(5,4,i,j) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) )
- > * tmp2
- fjac(5,5,i,j) = c1 * ( u(2,i,j,k,c) * tmp1 )
-
- njac(1,1,i,j) = 0.0d+00
- njac(1,2,i,j) = 0.0d+00
- njac(1,3,i,j) = 0.0d+00
- njac(1,4,i,j) = 0.0d+00
- njac(1,5,i,j) = 0.0d+00
-
- njac(2,1,i,j) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c)
- njac(2,2,i,j) = con43 * c3c4 * tmp1
- njac(2,3,i,j) = 0.0d+00
- njac(2,4,i,j) = 0.0d+00
- njac(2,5,i,j) = 0.0d+00
-
- njac(3,1,i,j) = - c3c4 * tmp2 * u(3,i,j,k,c)
- njac(3,2,i,j) = 0.0d+00
- njac(3,3,i,j) = c3c4 * tmp1
- njac(3,4,i,j) = 0.0d+00
- njac(3,5,i,j) = 0.0d+00
-
- njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c)
- njac(4,2,i,j) = 0.0d+00
- njac(4,3,i,j) = 0.0d+00
- njac(4,4,i,j) = c3c4 * tmp1
- njac(4,5,i,j) = 0.0d+00
-
- njac(5,1,i,j) = - ( con43 * c3c4
- > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
- > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
- > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
- > - c1345 * tmp2 * u(5,i,j,k,c)
-
- njac(5,2,i,j) = ( con43 * c3c4
- > - c1345 ) * tmp2 * u(2,i,j,k,c)
- njac(5,3,i,j) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
- njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
- njac(5,5,i,j) = ( c1345 ) * tmp1
-
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c now jacobians set, so form left hand side in x direction
-c---------------------------------------------------------------------
- do j=start(2,c),jsize
- do i = start(1,c), isize - end(1,c)
-
- tmp1 = dt * tx1
- tmp2 = dt * tx2
-
- lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i-1,j)
- > - tmp1 * njac(1,1,i-1,j)
- > - tmp1 * dx1
- lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i-1,j)
- > - tmp1 * njac(1,2,i-1,j)
- lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i-1,j)
- > - tmp1 * njac(1,3,i-1,j)
- lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i-1,j)
- > - tmp1 * njac(1,4,i-1,j)
- lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i-1,j)
- > - tmp1 * njac(1,5,i-1,j)
-
- lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i-1,j)
- > - tmp1 * njac(2,1,i-1,j)
- lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i-1,j)
- > - tmp1 * njac(2,2,i-1,j)
- > - tmp1 * dx2
- lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i-1,j)
- > - tmp1 * njac(2,3,i-1,j)
- lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i-1,j)
- > - tmp1 * njac(2,4,i-1,j)
- lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i-1,j)
- > - tmp1 * njac(2,5,i-1,j)
-
- lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i-1,j)
- > - tmp1 * njac(3,1,i-1,j)
- lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i-1,j)
- > - tmp1 * njac(3,2,i-1,j)
- lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i-1,j)
- > - tmp1 * njac(3,3,i-1,j)
- > - tmp1 * dx3
- lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i-1,j)
- > - tmp1 * njac(3,4,i-1,j)
- lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i-1,j)
- > - tmp1 * njac(3,5,i-1,j)
-
- lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i-1,j)
- > - tmp1 * njac(4,1,i-1,j)
- lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i-1,j)
- > - tmp1 * njac(4,2,i-1,j)
- lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i-1,j)
- > - tmp1 * njac(4,3,i-1,j)
- lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i-1,j)
- > - tmp1 * njac(4,4,i-1,j)
- > - tmp1 * dx4
- lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i-1,j)
- > - tmp1 * njac(4,5,i-1,j)
-
- lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i-1,j)
- > - tmp1 * njac(5,1,i-1,j)
- lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i-1,j)
- > - tmp1 * njac(5,2,i-1,j)
- lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i-1,j)
- > - tmp1 * njac(5,3,i-1,j)
- lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i-1,j)
- > - tmp1 * njac(5,4,i-1,j)
- lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i-1,j)
- > - tmp1 * njac(5,5,i-1,j)
- > - tmp1 * dx5
-
- lhsb(1,1,i,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(1,1,i,j)
- > + tmp1 * 2.0d+00 * dx1
- lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j)
- lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j)
- lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j)
- lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j)
-
- lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j)
- lhsb(2,2,i,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(2,2,i,j)
- > + tmp1 * 2.0d+00 * dx2
- lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j)
- lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j)
- lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j)
-
- lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j)
- lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j)
- lhsb(3,3,i,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(3,3,i,j)
- > + tmp1 * 2.0d+00 * dx3
- lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j)
- lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j)
-
- lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j)
- lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j)
- lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j)
- lhsb(4,4,i,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(4,4,i,j)
- > + tmp1 * 2.0d+00 * dx4
- lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j)
-
- lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j)
- lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j)
- lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j)
- lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j)
- lhsb(5,5,i,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(5,5,i,j)
- > + tmp1 * 2.0d+00 * dx5
-
- lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i+1,j)
- > - tmp1 * njac(1,1,i+1,j)
- > - tmp1 * dx1
- lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i+1,j)
- > - tmp1 * njac(1,2,i+1,j)
- lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i+1,j)
- > - tmp1 * njac(1,3,i+1,j)
- lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i+1,j)
- > - tmp1 * njac(1,4,i+1,j)
- lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i+1,j)
- > - tmp1 * njac(1,5,i+1,j)
-
- lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i+1,j)
- > - tmp1 * njac(2,1,i+1,j)
- lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i+1,j)
- > - tmp1 * njac(2,2,i+1,j)
- > - tmp1 * dx2
- lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i+1,j)
- > - tmp1 * njac(2,3,i+1,j)
- lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i+1,j)
- > - tmp1 * njac(2,4,i+1,j)
- lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i+1,j)
- > - tmp1 * njac(2,5,i+1,j)
-
- lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i+1,j)
- > - tmp1 * njac(3,1,i+1,j)
- lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i+1,j)
- > - tmp1 * njac(3,2,i+1,j)
- lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i+1,j)
- > - tmp1 * njac(3,3,i+1,j)
- > - tmp1 * dx3
- lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i+1,j)
- > - tmp1 * njac(3,4,i+1,j)
- lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i+1,j)
- > - tmp1 * njac(3,5,i+1,j)
-
- lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i+1,j)
- > - tmp1 * njac(4,1,i+1,j)
- lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i+1,j)
- > - tmp1 * njac(4,2,i+1,j)
- lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i+1,j)
- > - tmp1 * njac(4,3,i+1,j)
- lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i+1,j)
- > - tmp1 * njac(4,4,i+1,j)
- > - tmp1 * dx4
- lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i+1,j)
- > - tmp1 * njac(4,5,i+1,j)
-
- lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i+1,j)
- > - tmp1 * njac(5,1,i+1,j)
- lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i+1,j)
- > - tmp1 * njac(5,2,i+1,j)
- lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i+1,j)
- > - tmp1 * njac(5,3,i+1,j)
- lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i+1,j)
- > - tmp1 * njac(5,4,i+1,j)
- lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i+1,j)
- > - tmp1 * njac(5,5,i+1,j)
- > - tmp1 * dx5
-
- enddo
- enddo
-
-
-c---------------------------------------------------------------------
-c outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
- if (first .eq. 1) then
-
-c---------------------------------------------------------------------
-c multiply c(istart,j,k) by b_inverse and copy back to c
-c multiply rhs(istart) by b_inverse(istart) and copy to rhs
-c---------------------------------------------------------------------
-!dir$ ivdep
- do j=start(2,c),jsize
- call binvcrhs( lhsb(1,1,istart,j),
- > lhsc(1,1,istart,j,k,c),
- > rhs(1,istart,j,k,c) )
- enddo
-
- endif
-
-c---------------------------------------------------------------------
-c begin inner most do loop
-c do all the elements of the cell unless last
-c---------------------------------------------------------------------
-!dir$ ivdep
-!dir$ interchange(i,j)
- do j=start(2,c),jsize
- do i=istart+first,isize-last
-
-c---------------------------------------------------------------------
-c rhs(i) = rhs(i) - A*rhs(i-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,i,j),
- > rhs(1,i-1,j,k,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c B(i) = B(i) - C(i-1)*A(i)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,i,j),
- > lhsc(1,1,i-1,j,k,c),
- > lhsb(1,1,i,j))
-
-
-c---------------------------------------------------------------------
-c multiply c(i,j,k) by b_inverse and copy back to c
-c multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs
-c---------------------------------------------------------------------
- call binvcrhs( lhsb(1,1,i,j),
- > lhsc(1,1,i,j,k,c),
- > rhs(1,i,j,k,c) )
-
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c Now finish up special cases for last cell
-c---------------------------------------------------------------------
- if (last .eq. 1) then
-
-!dir$ ivdep
- do j=start(2,c),jsize
-c---------------------------------------------------------------------
-c rhs(isize) = rhs(isize) - A*rhs(isize-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,isize,j),
- > rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c))
-
-c---------------------------------------------------------------------
-c B(isize) = B(isize) - C(isize-1)*A(isize)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,isize,j),
- > lhsc(1,1,isize-1,j,k,c),
- > lhsb(1,1,isize,j))
-
-c---------------------------------------------------------------------
-c multiply rhs() by b_inverse() and copy to rhs
-c---------------------------------------------------------------------
- call binvrhs( lhsb(1,1,isize,j),
- > rhs(1,isize,j,k,c) )
- enddo
-
- endif
- enddo
-
-
- return
- end
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Performs line solves in Y direction by first factoring
-c the block-tridiagonal matrix into an upper triangular matrix,
-c and then performing back substitution to solve for the unknow
-c vectors of each line.
-c
-c Make sure we treat elements zero to cell_size in the direction
-c of the sweep.
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer
- > c, jstart, stage,
- > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
- > isize,jsize,ksize,send_id
-
- jstart = 0
-
-c---------------------------------------------------------------------
-c in our terminology stage is the number of the cell in the y-direction
-c i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
- do stage = 1,ncells
- c = slice(2,stage)
- isize = cell_size(1,c) - 1
- jsize = cell_size(2,c) - 1
- ksize = cell_size(3,c) - 1
-
-c---------------------------------------------------------------------
-c set last-cell flag
-c---------------------------------------------------------------------
- if (stage .eq. ncells) then
- last = 1
- else
- last = 0
- endif
-
- if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
- first = 1
-c call lhsy(c)
- call y_solve_cell(first,last,c)
- else
-c---------------------------------------------------------------------
-c Not the first cell of this line, so receive info from
-c processor working on preceeding cell
-c---------------------------------------------------------------------
- first = 0
- call y_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c overlap computations and communications
-c---------------------------------------------------------------------
-c call lhsy(c)
-c---------------------------------------------------------------------
-c wait for completion
-c---------------------------------------------------------------------
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c install C'(jstart+1) and rhs'(jstart+1) to be used in this cell
-c---------------------------------------------------------------------
- call y_unpack_solve_info(c)
- call y_solve_cell(first,last,c)
- endif
-
- if (last .eq. 0) call y_send_solve_info(send_id,c)
- enddo
-
-c---------------------------------------------------------------------
-c now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
- do stage = ncells, 1, -1
- c = slice(2,stage)
- first = 0
- last = 0
- if (stage .eq. 1) first = 1
- if (stage .eq. ncells) then
- last = 1
-c---------------------------------------------------------------------
-c last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
- call y_backsubstitute(first, last,c)
- else
- call y_receive_backsub_info(recv_id,c)
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
- call y_unpack_backsub_info(c)
- call y_backsubstitute(first,last,c)
- endif
- if (first .eq. 0) call y_send_backsub_info(send_id,c)
- enddo
-
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_unpack_solve_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack C'(-1) and rhs'(-1) for
-c all i and k
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer i,k,m,n,ptr,c,jstart
-
- jstart = 0
- ptr = 0
- do k=0,KMAX-1
- do i=0,IMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send C'(jend) and rhs'(jend) for
-c all i and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i,k,m,n,jsize,ptr,c,ip,kp
- integer error,send_id,buffer_size
-
- jsize = cell_size(2,c)-1
- ip = cell_coord(1,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c pack up buffer
-c---------------------------------------------------------------------
- ptr = 0
- do k=0,KMAX-1
- do i=0,IMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,i,jsize,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c send buffer
-c---------------------------------------------------------------------
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, successor(2),
- > SOUTH+ip+kp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send U(jstart) for all i and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i,k,n,ptr,c,jstart,ip,kp
- integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c Send element 0 to previous processor
-c---------------------------------------------------------------------
- jstart = 0
- ip = cell_coord(1,c)-1
- kp = cell_coord(3,c)-1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- ptr = 0
- do k=0,KMAX-1
- do i=0,IMAX-1
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,i,jstart,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, predecessor(2),
- > NORTH+ip+kp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack U(jsize) for all i and k
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer i,k,n,ptr,c
-
- ptr = 0
- do k=0,KMAX-1
- do i=0,IMAX-1
- do n=1,BLOCK_SIZE
- backsub_info(n,i,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer error,recv_id,ip,kp,c,buffer_size
- ip = cell_coord(1,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, successor(2),
- > NORTH+ip+kp*NCELLS, comm_solve,
- > recv_id, error)
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer ip,kp,recv_id,error,c,buffer_size
- ip = cell_coord(1,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, predecessor(2),
- > SOUTH+ip+kp*NCELLS, comm_solve,
- > recv_id, error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c back solve: if last cell, then generate U(jsize)=rhs(jsize)
-c else assume U(jsize) is loaded in un pack backsub_info
-c so just use it
-c after call u(jstart) will be sent to next cell
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer first, last, c, i, k
- integer m,n,j,jsize,isize,ksize,jstart
-
- jstart = 0
- isize = cell_size(1,c)-end(1,c)-1
- jsize = cell_size(2,c)-1
- ksize = cell_size(3,c)-end(3,c)-1
- if (last .eq. 0) then
- do k=start(3,c),ksize
- do i=start(1,c),isize
-c---------------------------------------------------------------------
-c U(jsize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c)
- > - lhsc(m,n,i,jsize,k,c)*
- > backsub_info(n,i,k,c)
- enddo
- enddo
- enddo
- enddo
- endif
- do k=start(3,c),ksize
- do j=jsize-1,jstart,-1
- do i=start(1,c),isize
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
- > - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c)
- enddo
- enddo
- enddo
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c performs guaussian elimination on this cell.
-c
-c assumes that unpacking routines for non-first cells
-c preload C' and rhs' from previous cell.
-c
-c assumed send happens outside this routine, but that
-c c'(JMAX) and rhs'(JMAX) will be sent to next cell
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'work_lhs.h'
-
- integer first,last,c
- integer i,j,k,isize,ksize,jsize,jstart
- double precision utmp(6,-2:JMAX+1)
-
- jstart = 0
- isize = cell_size(1,c)-end(1,c)-1
- jsize = cell_size(2,c)-1
- ksize = cell_size(3,c)-end(3,c)-1
-
- call lhsabinit(lhsa, lhsb, jsize)
-
- do k=start(3,c),ksize
- do i=start(1,c),isize
-
-c---------------------------------------------------------------------
-c This function computes the left hand side for the three y-factors
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Compute the indices for storing the tri-diagonal matrix;
-c determine a (labeled f) and n jacobians for cell c
-c---------------------------------------------------------------------
- do j = start(2,c)-1, cell_size(2,c)-end(2,c)
- utmp(1,j) = 1.0d0 / u(1,i,j,k,c)
- utmp(2,j) = u(2,i,j,k,c)
- utmp(3,j) = u(3,i,j,k,c)
- utmp(4,j) = u(4,i,j,k,c)
- utmp(5,j) = u(5,i,j,k,c)
- utmp(6,j) = qs(i,j,k,c)
- end do
-
- do j = start(2,c)-1, cell_size(2,c)-end(2,c)
-
- tmp1 = utmp(1,j)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- fjac(1,1,j) = 0.0d+00
- fjac(1,2,j) = 0.0d+00
- fjac(1,3,j) = 1.0d+00
- fjac(1,4,j) = 0.0d+00
- fjac(1,5,j) = 0.0d+00
-
- fjac(2,1,j) = - ( utmp(2,j)*utmp(3,j) )
- > * tmp2
- fjac(2,2,j) = utmp(3,j) * tmp1
- fjac(2,3,j) = utmp(2,j) * tmp1
- fjac(2,4,j) = 0.0d+00
- fjac(2,5,j) = 0.0d+00
-
- fjac(3,1,j) = - ( utmp(3,j)*utmp(3,j)*tmp2)
- > + c2 * utmp(6,j)
- fjac(3,2,j) = - c2 * utmp(2,j) * tmp1
- fjac(3,3,j) = ( 2.0d+00 - c2 )
- > * utmp(3,j) * tmp1
- fjac(3,4,j) = - c2 * utmp(4,j) * tmp1
- fjac(3,5,j) = c2
-
- fjac(4,1,j) = - ( utmp(3,j)*utmp(4,j) )
- > * tmp2
- fjac(4,2,j) = 0.0d+00
- fjac(4,3,j) = utmp(4,j) * tmp1
- fjac(4,4,j) = utmp(3,j) * tmp1
- fjac(4,5,j) = 0.0d+00
-
- fjac(5,1,j) = ( c2 * 2.0d0 * utmp(6,j)
- > - c1 * utmp(5,j) * tmp1 )
- > * utmp(3,j) * tmp1
- fjac(5,2,j) = - c2 * utmp(2,j)*utmp(3,j)
- > * tmp2
- fjac(5,3,j) = c1 * utmp(5,j) * tmp1
- > - c2 * ( utmp(6,j)
- > + utmp(3,j)*utmp(3,j) * tmp2 )
- fjac(5,4,j) = - c2 * ( utmp(3,j)*utmp(4,j) )
- > * tmp2
- fjac(5,5,j) = c1 * utmp(3,j) * tmp1
-
- njac(1,1,j) = 0.0d+00
- njac(1,2,j) = 0.0d+00
- njac(1,3,j) = 0.0d+00
- njac(1,4,j) = 0.0d+00
- njac(1,5,j) = 0.0d+00
-
- njac(2,1,j) = - c3c4 * tmp2 * utmp(2,j)
- njac(2,2,j) = c3c4 * tmp1
- njac(2,3,j) = 0.0d+00
- njac(2,4,j) = 0.0d+00
- njac(2,5,j) = 0.0d+00
-
- njac(3,1,j) = - con43 * c3c4 * tmp2 * utmp(3,j)
- njac(3,2,j) = 0.0d+00
- njac(3,3,j) = con43 * c3c4 * tmp1
- njac(3,4,j) = 0.0d+00
- njac(3,5,j) = 0.0d+00
-
- njac(4,1,j) = - c3c4 * tmp2 * utmp(4,j)
- njac(4,2,j) = 0.0d+00
- njac(4,3,j) = 0.0d+00
- njac(4,4,j) = c3c4 * tmp1
- njac(4,5,j) = 0.0d+00
-
- njac(5,1,j) = - ( c3c4
- > - c1345 ) * tmp3 * (utmp(2,j)**2)
- > - ( con43 * c3c4
- > - c1345 ) * tmp3 * (utmp(3,j)**2)
- > - ( c3c4 - c1345 ) * tmp3 * (utmp(4,j)**2)
- > - c1345 * tmp2 * utmp(5,j)
-
- njac(5,2,j) = ( c3c4 - c1345 ) * tmp2 * utmp(2,j)
- njac(5,3,j) = ( con43 * c3c4
- > - c1345 ) * tmp2 * utmp(3,j)
- njac(5,4,j) = ( c3c4 - c1345 ) * tmp2 * utmp(4,j)
- njac(5,5,j) = ( c1345 ) * tmp1
-
- enddo
-
-c---------------------------------------------------------------------
-c now joacobians set, so form left hand side in y direction
-c---------------------------------------------------------------------
- do j = start(2,c), jsize-end(2,c)
-
- tmp1 = dt * ty1
- tmp2 = dt * ty2
-
- lhsa(1,1,j) = - tmp2 * fjac(1,1,j-1)
- > - tmp1 * njac(1,1,j-1)
- > - tmp1 * dy1
- lhsa(1,2,j) = - tmp2 * fjac(1,2,j-1)
- > - tmp1 * njac(1,2,j-1)
- lhsa(1,3,j) = - tmp2 * fjac(1,3,j-1)
- > - tmp1 * njac(1,3,j-1)
- lhsa(1,4,j) = - tmp2 * fjac(1,4,j-1)
- > - tmp1 * njac(1,4,j-1)
- lhsa(1,5,j) = - tmp2 * fjac(1,5,j-1)
- > - tmp1 * njac(1,5,j-1)
-
- lhsa(2,1,j) = - tmp2 * fjac(2,1,j-1)
- > - tmp1 * njac(2,1,j-1)
- lhsa(2,2,j) = - tmp2 * fjac(2,2,j-1)
- > - tmp1 * njac(2,2,j-1)
- > - tmp1 * dy2
- lhsa(2,3,j) = - tmp2 * fjac(2,3,j-1)
- > - tmp1 * njac(2,3,j-1)
- lhsa(2,4,j) = - tmp2 * fjac(2,4,j-1)
- > - tmp1 * njac(2,4,j-1)
- lhsa(2,5,j) = - tmp2 * fjac(2,5,j-1)
- > - tmp1 * njac(2,5,j-1)
-
- lhsa(3,1,j) = - tmp2 * fjac(3,1,j-1)
- > - tmp1 * njac(3,1,j-1)
- lhsa(3,2,j) = - tmp2 * fjac(3,2,j-1)
- > - tmp1 * njac(3,2,j-1)
- lhsa(3,3,j) = - tmp2 * fjac(3,3,j-1)
- > - tmp1 * njac(3,3,j-1)
- > - tmp1 * dy3
- lhsa(3,4,j) = - tmp2 * fjac(3,4,j-1)
- > - tmp1 * njac(3,4,j-1)
- lhsa(3,5,j) = - tmp2 * fjac(3,5,j-1)
- > - tmp1 * njac(3,5,j-1)
-
- lhsa(4,1,j) = - tmp2 * fjac(4,1,j-1)
- > - tmp1 * njac(4,1,j-1)
- lhsa(4,2,j) = - tmp2 * fjac(4,2,j-1)
- > - tmp1 * njac(4,2,j-1)
- lhsa(4,3,j) = - tmp2 * fjac(4,3,j-1)
- > - tmp1 * njac(4,3,j-1)
- lhsa(4,4,j) = - tmp2 * fjac(4,4,j-1)
- > - tmp1 * njac(4,4,j-1)
- > - tmp1 * dy4
- lhsa(4,5,j) = - tmp2 * fjac(4,5,j-1)
- > - tmp1 * njac(4,5,j-1)
-
- lhsa(5,1,j) = - tmp2 * fjac(5,1,j-1)
- > - tmp1 * njac(5,1,j-1)
- lhsa(5,2,j) = - tmp2 * fjac(5,2,j-1)
- > - tmp1 * njac(5,2,j-1)
- lhsa(5,3,j) = - tmp2 * fjac(5,3,j-1)
- > - tmp1 * njac(5,3,j-1)
- lhsa(5,4,j) = - tmp2 * fjac(5,4,j-1)
- > - tmp1 * njac(5,4,j-1)
- lhsa(5,5,j) = - tmp2 * fjac(5,5,j-1)
- > - tmp1 * njac(5,5,j-1)
- > - tmp1 * dy5
-
- lhsb(1,1,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(1,1,j)
- > + tmp1 * 2.0d+00 * dy1
- lhsb(1,2,j) = tmp1 * 2.0d+00 * njac(1,2,j)
- lhsb(1,3,j) = tmp1 * 2.0d+00 * njac(1,3,j)
- lhsb(1,4,j) = tmp1 * 2.0d+00 * njac(1,4,j)
- lhsb(1,5,j) = tmp1 * 2.0d+00 * njac(1,5,j)
-
- lhsb(2,1,j) = tmp1 * 2.0d+00 * njac(2,1,j)
- lhsb(2,2,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(2,2,j)
- > + tmp1 * 2.0d+00 * dy2
- lhsb(2,3,j) = tmp1 * 2.0d+00 * njac(2,3,j)
- lhsb(2,4,j) = tmp1 * 2.0d+00 * njac(2,4,j)
- lhsb(2,5,j) = tmp1 * 2.0d+00 * njac(2,5,j)
-
- lhsb(3,1,j) = tmp1 * 2.0d+00 * njac(3,1,j)
- lhsb(3,2,j) = tmp1 * 2.0d+00 * njac(3,2,j)
- lhsb(3,3,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(3,3,j)
- > + tmp1 * 2.0d+00 * dy3
- lhsb(3,4,j) = tmp1 * 2.0d+00 * njac(3,4,j)
- lhsb(3,5,j) = tmp1 * 2.0d+00 * njac(3,5,j)
-
- lhsb(4,1,j) = tmp1 * 2.0d+00 * njac(4,1,j)
- lhsb(4,2,j) = tmp1 * 2.0d+00 * njac(4,2,j)
- lhsb(4,3,j) = tmp1 * 2.0d+00 * njac(4,3,j)
- lhsb(4,4,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(4,4,j)
- > + tmp1 * 2.0d+00 * dy4
- lhsb(4,5,j) = tmp1 * 2.0d+00 * njac(4,5,j)
-
- lhsb(5,1,j) = tmp1 * 2.0d+00 * njac(5,1,j)
- lhsb(5,2,j) = tmp1 * 2.0d+00 * njac(5,2,j)
- lhsb(5,3,j) = tmp1 * 2.0d+00 * njac(5,3,j)
- lhsb(5,4,j) = tmp1 * 2.0d+00 * njac(5,4,j)
- lhsb(5,5,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(5,5,j)
- > + tmp1 * 2.0d+00 * dy5
-
- lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,j+1)
- > - tmp1 * njac(1,1,j+1)
- > - tmp1 * dy1
- lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,j+1)
- > - tmp1 * njac(1,2,j+1)
- lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,j+1)
- > - tmp1 * njac(1,3,j+1)
- lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,j+1)
- > - tmp1 * njac(1,4,j+1)
- lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,j+1)
- > - tmp1 * njac(1,5,j+1)
-
- lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,j+1)
- > - tmp1 * njac(2,1,j+1)
- lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,j+1)
- > - tmp1 * njac(2,2,j+1)
- > - tmp1 * dy2
- lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,j+1)
- > - tmp1 * njac(2,3,j+1)
- lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,j+1)
- > - tmp1 * njac(2,4,j+1)
- lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,j+1)
- > - tmp1 * njac(2,5,j+1)
-
- lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,j+1)
- > - tmp1 * njac(3,1,j+1)
- lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,j+1)
- > - tmp1 * njac(3,2,j+1)
- lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,j+1)
- > - tmp1 * njac(3,3,j+1)
- > - tmp1 * dy3
- lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,j+1)
- > - tmp1 * njac(3,4,j+1)
- lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,j+1)
- > - tmp1 * njac(3,5,j+1)
-
- lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,j+1)
- > - tmp1 * njac(4,1,j+1)
- lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,j+1)
- > - tmp1 * njac(4,2,j+1)
- lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,j+1)
- > - tmp1 * njac(4,3,j+1)
- lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,j+1)
- > - tmp1 * njac(4,4,j+1)
- > - tmp1 * dy4
- lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,j+1)
- > - tmp1 * njac(4,5,j+1)
-
- lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,j+1)
- > - tmp1 * njac(5,1,j+1)
- lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,j+1)
- > - tmp1 * njac(5,2,j+1)
- lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,j+1)
- > - tmp1 * njac(5,3,j+1)
- lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,j+1)
- > - tmp1 * njac(5,4,j+1)
- lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,j+1)
- > - tmp1 * njac(5,5,j+1)
- > - tmp1 * dy5
-
- enddo
-
-
-c---------------------------------------------------------------------
-c outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
- if (first .eq. 1) then
-
-c---------------------------------------------------------------------
-c multiply c(i,jstart,k) by b_inverse and copy back to c
-c multiply rhs(jstart) by b_inverse(jstart) and copy to rhs
-c---------------------------------------------------------------------
- call binvcrhs( lhsb(1,1,jstart),
- > lhsc(1,1,i,jstart,k,c),
- > rhs(1,i,jstart,k,c) )
-
- endif
-
-c---------------------------------------------------------------------
-c begin inner most do loop
-c do all the elements of the cell unless last
-c---------------------------------------------------------------------
- do j=jstart+first,jsize-last
-
-c---------------------------------------------------------------------
-c subtract A*lhs_vector(j-1) from lhs_vector(j)
-c
-c rhs(j) = rhs(j) - A*rhs(j-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,j),
- > rhs(1,i,j-1,k,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c B(j) = B(j) - C(j-1)*A(j)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,j),
- > lhsc(1,1,i,j-1,k,c),
- > lhsb(1,1,j))
-
-c---------------------------------------------------------------------
-c multiply c(i,j,k) by b_inverse and copy back to c
-c multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs
-c---------------------------------------------------------------------
- call binvcrhs( lhsb(1,1,j),
- > lhsc(1,1,i,j,k,c),
- > rhs(1,i,j,k,c) )
-
- enddo
-
-c---------------------------------------------------------------------
-c Now finish up special cases for last cell
-c---------------------------------------------------------------------
- if (last .eq. 1) then
-
-c---------------------------------------------------------------------
-c rhs(jsize) = rhs(jsize) - A*rhs(jsize-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,jsize),
- > rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c))
-
-c---------------------------------------------------------------------
-c B(jsize) = B(jsize) - C(jsize-1)*A(jsize)
-c call matmul_sub(aa,i,jsize,k,c,
-c $ cc,i,jsize-1,k,c,bb,i,jsize,k,c)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,jsize),
- > lhsc(1,1,i,jsize-1,k,c),
- > lhsb(1,1,jsize))
-
-c---------------------------------------------------------------------
-c multiply rhs(jsize) by b_inverse(jsize) and copy to rhs
-c---------------------------------------------------------------------
- call binvrhs( lhsb(1,1,jsize),
- > rhs(1,i,jsize,k,c) )
-
- endif
- enddo
- enddo
-
-
- return
- end
-
-
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Performs line solves in Y direction by first factoring
-c the block-tridiagonal matrix into an upper triangular matrix,
-c and then performing back substitution to solve for the unknow
-c vectors of each line.
-c
-c Make sure we treat elements zero to cell_size in the direction
-c of the sweep.
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer
- > c, jstart, stage,
- > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
- > isize,jsize,ksize,send_id
-
- jstart = 0
-
-c---------------------------------------------------------------------
-c in our terminology stage is the number of the cell in the y-direct
-c i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
- do stage = 1,ncells
- c = slice(2,stage)
- isize = cell_size(1,c) - 1
- jsize = cell_size(2,c) - 1
- ksize = cell_size(3,c) - 1
-
-c---------------------------------------------------------------------
-c set last-cell flag
-c---------------------------------------------------------------------
- if (stage .eq. ncells) then
- last = 1
- else
- last = 0
- endif
-
- if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
- first = 1
-c call lhsy(c)
- call y_solve_cell(first,last,c)
- else
-c---------------------------------------------------------------------
-c Not the first cell of this line, so receive info from
-c processor working on preceeding cell
-c---------------------------------------------------------------------
- first = 0
- call y_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c overlap computations and communications
-c---------------------------------------------------------------------
-c call lhsy(c)
-c---------------------------------------------------------------------
-c wait for completion
-c---------------------------------------------------------------------
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c install C'(jstart+1) and rhs'(jstart+1) to be used in this cell
-c---------------------------------------------------------------------
- call y_unpack_solve_info(c)
- call y_solve_cell(first,last,c)
- endif
-
- if (last .eq. 0) call y_send_solve_info(send_id,c)
- enddo
-
-c---------------------------------------------------------------------
-c now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
- do stage = ncells, 1, -1
- c = slice(2,stage)
- first = 0
- last = 0
- if (stage .eq. 1) first = 1
- if (stage .eq. ncells) then
- last = 1
-c---------------------------------------------------------------------
-c last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
- call y_backsubstitute(first, last,c)
- else
- call y_receive_backsub_info(recv_id,c)
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
- call y_unpack_backsub_info(c)
- call y_backsubstitute(first,last,c)
- endif
- if (first .eq. 0) call y_send_backsub_info(send_id,c)
- enddo
-
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_unpack_solve_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack C'(-1) and rhs'(-1) for
-c all i and k
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer i,k,m,n,ptr,c,jstart
-
- jstart = 0
- ptr = 0
- do k=0,KMAX-1
- do i=0,IMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send C'(jend) and rhs'(jend) for
-c all i and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i,k,m,n,jsize,ptr,c,ip,kp
- integer error,send_id,buffer_size
-
- jsize = cell_size(2,c)-1
- ip = cell_coord(1,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c pack up buffer
-c---------------------------------------------------------------------
- ptr = 0
- do k=0,KMAX-1
- do i=0,IMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,i,jsize,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c send buffer
-c---------------------------------------------------------------------
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, successor(2),
- > SOUTH+ip+kp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send U(jstart) for all i and k
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i,k,n,ptr,c,jstart,ip,kp
- integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c Send element 0 to previous processor
-c---------------------------------------------------------------------
- jstart = 0
- ip = cell_coord(1,c)-1
- kp = cell_coord(3,c)-1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- ptr = 0
- do k=0,KMAX-1
- do i=0,IMAX-1
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,i,jstart,k,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, predecessor(2),
- > NORTH+ip+kp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack U(jsize) for all i and k
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer i,k,n,ptr,c
-
- ptr = 0
- do k=0,KMAX-1
- do i=0,IMAX-1
- do n=1,BLOCK_SIZE
- backsub_info(n,i,k,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer error,recv_id,ip,kp,c,buffer_size
- ip = cell_coord(1,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, successor(2),
- > NORTH+ip+kp*NCELLS, comm_solve,
- > recv_id, error)
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer ip,kp,recv_id,error,c,buffer_size
- ip = cell_coord(1,c) - 1
- kp = cell_coord(3,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, predecessor(2),
- > SOUTH+ip+kp*NCELLS, comm_solve,
- > recv_id, error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c back solve: if last cell, then generate U(jsize)=rhs(jsize)
-c else assume U(jsize) is loaded in un pack backsub_info
-c so just use it
-c after call u(jstart) will be sent to next cell
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer first, last, c, i, k
- integer m,n,j,jsize,isize,ksize,jstart
-
- jstart = 0
- isize = cell_size(1,c)-end(1,c)-1
- jsize = cell_size(2,c)-1
- ksize = cell_size(3,c)-end(3,c)-1
- if (last .eq. 0) then
- do k=start(3,c),ksize
- do i=start(1,c),isize
-c---------------------------------------------------------------------
-c U(jsize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c)
- > - lhsc(m,n,i,jsize,k,c)*
- > backsub_info(n,i,k,c)
- enddo
- enddo
- enddo
- enddo
- endif
- do k=start(3,c),ksize
- do j=jsize-1,jstart,-1
- do i=start(1,c),isize
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
- > - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c)
- enddo
- enddo
- enddo
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c performs guaussian elimination on this cell.
-c
-c assumes that unpacking routines for non-first cells
-c preload C' and rhs' from previous cell.
-c
-c assumed send happens outside this routine, but that
-c c'(JMAX) and rhs'(JMAX) will be sent to next cell
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'work_lhs_vec.h'
-
- integer first,last,c
- integer i,j,k,m,n,isize,ksize,jsize,jstart
-
- jstart = 0
- isize = cell_size(1,c)-end(1,c)-1
- jsize = cell_size(2,c)-1
- ksize = cell_size(3,c)-end(3,c)-1
-
-c---------------------------------------------------------------------
-c zero the left hand side for starters
-c set diagonal values to 1. This is overkill, but convenient
-c---------------------------------------------------------------------
- do i = 0, isize
- do m = 1, 5
- do n = 1, 5
- lhsa(m,n,i,0) = 0.0d0
- lhsb(m,n,i,0) = 0.0d0
- lhsa(m,n,i,jsize) = 0.0d0
- lhsb(m,n,i,jsize) = 0.0d0
- enddo
- lhsb(m,m,i,0) = 1.0d0
- lhsb(m,m,i,jsize) = 1.0d0
- enddo
- enddo
-
- do k=start(3,c),ksize
-
-c---------------------------------------------------------------------
-c This function computes the left hand side for the three y-factors
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Compute the indices for storing the tri-diagonal matrix;
-c determine a (labeled f) and n jacobians for cell c
-c---------------------------------------------------------------------
-
- do j = start(2,c)-1, cell_size(2,c)-end(2,c)
- do i=start(1,c),isize
-
- tmp1 = 1.0d0 / u(1,i,j,k,c)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- fjac(1,1,i,j) = 0.0d+00
- fjac(1,2,i,j) = 0.0d+00
- fjac(1,3,i,j) = 1.0d+00
- fjac(1,4,i,j) = 0.0d+00
- fjac(1,5,i,j) = 0.0d+00
-
- fjac(2,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) )
- > * tmp2
- fjac(2,2,i,j) = u(3,i,j,k,c) * tmp1
- fjac(2,3,i,j) = u(2,i,j,k,c) * tmp1
- fjac(2,4,i,j) = 0.0d+00
- fjac(2,5,i,j) = 0.0d+00
-
- fjac(3,1,i,j) = - ( u(3,i,j,k,c)*u(3,i,j,k,c)*tmp2)
- > + c2 * qs(i,j,k,c)
- fjac(3,2,i,j) = - c2 * u(2,i,j,k,c) * tmp1
- fjac(3,3,i,j) = ( 2.0d+00 - c2 )
- > * u(3,i,j,k,c) * tmp1
- fjac(3,4,i,j) = - c2 * u(4,i,j,k,c) * tmp1
- fjac(3,5,i,j) = c2
-
- fjac(4,1,i,j) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) )
- > * tmp2
- fjac(4,2,i,j) = 0.0d+00
- fjac(4,3,i,j) = u(4,i,j,k,c) * tmp1
- fjac(4,4,i,j) = u(3,i,j,k,c) * tmp1
- fjac(4,5,i,j) = 0.0d+00
-
- fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c)
- > - c1 * u(5,i,j,k,c) * tmp1 )
- > * u(3,i,j,k,c) * tmp1
- fjac(5,2,i,j) = - c2 * u(2,i,j,k,c)*u(3,i,j,k,c)
- > * tmp2
- fjac(5,3,i,j) = c1 * u(5,i,j,k,c) * tmp1
- > - c2 * ( qs(i,j,k,c)
- > + u(3,i,j,k,c)*u(3,i,j,k,c) * tmp2 )
- fjac(5,4,i,j) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) )
- > * tmp2
- fjac(5,5,i,j) = c1 * u(3,i,j,k,c) * tmp1
-
- njac(1,1,i,j) = 0.0d+00
- njac(1,2,i,j) = 0.0d+00
- njac(1,3,i,j) = 0.0d+00
- njac(1,4,i,j) = 0.0d+00
- njac(1,5,i,j) = 0.0d+00
-
- njac(2,1,i,j) = - c3c4 * tmp2 * u(2,i,j,k,c)
- njac(2,2,i,j) = c3c4 * tmp1
- njac(2,3,i,j) = 0.0d+00
- njac(2,4,i,j) = 0.0d+00
- njac(2,5,i,j) = 0.0d+00
-
- njac(3,1,i,j) = - con43 * c3c4 * tmp2 * u(3,i,j,k,c)
- njac(3,2,i,j) = 0.0d+00
- njac(3,3,i,j) = con43 * c3c4 * tmp1
- njac(3,4,i,j) = 0.0d+00
- njac(3,5,i,j) = 0.0d+00
-
- njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c)
- njac(4,2,i,j) = 0.0d+00
- njac(4,3,i,j) = 0.0d+00
- njac(4,4,i,j) = c3c4 * tmp1
- njac(4,5,i,j) = 0.0d+00
-
- njac(5,1,i,j) = - ( c3c4
- > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
- > - ( con43 * c3c4
- > - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
- > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
- > - c1345 * tmp2 * u(5,i,j,k,c)
-
- njac(5,2,i,j) = ( c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c)
- njac(5,3,i,j) = ( con43 * c3c4
- > - c1345 ) * tmp2 * u(3,i,j,k,c)
- njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
- njac(5,5,i,j) = ( c1345 ) * tmp1
-
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c now joacobians set, so form left hand side in y direction
-c---------------------------------------------------------------------
- do j = start(2,c), jsize-end(2,c)
- do i=start(1,c),isize
-
- tmp1 = dt * ty1
- tmp2 = dt * ty2
-
- lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i,j-1)
- > - tmp1 * njac(1,1,i,j-1)
- > - tmp1 * dy1
- lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i,j-1)
- > - tmp1 * njac(1,2,i,j-1)
- lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i,j-1)
- > - tmp1 * njac(1,3,i,j-1)
- lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i,j-1)
- > - tmp1 * njac(1,4,i,j-1)
- lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i,j-1)
- > - tmp1 * njac(1,5,i,j-1)
-
- lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i,j-1)
- > - tmp1 * njac(2,1,i,j-1)
- lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i,j-1)
- > - tmp1 * njac(2,2,i,j-1)
- > - tmp1 * dy2
- lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i,j-1)
- > - tmp1 * njac(2,3,i,j-1)
- lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i,j-1)
- > - tmp1 * njac(2,4,i,j-1)
- lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i,j-1)
- > - tmp1 * njac(2,5,i,j-1)
-
- lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i,j-1)
- > - tmp1 * njac(3,1,i,j-1)
- lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i,j-1)
- > - tmp1 * njac(3,2,i,j-1)
- lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i,j-1)
- > - tmp1 * njac(3,3,i,j-1)
- > - tmp1 * dy3
- lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i,j-1)
- > - tmp1 * njac(3,4,i,j-1)
- lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i,j-1)
- > - tmp1 * njac(3,5,i,j-1)
-
- lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i,j-1)
- > - tmp1 * njac(4,1,i,j-1)
- lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i,j-1)
- > - tmp1 * njac(4,2,i,j-1)
- lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i,j-1)
- > - tmp1 * njac(4,3,i,j-1)
- lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i,j-1)
- > - tmp1 * njac(4,4,i,j-1)
- > - tmp1 * dy4
- lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i,j-1)
- > - tmp1 * njac(4,5,i,j-1)
-
- lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i,j-1)
- > - tmp1 * njac(5,1,i,j-1)
- lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i,j-1)
- > - tmp1 * njac(5,2,i,j-1)
- lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i,j-1)
- > - tmp1 * njac(5,3,i,j-1)
- lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i,j-1)
- > - tmp1 * njac(5,4,i,j-1)
- lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i,j-1)
- > - tmp1 * njac(5,5,i,j-1)
- > - tmp1 * dy5
-
- lhsb(1,1,i,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(1,1,i,j)
- > + tmp1 * 2.0d+00 * dy1
- lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j)
- lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j)
- lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j)
- lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j)
-
- lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j)
- lhsb(2,2,i,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(2,2,i,j)
- > + tmp1 * 2.0d+00 * dy2
- lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j)
- lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j)
- lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j)
-
- lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j)
- lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j)
- lhsb(3,3,i,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(3,3,i,j)
- > + tmp1 * 2.0d+00 * dy3
- lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j)
- lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j)
-
- lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j)
- lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j)
- lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j)
- lhsb(4,4,i,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(4,4,i,j)
- > + tmp1 * 2.0d+00 * dy4
- lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j)
-
- lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j)
- lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j)
- lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j)
- lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j)
- lhsb(5,5,i,j) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(5,5,i,j)
- > + tmp1 * 2.0d+00 * dy5
-
- lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i,j+1)
- > - tmp1 * njac(1,1,i,j+1)
- > - tmp1 * dy1
- lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i,j+1)
- > - tmp1 * njac(1,2,i,j+1)
- lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i,j+1)
- > - tmp1 * njac(1,3,i,j+1)
- lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i,j+1)
- > - tmp1 * njac(1,4,i,j+1)
- lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i,j+1)
- > - tmp1 * njac(1,5,i,j+1)
-
- lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i,j+1)
- > - tmp1 * njac(2,1,i,j+1)
- lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i,j+1)
- > - tmp1 * njac(2,2,i,j+1)
- > - tmp1 * dy2
- lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i,j+1)
- > - tmp1 * njac(2,3,i,j+1)
- lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i,j+1)
- > - tmp1 * njac(2,4,i,j+1)
- lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i,j+1)
- > - tmp1 * njac(2,5,i,j+1)
-
- lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i,j+1)
- > - tmp1 * njac(3,1,i,j+1)
- lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i,j+1)
- > - tmp1 * njac(3,2,i,j+1)
- lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i,j+1)
- > - tmp1 * njac(3,3,i,j+1)
- > - tmp1 * dy3
- lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i,j+1)
- > - tmp1 * njac(3,4,i,j+1)
- lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i,j+1)
- > - tmp1 * njac(3,5,i,j+1)
-
- lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i,j+1)
- > - tmp1 * njac(4,1,i,j+1)
- lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i,j+1)
- > - tmp1 * njac(4,2,i,j+1)
- lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i,j+1)
- > - tmp1 * njac(4,3,i,j+1)
- lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i,j+1)
- > - tmp1 * njac(4,4,i,j+1)
- > - tmp1 * dy4
- lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i,j+1)
- > - tmp1 * njac(4,5,i,j+1)
-
- lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i,j+1)
- > - tmp1 * njac(5,1,i,j+1)
- lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i,j+1)
- > - tmp1 * njac(5,2,i,j+1)
- lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i,j+1)
- > - tmp1 * njac(5,3,i,j+1)
- lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i,j+1)
- > - tmp1 * njac(5,4,i,j+1)
- lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i,j+1)
- > - tmp1 * njac(5,5,i,j+1)
- > - tmp1 * dy5
-
- enddo
- enddo
-
-
-c---------------------------------------------------------------------
-c outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
- if (first .eq. 1) then
-
-c---------------------------------------------------------------------
-c multiply c(i,jstart,k) by b_inverse and copy back to c
-c multiply rhs(jstart) by b_inverse(jstart) and copy to rhs
-c---------------------------------------------------------------------
-!dir$ ivdep
- do i=start(1,c),isize
- call binvcrhs( lhsb(1,1,i,jstart),
- > lhsc(1,1,i,jstart,k,c),
- > rhs(1,i,jstart,k,c) )
- enddo
-
- endif
-
-c---------------------------------------------------------------------
-c begin inner most do loop
-c do all the elements of the cell unless last
-c---------------------------------------------------------------------
- do j=jstart+first,jsize-last
-!dir$ ivdep
- do i=start(1,c),isize
-
-c---------------------------------------------------------------------
-c subtract A*lhs_vector(j-1) from lhs_vector(j)
-c
-c rhs(j) = rhs(j) - A*rhs(j-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,i,j),
- > rhs(1,i,j-1,k,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c B(j) = B(j) - C(j-1)*A(j)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,i,j),
- > lhsc(1,1,i,j-1,k,c),
- > lhsb(1,1,i,j))
-
-c---------------------------------------------------------------------
-c multiply c(i,j,k) by b_inverse and copy back to c
-c multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs
-c---------------------------------------------------------------------
- call binvcrhs( lhsb(1,1,i,j),
- > lhsc(1,1,i,j,k,c),
- > rhs(1,i,j,k,c) )
-
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c Now finish up special cases for last cell
-c---------------------------------------------------------------------
- if (last .eq. 1) then
-
-!dir$ ivdep
- do i=start(1,c),isize
-c---------------------------------------------------------------------
-c rhs(jsize) = rhs(jsize) - A*rhs(jsize-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,i,jsize),
- > rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c))
-
-c---------------------------------------------------------------------
-c B(jsize) = B(jsize) - C(jsize-1)*A(jsize)
-c call matmul_sub(aa,i,jsize,k,c,
-c $ cc,i,jsize-1,k,c,bb,i,jsize,k,c)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,i,jsize),
- > lhsc(1,1,i,jsize-1,k,c),
- > lhsb(1,1,i,jsize))
-
-c---------------------------------------------------------------------
-c multiply rhs(jsize) by b_inverse(jsize) and copy to rhs
-c---------------------------------------------------------------------
- call binvrhs( lhsb(1,1,i,jsize),
- > rhs(1,i,jsize,k,c) )
- enddo
-
- endif
- enddo
-
-
- return
- end
-
-
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Performs line solves in Z direction by first factoring
-c the block-tridiagonal matrix into an upper triangular matrix,
-c and then performing back substitution to solve for the unknow
-c vectors of each line.
-c
-c Make sure we treat elements zero to cell_size in the direction
-c of the sweep.
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer c, kstart, stage,
- > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
- > isize,jsize,ksize,send_id
-
- kstart = 0
-
-c---------------------------------------------------------------------
-c in our terminology stage is the number of the cell in the y-direction
-c i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
- do stage = 1,ncells
- c = slice(3,stage)
- isize = cell_size(1,c) - 1
- jsize = cell_size(2,c) - 1
- ksize = cell_size(3,c) - 1
-c---------------------------------------------------------------------
-c set last-cell flag
-c---------------------------------------------------------------------
- if (stage .eq. ncells) then
- last = 1
- else
- last = 0
- endif
-
- if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
- first = 1
-c call lhsz(c)
- call z_solve_cell(first,last,c)
- else
-c---------------------------------------------------------------------
-c Not the first cell of this line, so receive info from
-c processor working on preceeding cell
-c---------------------------------------------------------------------
- first = 0
- call z_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c overlap computations and communications
-c---------------------------------------------------------------------
-c call lhsz(c)
-c---------------------------------------------------------------------
-c wait for completion
-c---------------------------------------------------------------------
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c install C'(kstart+1) and rhs'(kstart+1) to be used in this cell
-c---------------------------------------------------------------------
- call z_unpack_solve_info(c)
- call z_solve_cell(first,last,c)
- endif
-
- if (last .eq. 0) call z_send_solve_info(send_id,c)
- enddo
-
-c---------------------------------------------------------------------
-c now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
- do stage = ncells, 1, -1
- c = slice(3,stage)
- first = 0
- last = 0
- if (stage .eq. 1) first = 1
- if (stage .eq. ncells) then
- last = 1
-c---------------------------------------------------------------------
-c last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
- call z_backsubstitute(first, last,c)
- else
- call z_receive_backsub_info(recv_id,c)
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
- call z_unpack_backsub_info(c)
- call z_backsubstitute(first,last,c)
- endif
- if (first .eq. 0) call z_send_backsub_info(send_id,c)
- enddo
-
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_unpack_solve_info(c)
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack C'(-1) and rhs'(-1) for
-c all i and j
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer i,j,m,n,ptr,c,kstart
-
- kstart = 0
- ptr = 0
- do j=0,JMAX-1
- do i=0,IMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send C'(kend) and rhs'(kend) for
-c all i and j
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i,j,m,n,ksize,ptr,c,ip,jp
- integer error,send_id,buffer_size
-
- ksize = cell_size(3,c)-1
- ip = cell_coord(1,c) - 1
- jp = cell_coord(2,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c pack up buffer
-c---------------------------------------------------------------------
- ptr = 0
- do j=0,JMAX-1
- do i=0,IMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,i,j,ksize,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c send buffer
-c---------------------------------------------------------------------
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, successor(3),
- > BOTTOM+ip+jp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send U(jstart) for all i and j
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i,j,n,ptr,c,kstart,ip,jp
- integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c Send element 0 to previous processor
-c---------------------------------------------------------------------
- kstart = 0
- ip = cell_coord(1,c)-1
- jp = cell_coord(2,c)-1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- ptr = 0
- do j=0,JMAX-1
- do i=0,IMAX-1
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,i,j,kstart,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, predecessor(3),
- > TOP+ip+jp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack U(ksize) for all i and j
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer i,j,n,ptr,c
-
- ptr = 0
- do j=0,JMAX-1
- do i=0,IMAX-1
- do n=1,BLOCK_SIZE
- backsub_info(n,i,j,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer error,recv_id,ip,jp,c,buffer_size
- ip = cell_coord(1,c) - 1
- jp = cell_coord(2,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, successor(3),
- > TOP+ip+jp*NCELLS, comm_solve,
- > recv_id, error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer ip,jp,recv_id,error,c,buffer_size
- ip = cell_coord(1,c) - 1
- jp = cell_coord(2,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, predecessor(3),
- > BOTTOM+ip+jp*NCELLS, comm_solve,
- > recv_id, error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c back solve: if last cell, then generate U(ksize)=rhs(ksize)
-c else assume U(ksize) is loaded in un pack backsub_info
-c so just use it
-c after call u(kstart) will be sent to next cell
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer first, last, c, i, k
- integer m,n,j,jsize,isize,ksize,kstart
-
- kstart = 0
- isize = cell_size(1,c)-end(1,c)-1
- jsize = cell_size(2,c)-end(2,c)-1
- ksize = cell_size(3,c)-1
- if (last .eq. 0) then
- do j=start(2,c),jsize
- do i=start(1,c),isize
-c---------------------------------------------------------------------
-c U(jsize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c)
- > - lhsc(m,n,i,j,ksize,c)*
- > backsub_info(n,i,j,c)
- enddo
- enddo
- enddo
- enddo
- endif
- do k=ksize-1,kstart,-1
- do j=start(2,c),jsize
- do i=start(1,c),isize
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
- > - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c)
- enddo
- enddo
- enddo
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c performs guaussian elimination on this cell.
-c
-c assumes that unpacking routines for non-first cells
-c preload C' and rhs' from previous cell.
-c
-c assumed send happens outside this routine, but that
-c c'(KMAX) and rhs'(KMAX) will be sent to next cell.
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'work_lhs.h'
-
- integer first,last,c
- integer i,j,k,isize,ksize,jsize,kstart
- double precision utmp(6,-2:KMAX+1)
-
- kstart = 0
- isize = cell_size(1,c)-end(1,c)-1
- jsize = cell_size(2,c)-end(2,c)-1
- ksize = cell_size(3,c)-1
-
- call lhsabinit(lhsa, lhsb, ksize)
-
- do j=start(2,c),jsize
- do i=start(1,c),isize
-
-c---------------------------------------------------------------------
-c This function computes the left hand side for the three z-factors
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Compute the indices for storing the block-diagonal matrix;
-c determine c (labeled f) and s jacobians for cell c
-c---------------------------------------------------------------------
- do k = start(3,c)-1, cell_size(3,c)-end(3,c)
- utmp(1,k) = 1.0d0 / u(1,i,j,k,c)
- utmp(2,k) = u(2,i,j,k,c)
- utmp(3,k) = u(3,i,j,k,c)
- utmp(4,k) = u(4,i,j,k,c)
- utmp(5,k) = u(5,i,j,k,c)
- utmp(6,k) = qs(i,j,k,c)
- end do
-
- do k = start(3,c)-1, cell_size(3,c)-end(3,c)
-
- tmp1 = utmp(1,k)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- fjac(1,1,k) = 0.0d+00
- fjac(1,2,k) = 0.0d+00
- fjac(1,3,k) = 0.0d+00
- fjac(1,4,k) = 1.0d+00
- fjac(1,5,k) = 0.0d+00
-
- fjac(2,1,k) = - ( utmp(2,k)*utmp(4,k) )
- > * tmp2
- fjac(2,2,k) = utmp(4,k) * tmp1
- fjac(2,3,k) = 0.0d+00
- fjac(2,4,k) = utmp(2,k) * tmp1
- fjac(2,5,k) = 0.0d+00
-
- fjac(3,1,k) = - ( utmp(3,k)*utmp(4,k) )
- > * tmp2
- fjac(3,2,k) = 0.0d+00
- fjac(3,3,k) = utmp(4,k) * tmp1
- fjac(3,4,k) = utmp(3,k) * tmp1
- fjac(3,5,k) = 0.0d+00
-
- fjac(4,1,k) = - (utmp(4,k)*utmp(4,k) * tmp2 )
- > + c2 * utmp(6,k)
- fjac(4,2,k) = - c2 * utmp(2,k) * tmp1
- fjac(4,3,k) = - c2 * utmp(3,k) * tmp1
- fjac(4,4,k) = ( 2.0d+00 - c2 )
- > * utmp(4,k) * tmp1
- fjac(4,5,k) = c2
-
- fjac(5,1,k) = ( c2 * 2.0d0 * utmp(6,k)
- > - c1 * ( utmp(5,k) * tmp1 ) )
- > * ( utmp(4,k) * tmp1 )
- fjac(5,2,k) = - c2 * ( utmp(2,k)*utmp(4,k) )
- > * tmp2
- fjac(5,3,k) = - c2 * ( utmp(3,k)*utmp(4,k) )
- > * tmp2
- fjac(5,4,k) = c1 * ( utmp(5,k) * tmp1 )
- > - c2 * ( utmp(6,k)
- > + utmp(4,k)*utmp(4,k) * tmp2 )
- fjac(5,5,k) = c1 * utmp(4,k) * tmp1
-
- njac(1,1,k) = 0.0d+00
- njac(1,2,k) = 0.0d+00
- njac(1,3,k) = 0.0d+00
- njac(1,4,k) = 0.0d+00
- njac(1,5,k) = 0.0d+00
-
- njac(2,1,k) = - c3c4 * tmp2 * utmp(2,k)
- njac(2,2,k) = c3c4 * tmp1
- njac(2,3,k) = 0.0d+00
- njac(2,4,k) = 0.0d+00
- njac(2,5,k) = 0.0d+00
-
- njac(3,1,k) = - c3c4 * tmp2 * utmp(3,k)
- njac(3,2,k) = 0.0d+00
- njac(3,3,k) = c3c4 * tmp1
- njac(3,4,k) = 0.0d+00
- njac(3,5,k) = 0.0d+00
-
- njac(4,1,k) = - con43 * c3c4 * tmp2 * utmp(4,k)
- njac(4,2,k) = 0.0d+00
- njac(4,3,k) = 0.0d+00
- njac(4,4,k) = con43 * c3 * c4 * tmp1
- njac(4,5,k) = 0.0d+00
-
- njac(5,1,k) = - ( c3c4
- > - c1345 ) * tmp3 * (utmp(2,k)**2)
- > - ( c3c4 - c1345 ) * tmp3 * (utmp(3,k)**2)
- > - ( con43 * c3c4
- > - c1345 ) * tmp3 * (utmp(4,k)**2)
- > - c1345 * tmp2 * utmp(5,k)
-
- njac(5,2,k) = ( c3c4 - c1345 ) * tmp2 * utmp(2,k)
- njac(5,3,k) = ( c3c4 - c1345 ) * tmp2 * utmp(3,k)
- njac(5,4,k) = ( con43 * c3c4
- > - c1345 ) * tmp2 * utmp(4,k)
- njac(5,5,k) = ( c1345 )* tmp1
-
-
- enddo
-
-c---------------------------------------------------------------------
-c now joacobians set, so form left hand side in z direction
-c---------------------------------------------------------------------
- do k = start(3,c), ksize-end(3,c)
-
- tmp1 = dt * tz1
- tmp2 = dt * tz2
-
- lhsa(1,1,k) = - tmp2 * fjac(1,1,k-1)
- > - tmp1 * njac(1,1,k-1)
- > - tmp1 * dz1
- lhsa(1,2,k) = - tmp2 * fjac(1,2,k-1)
- > - tmp1 * njac(1,2,k-1)
- lhsa(1,3,k) = - tmp2 * fjac(1,3,k-1)
- > - tmp1 * njac(1,3,k-1)
- lhsa(1,4,k) = - tmp2 * fjac(1,4,k-1)
- > - tmp1 * njac(1,4,k-1)
- lhsa(1,5,k) = - tmp2 * fjac(1,5,k-1)
- > - tmp1 * njac(1,5,k-1)
-
- lhsa(2,1,k) = - tmp2 * fjac(2,1,k-1)
- > - tmp1 * njac(2,1,k-1)
- lhsa(2,2,k) = - tmp2 * fjac(2,2,k-1)
- > - tmp1 * njac(2,2,k-1)
- > - tmp1 * dz2
- lhsa(2,3,k) = - tmp2 * fjac(2,3,k-1)
- > - tmp1 * njac(2,3,k-1)
- lhsa(2,4,k) = - tmp2 * fjac(2,4,k-1)
- > - tmp1 * njac(2,4,k-1)
- lhsa(2,5,k) = - tmp2 * fjac(2,5,k-1)
- > - tmp1 * njac(2,5,k-1)
-
- lhsa(3,1,k) = - tmp2 * fjac(3,1,k-1)
- > - tmp1 * njac(3,1,k-1)
- lhsa(3,2,k) = - tmp2 * fjac(3,2,k-1)
- > - tmp1 * njac(3,2,k-1)
- lhsa(3,3,k) = - tmp2 * fjac(3,3,k-1)
- > - tmp1 * njac(3,3,k-1)
- > - tmp1 * dz3
- lhsa(3,4,k) = - tmp2 * fjac(3,4,k-1)
- > - tmp1 * njac(3,4,k-1)
- lhsa(3,5,k) = - tmp2 * fjac(3,5,k-1)
- > - tmp1 * njac(3,5,k-1)
-
- lhsa(4,1,k) = - tmp2 * fjac(4,1,k-1)
- > - tmp1 * njac(4,1,k-1)
- lhsa(4,2,k) = - tmp2 * fjac(4,2,k-1)
- > - tmp1 * njac(4,2,k-1)
- lhsa(4,3,k) = - tmp2 * fjac(4,3,k-1)
- > - tmp1 * njac(4,3,k-1)
- lhsa(4,4,k) = - tmp2 * fjac(4,4,k-1)
- > - tmp1 * njac(4,4,k-1)
- > - tmp1 * dz4
- lhsa(4,5,k) = - tmp2 * fjac(4,5,k-1)
- > - tmp1 * njac(4,5,k-1)
-
- lhsa(5,1,k) = - tmp2 * fjac(5,1,k-1)
- > - tmp1 * njac(5,1,k-1)
- lhsa(5,2,k) = - tmp2 * fjac(5,2,k-1)
- > - tmp1 * njac(5,2,k-1)
- lhsa(5,3,k) = - tmp2 * fjac(5,3,k-1)
- > - tmp1 * njac(5,3,k-1)
- lhsa(5,4,k) = - tmp2 * fjac(5,4,k-1)
- > - tmp1 * njac(5,4,k-1)
- lhsa(5,5,k) = - tmp2 * fjac(5,5,k-1)
- > - tmp1 * njac(5,5,k-1)
- > - tmp1 * dz5
-
- lhsb(1,1,k) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(1,1,k)
- > + tmp1 * 2.0d+00 * dz1
- lhsb(1,2,k) = tmp1 * 2.0d+00 * njac(1,2,k)
- lhsb(1,3,k) = tmp1 * 2.0d+00 * njac(1,3,k)
- lhsb(1,4,k) = tmp1 * 2.0d+00 * njac(1,4,k)
- lhsb(1,5,k) = tmp1 * 2.0d+00 * njac(1,5,k)
-
- lhsb(2,1,k) = tmp1 * 2.0d+00 * njac(2,1,k)
- lhsb(2,2,k) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(2,2,k)
- > + tmp1 * 2.0d+00 * dz2
- lhsb(2,3,k) = tmp1 * 2.0d+00 * njac(2,3,k)
- lhsb(2,4,k) = tmp1 * 2.0d+00 * njac(2,4,k)
- lhsb(2,5,k) = tmp1 * 2.0d+00 * njac(2,5,k)
-
- lhsb(3,1,k) = tmp1 * 2.0d+00 * njac(3,1,k)
- lhsb(3,2,k) = tmp1 * 2.0d+00 * njac(3,2,k)
- lhsb(3,3,k) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(3,3,k)
- > + tmp1 * 2.0d+00 * dz3
- lhsb(3,4,k) = tmp1 * 2.0d+00 * njac(3,4,k)
- lhsb(3,5,k) = tmp1 * 2.0d+00 * njac(3,5,k)
-
- lhsb(4,1,k) = tmp1 * 2.0d+00 * njac(4,1,k)
- lhsb(4,2,k) = tmp1 * 2.0d+00 * njac(4,2,k)
- lhsb(4,3,k) = tmp1 * 2.0d+00 * njac(4,3,k)
- lhsb(4,4,k) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(4,4,k)
- > + tmp1 * 2.0d+00 * dz4
- lhsb(4,5,k) = tmp1 * 2.0d+00 * njac(4,5,k)
-
- lhsb(5,1,k) = tmp1 * 2.0d+00 * njac(5,1,k)
- lhsb(5,2,k) = tmp1 * 2.0d+00 * njac(5,2,k)
- lhsb(5,3,k) = tmp1 * 2.0d+00 * njac(5,3,k)
- lhsb(5,4,k) = tmp1 * 2.0d+00 * njac(5,4,k)
- lhsb(5,5,k) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(5,5,k)
- > + tmp1 * 2.0d+00 * dz5
-
- lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,k+1)
- > - tmp1 * njac(1,1,k+1)
- > - tmp1 * dz1
- lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,k+1)
- > - tmp1 * njac(1,2,k+1)
- lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,k+1)
- > - tmp1 * njac(1,3,k+1)
- lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,k+1)
- > - tmp1 * njac(1,4,k+1)
- lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,k+1)
- > - tmp1 * njac(1,5,k+1)
-
- lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,k+1)
- > - tmp1 * njac(2,1,k+1)
- lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,k+1)
- > - tmp1 * njac(2,2,k+1)
- > - tmp1 * dz2
- lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,k+1)
- > - tmp1 * njac(2,3,k+1)
- lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,k+1)
- > - tmp1 * njac(2,4,k+1)
- lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,k+1)
- > - tmp1 * njac(2,5,k+1)
-
- lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,k+1)
- > - tmp1 * njac(3,1,k+1)
- lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,k+1)
- > - tmp1 * njac(3,2,k+1)
- lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,k+1)
- > - tmp1 * njac(3,3,k+1)
- > - tmp1 * dz3
- lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,k+1)
- > - tmp1 * njac(3,4,k+1)
- lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,k+1)
- > - tmp1 * njac(3,5,k+1)
-
- lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,k+1)
- > - tmp1 * njac(4,1,k+1)
- lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,k+1)
- > - tmp1 * njac(4,2,k+1)
- lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,k+1)
- > - tmp1 * njac(4,3,k+1)
- lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,k+1)
- > - tmp1 * njac(4,4,k+1)
- > - tmp1 * dz4
- lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,k+1)
- > - tmp1 * njac(4,5,k+1)
-
- lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,k+1)
- > - tmp1 * njac(5,1,k+1)
- lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,k+1)
- > - tmp1 * njac(5,2,k+1)
- lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,k+1)
- > - tmp1 * njac(5,3,k+1)
- lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,k+1)
- > - tmp1 * njac(5,4,k+1)
- lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,k+1)
- > - tmp1 * njac(5,5,k+1)
- > - tmp1 * dz5
-
- enddo
-
-
-c---------------------------------------------------------------------
-c outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
- if (first .eq. 1) then
-
-c---------------------------------------------------------------------
-c multiply c(i,j,kstart) by b_inverse and copy back to c
-c multiply rhs(kstart) by b_inverse(kstart) and copy to rhs
-c---------------------------------------------------------------------
- call binvcrhs( lhsb(1,1,kstart),
- > lhsc(1,1,i,j,kstart,c),
- > rhs(1,i,j,kstart,c) )
-
- endif
-
-c---------------------------------------------------------------------
-c begin inner most do loop
-c do all the elements of the cell unless last
-c---------------------------------------------------------------------
- do k=kstart+first,ksize-last
-
-c---------------------------------------------------------------------
-c subtract A*lhs_vector(k-1) from lhs_vector(k)
-c
-c rhs(k) = rhs(k) - A*rhs(k-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,k),
- > rhs(1,i,j,k-1,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c B(k) = B(k) - C(k-1)*A(k)
-c call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,k),
- > lhsc(1,1,i,j,k-1,c),
- > lhsb(1,1,k))
-
-c---------------------------------------------------------------------
-c multiply c(i,j,k) by b_inverse and copy back to c
-c multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs
-c---------------------------------------------------------------------
- call binvcrhs( lhsb(1,1,k),
- > lhsc(1,1,i,j,k,c),
- > rhs(1,i,j,k,c) )
-
- enddo
-
-c---------------------------------------------------------------------
-c Now finish up special cases for last cell
-c---------------------------------------------------------------------
- if (last .eq. 1) then
-
-c---------------------------------------------------------------------
-c rhs(ksize) = rhs(ksize) - A*rhs(ksize-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,ksize),
- > rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c))
-
-c---------------------------------------------------------------------
-c B(ksize) = B(ksize) - C(ksize-1)*A(ksize)
-c call matmul_sub(aa,i,j,ksize,c,
-c $ cc,i,j,ksize-1,c,bb,i,j,ksize,c)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,ksize),
- > lhsc(1,1,i,j,ksize-1,c),
- > lhsb(1,1,ksize))
-
-c---------------------------------------------------------------------
-c multiply rhs(ksize) by b_inverse(ksize) and copy to rhs
-c---------------------------------------------------------------------
- call binvrhs( lhsb(1,1,ksize),
- > rhs(1,i,j,ksize,c) )
-
- endif
- enddo
- enddo
-
-
- return
- end
-
-
-
-
-
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Performs line solves in Z direction by first factoring
-c the block-tridiagonal matrix into an upper triangular matrix,
-c and then performing back substitution to solve for the unknow
-c vectors of each line.
-c
-c Make sure we treat elements zero to cell_size in the direction
-c of the sweep.
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer c, kstart, stage,
- > first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
- > isize,jsize,ksize,send_id
-
- kstart = 0
-
-c---------------------------------------------------------------------
-c in our terminology stage is the number of the cell in the y-direct
-c i.e. stage = 1 means the start of the line stage=ncells means end
-c---------------------------------------------------------------------
- do stage = 1,ncells
- c = slice(3,stage)
- isize = cell_size(1,c) - 1
- jsize = cell_size(2,c) - 1
- ksize = cell_size(3,c) - 1
-c---------------------------------------------------------------------
-c set last-cell flag
-c---------------------------------------------------------------------
- if (stage .eq. ncells) then
- last = 1
- else
- last = 0
- endif
-
- if (stage .eq. 1) then
-c---------------------------------------------------------------------
-c This is the first cell, so solve without receiving data
-c---------------------------------------------------------------------
- first = 1
-c call lhsz(c)
- call z_solve_cell(first,last,c)
- else
-c---------------------------------------------------------------------
-c Not the first cell of this line, so receive info from
-c processor working on preceeding cell
-c---------------------------------------------------------------------
- first = 0
- call z_receive_solve_info(recv_id,c)
-c---------------------------------------------------------------------
-c overlap computations and communications
-c---------------------------------------------------------------------
-c call lhsz(c)
-c---------------------------------------------------------------------
-c wait for completion
-c---------------------------------------------------------------------
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
-c---------------------------------------------------------------------
-c install C'(kstart+1) and rhs'(kstart+1) to be used in this cell
-c---------------------------------------------------------------------
- call z_unpack_solve_info(c)
- call z_solve_cell(first,last,c)
- endif
-
- if (last .eq. 0) call z_send_solve_info(send_id,c)
- enddo
-
-c---------------------------------------------------------------------
-c now perform backsubstitution in reverse direction
-c---------------------------------------------------------------------
- do stage = ncells, 1, -1
- c = slice(3,stage)
- first = 0
- last = 0
- if (stage .eq. 1) first = 1
- if (stage .eq. ncells) then
- last = 1
-c---------------------------------------------------------------------
-c last cell, so perform back substitute without waiting
-c---------------------------------------------------------------------
- call z_backsubstitute(first, last,c)
- else
- call z_receive_backsub_info(recv_id,c)
- call mpi_wait(send_id,r_status,error)
- call mpi_wait(recv_id,r_status,error)
- call z_unpack_backsub_info(c)
- call z_backsubstitute(first,last,c)
- endif
- if (first .eq. 0) call z_send_backsub_info(send_id,c)
- enddo
-
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_unpack_solve_info(c)
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack C'(-1) and rhs'(-1) for
-c all i and j
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer i,j,m,n,ptr,c,kstart
-
- kstart = 0
- ptr = 0
- do j=0,JMAX-1
- do i=0,IMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_send_solve_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send C'(kend) and rhs'(kend) for
-c all i and j
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i,j,m,n,ksize,ptr,c,ip,jp
- integer error,send_id,buffer_size
-
- ksize = cell_size(3,c)-1
- ip = cell_coord(1,c) - 1
- jp = cell_coord(2,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
-
-c---------------------------------------------------------------------
-c pack up buffer
-c---------------------------------------------------------------------
- ptr = 0
- do j=0,JMAX-1
- do i=0,IMAX-1
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,i,j,ksize,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c send buffer
-c---------------------------------------------------------------------
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, successor(3),
- > BOTTOM+ip+jp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_send_backsub_info(send_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c pack up and send U(jstart) for all i and j
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i,j,n,ptr,c,kstart,ip,jp
- integer error,send_id,buffer_size
-
-c---------------------------------------------------------------------
-c Send element 0 to previous processor
-c---------------------------------------------------------------------
- kstart = 0
- ip = cell_coord(1,c)-1
- jp = cell_coord(2,c)-1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- ptr = 0
- do j=0,JMAX-1
- do i=0,IMAX-1
- do n=1,BLOCK_SIZE
- in_buffer(ptr+n) = rhs(n,i,j,kstart,c)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- call mpi_isend(in_buffer, buffer_size,
- > dp_type, predecessor(3),
- > TOP+ip+jp*NCELLS, comm_solve,
- > send_id,error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_unpack_backsub_info(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c unpack U(ksize) for all i and j
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer i,j,n,ptr,c
-
- ptr = 0
- do j=0,JMAX-1
- do i=0,IMAX-1
- do n=1,BLOCK_SIZE
- backsub_info(n,i,j,c) = out_buffer(ptr+n)
- enddo
- ptr = ptr+BLOCK_SIZE
- enddo
- enddo
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_receive_backsub_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer error,recv_id,ip,jp,c,buffer_size
- ip = cell_coord(1,c) - 1
- jp = cell_coord(2,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, successor(3),
- > TOP+ip+jp*NCELLS, comm_solve,
- > recv_id, error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_receive_solve_info(recv_id,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c post mpi receives
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer ip,jp,recv_id,error,c,buffer_size
- ip = cell_coord(1,c) - 1
- jp = cell_coord(2,c) - 1
- buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
- > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
- call mpi_irecv(out_buffer, buffer_size,
- > dp_type, predecessor(3),
- > BOTTOM+ip+jp*NCELLS, comm_solve,
- > recv_id, error)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_backsubstitute(first, last, c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c back solve: if last cell, then generate U(ksize)=rhs(ksize)
-c else assume U(ksize) is loaded in un pack backsub_info
-c so just use it
-c after call u(kstart) will be sent to next cell
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer first, last, c, i, k
- integer m,n,j,jsize,isize,ksize,kstart
-
- kstart = 0
- isize = cell_size(1,c)-end(1,c)-1
- jsize = cell_size(2,c)-end(2,c)-1
- ksize = cell_size(3,c)-1
- if (last .eq. 0) then
- do j=start(2,c),jsize
- do i=start(1,c),isize
-c---------------------------------------------------------------------
-c U(jsize) uses info from previous cell if not last cell
-c---------------------------------------------------------------------
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c)
- > - lhsc(m,n,i,j,ksize,c)*
- > backsub_info(n,i,j,c)
- enddo
- enddo
- enddo
- enddo
- endif
- do k=ksize-1,kstart,-1
- do j=start(2,c),jsize
- do i=start(1,c),isize
- do m=1,BLOCK_SIZE
- do n=1,BLOCK_SIZE
- rhs(m,i,j,k,c) = rhs(m,i,j,k,c)
- > - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c)
- enddo
- enddo
- enddo
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_solve_cell(first,last,c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c performs guaussian elimination on this cell.
-c
-c assumes that unpacking routines for non-first cells
-c preload C' and rhs' from previous cell.
-c
-c assumed send happens outside this routine, but that
-c c'(KMAX) and rhs'(KMAX) will be sent to next cell.
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'work_lhs_vec.h'
-
- integer first,last,c
- integer i,j,k,m,n,isize,ksize,jsize,kstart
-
- kstart = 0
- isize = cell_size(1,c)-end(1,c)-1
- jsize = cell_size(2,c)-end(2,c)-1
- ksize = cell_size(3,c)-1
-
-c---------------------------------------------------------------------
-c zero the left hand side for starters
-c set diagonal values to 1. This is overkill, but convenient
-c---------------------------------------------------------------------
- do i = 0, isize
- do m = 1, 5
- do n = 1, 5
- lhsa(m,n,i,0) = 0.0d0
- lhsb(m,n,i,0) = 0.0d0
- lhsa(m,n,i,ksize) = 0.0d0
- lhsb(m,n,i,ksize) = 0.0d0
- enddo
- lhsb(m,m,i,0) = 1.0d0
- lhsb(m,m,i,ksize) = 1.0d0
- enddo
- enddo
-
- do j=start(2,c),jsize
-
-c---------------------------------------------------------------------
-c This function computes the left hand side for the three z-factors
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Compute the indices for storing the block-diagonal matrix;
-c determine c (labeled f) and s jacobians for cell c
-c---------------------------------------------------------------------
-
- do k = start(3,c)-1, cell_size(3,c)-end(3,c)
- do i=start(1,c),isize
-
- tmp1 = 1.0d0 / u(1,i,j,k,c)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- fjac(1,1,i,k) = 0.0d+00
- fjac(1,2,i,k) = 0.0d+00
- fjac(1,3,i,k) = 0.0d+00
- fjac(1,4,i,k) = 1.0d+00
- fjac(1,5,i,k) = 0.0d+00
-
- fjac(2,1,i,k) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) )
- > * tmp2
- fjac(2,2,i,k) = u(4,i,j,k,c) * tmp1
- fjac(2,3,i,k) = 0.0d+00
- fjac(2,4,i,k) = u(2,i,j,k,c) * tmp1
- fjac(2,5,i,k) = 0.0d+00
-
- fjac(3,1,i,k) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) )
- > * tmp2
- fjac(3,2,i,k) = 0.0d+00
- fjac(3,3,i,k) = u(4,i,j,k,c) * tmp1
- fjac(3,4,i,k) = u(3,i,j,k,c) * tmp1
- fjac(3,5,i,k) = 0.0d+00
-
- fjac(4,1,i,k) = - (u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 )
- > + c2 * qs(i,j,k,c)
- fjac(4,2,i,k) = - c2 * u(2,i,j,k,c) * tmp1
- fjac(4,3,i,k) = - c2 * u(3,i,j,k,c) * tmp1
- fjac(4,4,i,k) = ( 2.0d+00 - c2 )
- > * u(4,i,j,k,c) * tmp1
- fjac(4,5,i,k) = c2
-
- fjac(5,1,i,k) = ( c2 * 2.0d0 * qs(i,j,k,c)
- > - c1 * ( u(5,i,j,k,c) * tmp1 ) )
- > * ( u(4,i,j,k,c) * tmp1 )
- fjac(5,2,i,k) = - c2 * ( u(2,i,j,k,c)*u(4,i,j,k,c) )
- > * tmp2
- fjac(5,3,i,k) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) )
- > * tmp2
- fjac(5,4,i,k) = c1 * ( u(5,i,j,k,c) * tmp1 )
- > - c2 * ( qs(i,j,k,c)
- > + u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 )
- fjac(5,5,i,k) = c1 * u(4,i,j,k,c) * tmp1
-
- njac(1,1,i,k) = 0.0d+00
- njac(1,2,i,k) = 0.0d+00
- njac(1,3,i,k) = 0.0d+00
- njac(1,4,i,k) = 0.0d+00
- njac(1,5,i,k) = 0.0d+00
-
- njac(2,1,i,k) = - c3c4 * tmp2 * u(2,i,j,k,c)
- njac(2,2,i,k) = c3c4 * tmp1
- njac(2,3,i,k) = 0.0d+00
- njac(2,4,i,k) = 0.0d+00
- njac(2,5,i,k) = 0.0d+00
-
- njac(3,1,i,k) = - c3c4 * tmp2 * u(3,i,j,k,c)
- njac(3,2,i,k) = 0.0d+00
- njac(3,3,i,k) = c3c4 * tmp1
- njac(3,4,i,k) = 0.0d+00
- njac(3,5,i,k) = 0.0d+00
-
- njac(4,1,i,k) = - con43 * c3c4 * tmp2 * u(4,i,j,k,c)
- njac(4,2,i,k) = 0.0d+00
- njac(4,3,i,k) = 0.0d+00
- njac(4,4,i,k) = con43 * c3 * c4 * tmp1
- njac(4,5,i,k) = 0.0d+00
-
- njac(5,1,i,k) = - ( c3c4
- > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
- > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
- > - ( con43 * c3c4
- > - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
- > - c1345 * tmp2 * u(5,i,j,k,c)
-
- njac(5,2,i,k) = ( c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c)
- njac(5,3,i,k) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
- njac(5,4,i,k) = ( con43 * c3c4
- > - c1345 ) * tmp2 * u(4,i,j,k,c)
- njac(5,5,i,k) = ( c1345 )* tmp1
-
-
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c now joacobians set, so form left hand side in z direction
-c---------------------------------------------------------------------
- do k = start(3,c), ksize-end(3,c)
- do i=start(1,c),isize
-
- tmp1 = dt * tz1
- tmp2 = dt * tz2
-
- lhsa(1,1,i,k) = - tmp2 * fjac(1,1,i,k-1)
- > - tmp1 * njac(1,1,i,k-1)
- > - tmp1 * dz1
- lhsa(1,2,i,k) = - tmp2 * fjac(1,2,i,k-1)
- > - tmp1 * njac(1,2,i,k-1)
- lhsa(1,3,i,k) = - tmp2 * fjac(1,3,i,k-1)
- > - tmp1 * njac(1,3,i,k-1)
- lhsa(1,4,i,k) = - tmp2 * fjac(1,4,i,k-1)
- > - tmp1 * njac(1,4,i,k-1)
- lhsa(1,5,i,k) = - tmp2 * fjac(1,5,i,k-1)
- > - tmp1 * njac(1,5,i,k-1)
-
- lhsa(2,1,i,k) = - tmp2 * fjac(2,1,i,k-1)
- > - tmp1 * njac(2,1,i,k-1)
- lhsa(2,2,i,k) = - tmp2 * fjac(2,2,i,k-1)
- > - tmp1 * njac(2,2,i,k-1)
- > - tmp1 * dz2
- lhsa(2,3,i,k) = - tmp2 * fjac(2,3,i,k-1)
- > - tmp1 * njac(2,3,i,k-1)
- lhsa(2,4,i,k) = - tmp2 * fjac(2,4,i,k-1)
- > - tmp1 * njac(2,4,i,k-1)
- lhsa(2,5,i,k) = - tmp2 * fjac(2,5,i,k-1)
- > - tmp1 * njac(2,5,i,k-1)
-
- lhsa(3,1,i,k) = - tmp2 * fjac(3,1,i,k-1)
- > - tmp1 * njac(3,1,i,k-1)
- lhsa(3,2,i,k) = - tmp2 * fjac(3,2,i,k-1)
- > - tmp1 * njac(3,2,i,k-1)
- lhsa(3,3,i,k) = - tmp2 * fjac(3,3,i,k-1)
- > - tmp1 * njac(3,3,i,k-1)
- > - tmp1 * dz3
- lhsa(3,4,i,k) = - tmp2 * fjac(3,4,i,k-1)
- > - tmp1 * njac(3,4,i,k-1)
- lhsa(3,5,i,k) = - tmp2 * fjac(3,5,i,k-1)
- > - tmp1 * njac(3,5,i,k-1)
-
- lhsa(4,1,i,k) = - tmp2 * fjac(4,1,i,k-1)
- > - tmp1 * njac(4,1,i,k-1)
- lhsa(4,2,i,k) = - tmp2 * fjac(4,2,i,k-1)
- > - tmp1 * njac(4,2,i,k-1)
- lhsa(4,3,i,k) = - tmp2 * fjac(4,3,i,k-1)
- > - tmp1 * njac(4,3,i,k-1)
- lhsa(4,4,i,k) = - tmp2 * fjac(4,4,i,k-1)
- > - tmp1 * njac(4,4,i,k-1)
- > - tmp1 * dz4
- lhsa(4,5,i,k) = - tmp2 * fjac(4,5,i,k-1)
- > - tmp1 * njac(4,5,i,k-1)
-
- lhsa(5,1,i,k) = - tmp2 * fjac(5,1,i,k-1)
- > - tmp1 * njac(5,1,i,k-1)
- lhsa(5,2,i,k) = - tmp2 * fjac(5,2,i,k-1)
- > - tmp1 * njac(5,2,i,k-1)
- lhsa(5,3,i,k) = - tmp2 * fjac(5,3,i,k-1)
- > - tmp1 * njac(5,3,i,k-1)
- lhsa(5,4,i,k) = - tmp2 * fjac(5,4,i,k-1)
- > - tmp1 * njac(5,4,i,k-1)
- lhsa(5,5,i,k) = - tmp2 * fjac(5,5,i,k-1)
- > - tmp1 * njac(5,5,i,k-1)
- > - tmp1 * dz5
-
- lhsb(1,1,i,k) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(1,1,i,k)
- > + tmp1 * 2.0d+00 * dz1
- lhsb(1,2,i,k) = tmp1 * 2.0d+00 * njac(1,2,i,k)
- lhsb(1,3,i,k) = tmp1 * 2.0d+00 * njac(1,3,i,k)
- lhsb(1,4,i,k) = tmp1 * 2.0d+00 * njac(1,4,i,k)
- lhsb(1,5,i,k) = tmp1 * 2.0d+00 * njac(1,5,i,k)
-
- lhsb(2,1,i,k) = tmp1 * 2.0d+00 * njac(2,1,i,k)
- lhsb(2,2,i,k) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(2,2,i,k)
- > + tmp1 * 2.0d+00 * dz2
- lhsb(2,3,i,k) = tmp1 * 2.0d+00 * njac(2,3,i,k)
- lhsb(2,4,i,k) = tmp1 * 2.0d+00 * njac(2,4,i,k)
- lhsb(2,5,i,k) = tmp1 * 2.0d+00 * njac(2,5,i,k)
-
- lhsb(3,1,i,k) = tmp1 * 2.0d+00 * njac(3,1,i,k)
- lhsb(3,2,i,k) = tmp1 * 2.0d+00 * njac(3,2,i,k)
- lhsb(3,3,i,k) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(3,3,i,k)
- > + tmp1 * 2.0d+00 * dz3
- lhsb(3,4,i,k) = tmp1 * 2.0d+00 * njac(3,4,i,k)
- lhsb(3,5,i,k) = tmp1 * 2.0d+00 * njac(3,5,i,k)
-
- lhsb(4,1,i,k) = tmp1 * 2.0d+00 * njac(4,1,i,k)
- lhsb(4,2,i,k) = tmp1 * 2.0d+00 * njac(4,2,i,k)
- lhsb(4,3,i,k) = tmp1 * 2.0d+00 * njac(4,3,i,k)
- lhsb(4,4,i,k) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(4,4,i,k)
- > + tmp1 * 2.0d+00 * dz4
- lhsb(4,5,i,k) = tmp1 * 2.0d+00 * njac(4,5,i,k)
-
- lhsb(5,1,i,k) = tmp1 * 2.0d+00 * njac(5,1,i,k)
- lhsb(5,2,i,k) = tmp1 * 2.0d+00 * njac(5,2,i,k)
- lhsb(5,3,i,k) = tmp1 * 2.0d+00 * njac(5,3,i,k)
- lhsb(5,4,i,k) = tmp1 * 2.0d+00 * njac(5,4,i,k)
- lhsb(5,5,i,k) = 1.0d+00
- > + tmp1 * 2.0d+00 * njac(5,5,i,k)
- > + tmp1 * 2.0d+00 * dz5
-
- lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i,k+1)
- > - tmp1 * njac(1,1,i,k+1)
- > - tmp1 * dz1
- lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i,k+1)
- > - tmp1 * njac(1,2,i,k+1)
- lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i,k+1)
- > - tmp1 * njac(1,3,i,k+1)
- lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i,k+1)
- > - tmp1 * njac(1,4,i,k+1)
- lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i,k+1)
- > - tmp1 * njac(1,5,i,k+1)
-
- lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i,k+1)
- > - tmp1 * njac(2,1,i,k+1)
- lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i,k+1)
- > - tmp1 * njac(2,2,i,k+1)
- > - tmp1 * dz2
- lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i,k+1)
- > - tmp1 * njac(2,3,i,k+1)
- lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i,k+1)
- > - tmp1 * njac(2,4,i,k+1)
- lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i,k+1)
- > - tmp1 * njac(2,5,i,k+1)
-
- lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i,k+1)
- > - tmp1 * njac(3,1,i,k+1)
- lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i,k+1)
- > - tmp1 * njac(3,2,i,k+1)
- lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i,k+1)
- > - tmp1 * njac(3,3,i,k+1)
- > - tmp1 * dz3
- lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i,k+1)
- > - tmp1 * njac(3,4,i,k+1)
- lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i,k+1)
- > - tmp1 * njac(3,5,i,k+1)
-
- lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i,k+1)
- > - tmp1 * njac(4,1,i,k+1)
- lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i,k+1)
- > - tmp1 * njac(4,2,i,k+1)
- lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i,k+1)
- > - tmp1 * njac(4,3,i,k+1)
- lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i,k+1)
- > - tmp1 * njac(4,4,i,k+1)
- > - tmp1 * dz4
- lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i,k+1)
- > - tmp1 * njac(4,5,i,k+1)
-
- lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i,k+1)
- > - tmp1 * njac(5,1,i,k+1)
- lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i,k+1)
- > - tmp1 * njac(5,2,i,k+1)
- lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i,k+1)
- > - tmp1 * njac(5,3,i,k+1)
- lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i,k+1)
- > - tmp1 * njac(5,4,i,k+1)
- lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i,k+1)
- > - tmp1 * njac(5,5,i,k+1)
- > - tmp1 * dz5
-
- enddo
- enddo
-
-
-c---------------------------------------------------------------------
-c outer most do loops - sweeping in i direction
-c---------------------------------------------------------------------
- if (first .eq. 1) then
-
-c---------------------------------------------------------------------
-c multiply c(i,j,kstart) by b_inverse and copy back to c
-c multiply rhs(kstart) by b_inverse(kstart) and copy to rhs
-c---------------------------------------------------------------------
-!dir$ ivdep
- do i=start(1,c),isize
- call binvcrhs( lhsb(1,1,i,kstart),
- > lhsc(1,1,i,j,kstart,c),
- > rhs(1,i,j,kstart,c) )
- enddo
-
- endif
-
-c---------------------------------------------------------------------
-c begin inner most do loop
-c do all the elements of the cell unless last
-c---------------------------------------------------------------------
- do k=kstart+first,ksize-last
-!dir$ ivdep
- do i=start(1,c),isize
-
-c---------------------------------------------------------------------
-c subtract A*lhs_vector(k-1) from lhs_vector(k)
-c
-c rhs(k) = rhs(k) - A*rhs(k-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,i,k),
- > rhs(1,i,j,k-1,c),rhs(1,i,j,k,c))
-
-c---------------------------------------------------------------------
-c B(k) = B(k) - C(k-1)*A(k)
-c call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,i,k),
- > lhsc(1,1,i,j,k-1,c),
- > lhsb(1,1,i,k))
-
-c---------------------------------------------------------------------
-c multiply c(i,j,k) by b_inverse and copy back to c
-c multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs
-c---------------------------------------------------------------------
- call binvcrhs( lhsb(1,1,i,k),
- > lhsc(1,1,i,j,k,c),
- > rhs(1,i,j,k,c) )
-
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c Now finish up special cases for last cell
-c---------------------------------------------------------------------
- if (last .eq. 1) then
-
-!dir$ ivdep
- do i=start(1,c),isize
-c---------------------------------------------------------------------
-c rhs(ksize) = rhs(ksize) - A*rhs(ksize-1)
-c---------------------------------------------------------------------
- call matvec_sub(lhsa(1,1,i,ksize),
- > rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c))
-
-c---------------------------------------------------------------------
-c B(ksize) = B(ksize) - C(ksize-1)*A(ksize)
-c call matmul_sub(aa,i,j,ksize,c,
-c $ cc,i,j,ksize-1,c,bb,i,j,ksize,c)
-c---------------------------------------------------------------------
- call matmul_sub(lhsa(1,1,i,ksize),
- > lhsc(1,1,i,j,ksize-1,c),
- > lhsb(1,1,i,ksize))
-
-c---------------------------------------------------------------------
-c multiply rhs(ksize) by b_inverse(ksize) and copy to rhs
-c---------------------------------------------------------------------
- call binvrhs( lhsb(1,1,i,ksize),
- > rhs(1,i,j,ksize,c) )
- enddo
-
- endif
- enddo
-
-
- return
- end
-
-
-
-
-
-
+++ /dev/null
-SHELL=/bin/sh
-BENCHMARK=cg
-BENCHMARKU=CG
-
-include ../config/make.def
-
-OBJS = cg.o ${COMMON}/print_results.o \
- ${COMMON}/${RAND}.o ${COMMON}/timers.o
-
-include ../sys/make.common
-
-${PROGRAM}: config ${OBJS}
- ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
-
-cg.o: cg.f mpinpb.h npbparams.h
- ${FCOMPILE} cg.f
-
-clean:
- - rm -f *.o *~
- - rm -f npbparams.h core
-
-
-
+++ /dev/null
-!-------------------------------------------------------------------------!
-! !
-! N A S P A R A L L E L B E N C H M A R K S 3.3 !
-! !
-! C G !
-! !
-!-------------------------------------------------------------------------!
-! !
-! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
-! It is described in NAS Technical Reports 95-020 and 02-007 !
-! !
-! Permission to use, copy, distribute and modify this software !
-! for any purpose with or without fee is hereby granted. We !
-! request, however, that all derived work reference the NAS !
-! Parallel Benchmarks 3.3. This software is provided "as is" !
-! without express or implied warranty. !
-! !
-! Information on NPB 3.3, including the technical report, the !
-! original specifications, source code, results and information !
-! on how to submit new results, is available at: !
-! !
-! http://www.nas.nasa.gov/Software/NPB/ !
-! !
-! Send comments or suggestions to npb@nas.nasa.gov !
-! !
-! NAS Parallel Benchmarks Group !
-! NASA Ames Research Center !
-! Mail Stop: T27A-1 !
-! Moffett Field, CA 94035-1000 !
-! !
-! E-mail: npb@nas.nasa.gov !
-! Fax: (650) 604-3957 !
-! !
-!-------------------------------------------------------------------------!
-
-
-c---------------------------------------------------------------------
-c
-c Authors: M. Yarrow
-c C. Kuszmaul
-c R. F. Van der Wijngaart
-c H. Jin
-c
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- program cg
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
- implicit none
-
- include 'mpinpb.h'
- integer status(MPI_STATUS_SIZE), request, ierr
-
- include 'npbparams.h'
-
-c---------------------------------------------------------------------
-c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows.
-c num_proc_cols and num_proc_cols are to be found in npbparams.h.
-c When num_procs is not square, then num_proc_cols must be = 2*num_proc_rows.
-c---------------------------------------------------------------------
- integer num_procs
- parameter( num_procs = num_proc_cols * num_proc_rows )
-
-
-
-c---------------------------------------------------------------------
-c Class specific parameters:
-c It appears here for reference only.
-c These are their values, however, this info is imported in the npbparams.h
-c include file, which is written by the sys/setparams.c program.
-c---------------------------------------------------------------------
-
-C----------
-C Class S:
-C----------
-CC parameter( na=1400,
-CC > nonzer=7,
-CC > shift=10.,
-CC > niter=15,
-CC > rcond=1.0d-1 )
-C----------
-C Class W:
-C----------
-CC parameter( na=7000,
-CC > nonzer=8,
-CC > shift=12.,
-CC > niter=15,
-CC > rcond=1.0d-1 )
-C----------
-C Class A:
-C----------
-CC parameter( na=14000,
-CC > nonzer=11,
-CC > shift=20.,
-CC > niter=15,
-CC > rcond=1.0d-1 )
-C----------
-C Class B:
-C----------
-CC parameter( na=75000,
-CC > nonzer=13,
-CC > shift=60.,
-CC > niter=75,
-CC > rcond=1.0d-1 )
-C----------
-C Class C:
-C----------
-CC parameter( na=150000,
-CC > nonzer=15,
-CC > shift=110.,
-CC > niter=75,
-CC > rcond=1.0d-1 )
-C----------
-C Class D:
-C----------
-CC parameter( na=1500000,
-CC > nonzer=21,
-CC > shift=500.,
-CC > niter=100,
-CC > rcond=1.0d-1 )
-C----------
-C Class E:
-C----------
-CC parameter( na=9000000,
-CC > nonzer=26,
-CC > shift=1500.,
-CC > niter=100,
-CC > rcond=1.0d-1 )
-
-
-
- integer nz
- parameter( nz = na*(nonzer+1)/num_procs*(nonzer+1)+nonzer
- > + na*(nonzer+2+num_procs/256)/num_proc_cols )
-
-
-
- common / partit_size / naa, nzz,
- > npcols, nprows,
- > proc_col, proc_row,
- > firstrow,
- > lastrow,
- > firstcol,
- > lastcol,
- > exch_proc,
- > exch_recv_length,
- > send_start,
- > send_len
- integer naa, nzz,
- > npcols, nprows,
- > proc_col, proc_row,
- > firstrow,
- > lastrow,
- > firstcol,
- > lastcol,
- > exch_proc,
- > exch_recv_length,
- > send_start,
- > send_len
-
-
- common / main_int_mem / colidx, rowstr,
- > iv, arow, acol
- integer colidx(nz), rowstr(na+1),
- > iv(2*na+1), arow(nz), acol(nz)
-
-
- common / main_flt_mem / v, aelt, a,
- > x,
- > z,
- > p,
- > q,
- > r,
- > w
- double precision v(na+1), aelt(nz), a(nz),
- > x(na/num_proc_rows+2),
- > z(na/num_proc_rows+2),
- > p(na/num_proc_rows+2),
- > q(na/num_proc_rows+2),
- > r(na/num_proc_rows+2),
- > w(na/num_proc_rows+2)
-
-
- common /urando/ amult, tran
- double precision amult, tran
-
-
-
- integer l2npcols
- integer reduce_exch_proc(num_proc_cols)
- integer reduce_send_starts(num_proc_cols)
- integer reduce_send_lengths(num_proc_cols)
- integer reduce_recv_starts(num_proc_cols)
- integer reduce_recv_lengths(num_proc_cols)
-
- integer i, j, k, it
-
- double precision zeta, randlc
- external randlc
- double precision rnorm
- double precision norm_temp1(2), norm_temp2(2)
-
- double precision t, tmax, mflops
- external timer_read
- double precision timer_read
- character class
- logical verified
- double precision zeta_verify_value, epsilon, err
-
-
-c---------------------------------------------------------------------
-c Set up mpi initialization and number of proc testing
-c---------------------------------------------------------------------
- call initialize_mpi
-
-
- if( na .eq. 1400 .and.
- & nonzer .eq. 7 .and.
- & niter .eq. 15 .and.
- & shift .eq. 10.d0 ) then
- class = 'S'
- zeta_verify_value = 8.5971775078648d0
- else if( na .eq. 7000 .and.
- & nonzer .eq. 8 .and.
- & niter .eq. 15 .and.
- & shift .eq. 12.d0 ) then
- class = 'W'
- zeta_verify_value = 10.362595087124d0
- else if( na .eq. 14000 .and.
- & nonzer .eq. 11 .and.
- & niter .eq. 15 .and.
- & shift .eq. 20.d0 ) then
- class = 'A'
- zeta_verify_value = 17.130235054029d0
- else if( na .eq. 75000 .and.
- & nonzer .eq. 13 .and.
- & niter .eq. 75 .and.
- & shift .eq. 60.d0 ) then
- class = 'B'
- zeta_verify_value = 22.712745482631d0
- else if( na .eq. 150000 .and.
- & nonzer .eq. 15 .and.
- & niter .eq. 75 .and.
- & shift .eq. 110.d0 ) then
- class = 'C'
- zeta_verify_value = 28.973605592845d0
- else if( na .eq. 1500000 .and.
- & nonzer .eq. 21 .and.
- & niter .eq. 100 .and.
- & shift .eq. 500.d0 ) then
- class = 'D'
- zeta_verify_value = 52.514532105794d0
- else if( na .eq. 9000000 .and.
- & nonzer .eq. 26 .and.
- & niter .eq. 100 .and.
- & shift .eq. 1.5d3 ) then
- class = 'E'
- zeta_verify_value = 77.522164599383d0
- else
- class = 'U'
- endif
-
- if( me .eq. root )then
- write( *,1000 )
- write( *,1001 ) na
- write( *,1002 ) niter
- write( *,1003 ) nprocs
- write( *,1004 ) nonzer
- write( *,1005 ) shift
- 1000 format(//,' NAS Parallel Benchmarks 3.3 -- CG Benchmark', /)
- 1001 format(' Size: ', i10 )
- 1002 format(' Iterations: ', i5 )
- 1003 format(' Number of active processes: ', i5 )
- 1004 format(' Number of nonzeroes per row: ', i8)
- 1005 format(' Eigenvalue shift: ', e8.3)
- endif
-
- if (.not. convertdouble) then
- dp_type = MPI_DOUBLE_PRECISION
- else
- dp_type = MPI_REAL
- endif
-
-
- naa = na
- nzz = nz
-
-
-c---------------------------------------------------------------------
-c Set up processor info, such as whether sq num of procs, etc
-c---------------------------------------------------------------------
- call setup_proc_info( num_procs,
- > num_proc_rows,
- > num_proc_cols )
-
-
-c---------------------------------------------------------------------
-c Set up partition's submatrix info: firstcol, lastcol, firstrow, lastrow
-c---------------------------------------------------------------------
- call setup_submatrix_info( l2npcols,
- > reduce_exch_proc,
- > reduce_send_starts,
- > reduce_send_lengths,
- > reduce_recv_starts,
- > reduce_recv_lengths )
-
-
-
-c---------------------------------------------------------------------
-c Inialize random number generator
-c---------------------------------------------------------------------
- tran = 314159265.0D0
- amult = 1220703125.0D0
- zeta = randlc( tran, amult )
-
-c---------------------------------------------------------------------
-c Set up partition's sparse random matrix for given class size
-c---------------------------------------------------------------------
- call makea(naa, nzz, a, colidx, rowstr, nonzer,
- > firstrow, lastrow, firstcol, lastcol,
- > rcond, arow, acol, aelt, v, iv, shift)
-
-
-
-c---------------------------------------------------------------------
-c Note: as a result of the above call to makea:
-c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1
-c values of colidx which are col indexes go from firstcol --> lastcol
-c So:
-c Shift the col index vals from actual (firstcol --> lastcol )
-c to local, i.e., (1 --> lastcol-firstcol+1)
-c---------------------------------------------------------------------
- do j=1,lastrow-firstrow+1
- do k=rowstr(j),rowstr(j+1)-1
- colidx(k) = colidx(k) - firstcol + 1
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c set starting vector to (1, 1, .... 1)
-c---------------------------------------------------------------------
- do i = 1, na/num_proc_rows+1
- x(i) = 1.0D0
- enddo
-
- zeta = 0.0d0
-
-c---------------------------------------------------------------------
-c---->
-c Do one iteration untimed to init all code and data page tables
-c----> (then reinit, start timing, to niter its)
-c---------------------------------------------------------------------
- do it = 1, 1
-
-c---------------------------------------------------------------------
-c The call to the conjugate gradient routine:
-c---------------------------------------------------------------------
- call conj_grad ( colidx,
- > rowstr,
- > x,
- > z,
- > a,
- > p,
- > q,
- > r,
- > w,
- > rnorm,
- > l2npcols,
- > reduce_exch_proc,
- > reduce_send_starts,
- > reduce_send_lengths,
- > reduce_recv_starts,
- > reduce_recv_lengths )
-
-c---------------------------------------------------------------------
-c zeta = shift + 1/(x.z)
-c So, first: (x.z)
-c Also, find norm of z
-c So, first: (z.z)
-c---------------------------------------------------------------------
- norm_temp1(1) = 0.0d0
- norm_temp1(2) = 0.0d0
- do j=1, lastcol-firstcol+1
- norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
- norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
- enddo
-
- do i = 1, l2npcols
- call mpi_irecv( norm_temp2,
- > 2,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > request,
- > ierr )
- call mpi_send( norm_temp1,
- > 2,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > ierr )
- call mpi_wait( request, status, ierr )
-
- norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
- norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
- enddo
-
- norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
-
-
-c---------------------------------------------------------------------
-c Normalize z to obtain x
-c---------------------------------------------------------------------
- do j=1, lastcol-firstcol+1
- x(j) = norm_temp1(2)*z(j)
- enddo
-
-
- enddo ! end of do one iteration untimed
-
-
-c---------------------------------------------------------------------
-c set starting vector to (1, 1, .... 1)
-c---------------------------------------------------------------------
-c
-c NOTE: a questionable limit on size: should this be na/num_proc_cols+1 ?
-c
- do i = 1, na/num_proc_rows+1
- x(i) = 1.0D0
- enddo
-
- zeta = 0.0d0
-
-c---------------------------------------------------------------------
-c Synchronize and start timing
-c---------------------------------------------------------------------
- call mpi_barrier( mpi_comm_world,
- > ierr )
-
- call timer_clear( 1 )
- call timer_start( 1 )
-
-c---------------------------------------------------------------------
-c---->
-c Main Iteration for inverse power method
-c---->
-c---------------------------------------------------------------------
- do it = 1, niter
-
-c---------------------------------------------------------------------
-c The call to the conjugate gradient routine:
-c---------------------------------------------------------------------
- call conj_grad ( colidx,
- > rowstr,
- > x,
- > z,
- > a,
- > p,
- > q,
- > r,
- > w,
- > rnorm,
- > l2npcols,
- > reduce_exch_proc,
- > reduce_send_starts,
- > reduce_send_lengths,
- > reduce_recv_starts,
- > reduce_recv_lengths )
-
-
-c---------------------------------------------------------------------
-c zeta = shift + 1/(x.z)
-c So, first: (x.z)
-c Also, find norm of z
-c So, first: (z.z)
-c---------------------------------------------------------------------
- norm_temp1(1) = 0.0d0
- norm_temp1(2) = 0.0d0
- do j=1, lastcol-firstcol+1
- norm_temp1(1) = norm_temp1(1) + x(j)*z(j)
- norm_temp1(2) = norm_temp1(2) + z(j)*z(j)
- enddo
-
- do i = 1, l2npcols
- call mpi_irecv( norm_temp2,
- > 2,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > request,
- > ierr )
- call mpi_send( norm_temp1,
- > 2,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > ierr )
- call mpi_wait( request, status, ierr )
-
- norm_temp1(1) = norm_temp1(1) + norm_temp2(1)
- norm_temp1(2) = norm_temp1(2) + norm_temp2(2)
- enddo
-
- norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) )
-
-
- if( me .eq. root )then
- zeta = shift + 1.0d0 / norm_temp1(1)
- if( it .eq. 1 ) write( *,9000 )
- write( *,9001 ) it, rnorm, zeta
- endif
- 9000 format( /,' iteration ||r|| zeta' )
- 9001 format( 4x, i5, 7x, e20.14, f20.13 )
-
-c---------------------------------------------------------------------
-c Normalize z to obtain x
-c---------------------------------------------------------------------
- do j=1, lastcol-firstcol+1
- x(j) = norm_temp1(2)*z(j)
- enddo
-
-
- enddo ! end of main iter inv pow meth
-
- call timer_stop( 1 )
-
-c---------------------------------------------------------------------
-c End of timed section
-c---------------------------------------------------------------------
-
- t = timer_read( 1 )
-
- call mpi_reduce( t,
- > tmax,
- > 1,
- > dp_type,
- > MPI_MAX,
- > root,
- > mpi_comm_world,
- > ierr )
-
- if( me .eq. root )then
- write(*,100)
- 100 format(' Benchmark completed ')
-
- epsilon = 1.d-10
- if (class .ne. 'U') then
-
- err = abs( zeta - zeta_verify_value )/zeta_verify_value
- if( err .le. epsilon ) then
- verified = .TRUE.
- write(*, 200)
- write(*, 201) zeta
- write(*, 202) err
- 200 format(' VERIFICATION SUCCESSFUL ')
- 201 format(' Zeta is ', E20.13)
- 202 format(' Error is ', E20.13)
- else
- verified = .FALSE.
- write(*, 300)
- write(*, 301) zeta
- write(*, 302) zeta_verify_value
- 300 format(' VERIFICATION FAILED')
- 301 format(' Zeta ', E20.13)
- 302 format(' The correct zeta is ', E20.13)
- endif
- else
- verified = .FALSE.
- write (*, 400)
- write (*, 401)
- write (*, 201) zeta
- 400 format(' Problem size unknown')
- 401 format(' NO VERIFICATION PERFORMED')
- endif
-
-
- if( tmax .ne. 0. ) then
- mflops = float( 2*niter*na )
- & * ( 3.+float( nonzer*(nonzer+1) )
- & + 25.*(5.+float( nonzer*(nonzer+1) ))
- & + 3. ) / tmax / 1000000.0
- else
- mflops = 0.0
- endif
-
- call print_results('CG', class, na, 0, 0,
- > niter, nnodes_compiled, nprocs, tmax,
- > mflops, ' floating point',
- > verified, npbversion, compiletime,
- > cs1, cs2, cs3, cs4, cs5, cs6, cs7)
-
-
- endif
-
-
- call mpi_finalize(ierr)
-
-
-
- end ! end main
-
-
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- subroutine initialize_mpi
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
-
- integer ierr
-
-
- call mpi_init( ierr )
- call mpi_comm_rank( mpi_comm_world, me, ierr )
- call mpi_comm_size( mpi_comm_world, nprocs, ierr )
- root = 0
-
-
- return
- end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- subroutine setup_proc_info( num_procs,
- > num_proc_rows,
- > num_proc_cols )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
-
- common / partit_size / naa, nzz,
- > npcols, nprows,
- > proc_col, proc_row,
- > firstrow,
- > lastrow,
- > firstcol,
- > lastcol,
- > exch_proc,
- > exch_recv_length,
- > send_start,
- > send_len
- integer naa, nzz,
- > npcols, nprows,
- > proc_col, proc_row,
- > firstrow,
- > lastrow,
- > firstcol,
- > lastcol,
- > exch_proc,
- > exch_recv_length,
- > send_start,
- > send_len
-
- integer num_procs, num_proc_cols, num_proc_rows
- integer i, ierr
- integer log2nprocs
-
-c---------------------------------------------------------------------
-c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows
-c When num_procs is not square, then num_proc_cols = 2*num_proc_rows
-c---------------------------------------------------------------------
-c First, number of procs must be power of two.
-c---------------------------------------------------------------------
- if( nprocs .ne. num_procs )then
- if( me .eq. root ) write( *,9000 ) nprocs, num_procs
- 9000 format( /,'Error: ',/,'num of procs allocated (',
- > i4, ' )',
- > /,'is not equal to',/,
- > 'compiled number of procs (',
- > i4, ' )',/ )
- call mpi_finalize(ierr)
- stop
- endif
-
-
- i = num_proc_cols
- 100 continue
- if( i .ne. 1 .and. i/2*2 .ne. i )then
- if ( me .eq. root ) then
- write( *,* ) 'Error: num_proc_cols is ',
- > num_proc_cols,
- > ' which is not a power of two'
- endif
- call mpi_finalize(ierr)
- stop
- endif
- i = i / 2
- if( i .ne. 0 )then
- goto 100
- endif
-
- i = num_proc_rows
- 200 continue
- if( i .ne. 1 .and. i/2*2 .ne. i )then
- if ( me .eq. root ) then
- write( *,* ) 'Error: num_proc_rows is ',
- > num_proc_rows,
- > ' which is not a power of two'
- endif
- call mpi_finalize(ierr)
- stop
- endif
- i = i / 2
- if( i .ne. 0 )then
- goto 200
- endif
-
- log2nprocs = 0
- i = nprocs
- 300 continue
- if( i .ne. 1 .and. i/2*2 .ne. i )then
- write( *,* ) 'Error: nprocs is ',
- > nprocs,
- > ' which is not a power of two'
- call mpi_finalize(ierr)
- stop
- endif
- i = i / 2
- if( i .ne. 0 )then
- log2nprocs = log2nprocs + 1
- goto 300
- endif
-
-CC write( *,* ) 'nprocs, log2nprocs: ',nprocs,log2nprocs
-
-
- npcols = num_proc_cols
- nprows = num_proc_rows
-
-
- return
- end
-
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- subroutine setup_submatrix_info( l2npcols,
- > reduce_exch_proc,
- > reduce_send_starts,
- > reduce_send_lengths,
- > reduce_recv_starts,
- > reduce_recv_lengths )
- >
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
-
- integer col_size, row_size
-
- common / partit_size / naa, nzz,
- > npcols, nprows,
- > proc_col, proc_row,
- > firstrow,
- > lastrow,
- > firstcol,
- > lastcol,
- > exch_proc,
- > exch_recv_length,
- > send_start,
- > send_len
- integer naa, nzz,
- > npcols, nprows,
- > proc_col, proc_row,
- > firstrow,
- > lastrow,
- > firstcol,
- > lastcol,
- > exch_proc,
- > exch_recv_length,
- > send_start,
- > send_len
-
- integer reduce_exch_proc(*)
- integer reduce_send_starts(*)
- integer reduce_send_lengths(*)
- integer reduce_recv_starts(*)
- integer reduce_recv_lengths(*)
-
- integer i, j
- integer div_factor
- integer l2npcols
-
-
- proc_row = me / npcols
- proc_col = me - proc_row*npcols
-
-
-
-c---------------------------------------------------------------------
-c If naa evenly divisible by npcols, then it is evenly divisible
-c by nprows
-c---------------------------------------------------------------------
-
- if( naa/npcols*npcols .eq. naa )then
- col_size = naa/npcols
- firstcol = proc_col*col_size + 1
- lastcol = firstcol - 1 + col_size
- row_size = naa/nprows
- firstrow = proc_row*row_size + 1
- lastrow = firstrow - 1 + row_size
-c---------------------------------------------------------------------
-c If naa not evenly divisible by npcols, then first subdivide for nprows
-c and then, if npcols not equal to nprows (i.e., not a sq number of procs),
-c get col subdivisions by dividing by 2 each row subdivision.
-c---------------------------------------------------------------------
- else
- if( proc_row .lt. naa - naa/nprows*nprows)then
- row_size = naa/nprows+ 1
- firstrow = proc_row*row_size + 1
- lastrow = firstrow - 1 + row_size
- else
- row_size = naa/nprows
- firstrow = (naa - naa/nprows*nprows)*(row_size+1)
- > + (proc_row-(naa-naa/nprows*nprows))
- > *row_size + 1
- lastrow = firstrow - 1 + row_size
- endif
- if( npcols .eq. nprows )then
- if( proc_col .lt. naa - naa/npcols*npcols )then
- col_size = naa/npcols+ 1
- firstcol = proc_col*col_size + 1
- lastcol = firstcol - 1 + col_size
- else
- col_size = naa/npcols
- firstcol = (naa - naa/npcols*npcols)*(col_size+1)
- > + (proc_col-(naa-naa/npcols*npcols))
- > *col_size + 1
- lastcol = firstcol - 1 + col_size
- endif
- else
- if( (proc_col/2) .lt.
- > naa - naa/(npcols/2)*(npcols/2) )then
- col_size = naa/(npcols/2) + 1
- firstcol = (proc_col/2)*col_size + 1
- lastcol = firstcol - 1 + col_size
- else
- col_size = naa/(npcols/2)
- firstcol = (naa - naa/(npcols/2)*(npcols/2))
- > *(col_size+1)
- > + ((proc_col/2)-(naa-naa/(npcols/2)*(npcols/2)))
- > *col_size + 1
- lastcol = firstcol - 1 + col_size
- endif
-CC write( *,* ) col_size,firstcol,lastcol
- if( mod( me,2 ) .eq. 0 )then
- lastcol = firstcol - 1 + (col_size-1)/2 + 1
- else
- firstcol = firstcol + (col_size-1)/2 + 1
- lastcol = firstcol - 1 + col_size/2
-CC write( *,* ) firstcol,lastcol
- endif
- endif
- endif
-
-
-
- if( npcols .eq. nprows )then
- send_start = 1
- send_len = lastrow - firstrow + 1
- else
- if( mod( me,2 ) .eq. 0 )then
- send_start = 1
- send_len = (1 + lastrow-firstrow+1)/2
- else
- send_start = (1 + lastrow-firstrow+1)/2 + 1
- send_len = (lastrow-firstrow+1)/2
- endif
- endif
-
-
-
-
-c---------------------------------------------------------------------
-c Transpose exchange processor
-c---------------------------------------------------------------------
-
- if( npcols .eq. nprows )then
- exch_proc = mod( me,nprows )*nprows + me/nprows
- else
- exch_proc = 2*(mod( me/2,nprows )*nprows + me/2/nprows)
- > + mod( me,2 )
- endif
-
-
-
- i = npcols / 2
- l2npcols = 0
- do while( i .gt. 0 )
- l2npcols = l2npcols + 1
- i = i / 2
- enddo
-
-
-c---------------------------------------------------------------------
-c Set up the reduce phase schedules...
-c---------------------------------------------------------------------
-
- div_factor = npcols
- do i = 1, l2npcols
-
- j = mod( proc_col+div_factor/2, div_factor )
- > + proc_col / div_factor * div_factor
- reduce_exch_proc(i) = proc_row*npcols + j
-
- div_factor = div_factor / 2
-
- enddo
-
-
- do i = l2npcols, 1, -1
-
- if( nprows .eq. npcols )then
- reduce_send_starts(i) = send_start
- reduce_send_lengths(i) = send_len
- reduce_recv_lengths(i) = lastrow - firstrow + 1
- else
- reduce_recv_lengths(i) = send_len
- if( i .eq. l2npcols )then
- reduce_send_lengths(i) = lastrow-firstrow+1 - send_len
- if( me/2*2 .eq. me )then
- reduce_send_starts(i) = send_start + send_len
- else
- reduce_send_starts(i) = 1
- endif
- else
- reduce_send_lengths(i) = send_len
- reduce_send_starts(i) = send_start
- endif
- endif
- reduce_recv_starts(i) = send_start
-
- enddo
-
-
- exch_recv_length = lastcol - firstcol + 1
-
-
- return
- end
-
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- subroutine conj_grad ( colidx,
- > rowstr,
- > x,
- > z,
- > a,
- > p,
- > q,
- > r,
- > w,
- > rnorm,
- > l2npcols,
- > reduce_exch_proc,
- > reduce_send_starts,
- > reduce_send_lengths,
- > reduce_recv_starts,
- > reduce_recv_lengths )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Floaging point arrays here are named as in NPB1 spec discussion of
-c CG algorithm
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
-
- integer status(MPI_STATUS_SIZE ), request
-
-
- common / partit_size / naa, nzz,
- > npcols, nprows,
- > proc_col, proc_row,
- > firstrow,
- > lastrow,
- > firstcol,
- > lastcol,
- > exch_proc,
- > exch_recv_length,
- > send_start,
- > send_len
- integer naa, nzz,
- > npcols, nprows,
- > proc_col, proc_row,
- > firstrow,
- > lastrow,
- > firstcol,
- > lastcol,
- > exch_proc,
- > exch_recv_length,
- > send_start,
- > send_len
-
-
-
- double precision x(*),
- > z(*),
- > a(nzz)
- integer colidx(nzz), rowstr(naa+1)
-
- double precision p(*),
- > q(*),
- > r(*),
- > w(*) ! used as work temporary
-
- integer l2npcols
- integer reduce_exch_proc(l2npcols)
- integer reduce_send_starts(l2npcols)
- integer reduce_send_lengths(l2npcols)
- integer reduce_recv_starts(l2npcols)
- integer reduce_recv_lengths(l2npcols)
-
- integer i, j, k, ierr
- integer cgit, cgitmax
-
- double precision d, sum, rho, rho0, alpha, beta, rnorm
-
- external timer_read
- double precision timer_read
-
- data cgitmax / 25 /
-
-
-c---------------------------------------------------------------------
-c Initialize the CG algorithm:
-c---------------------------------------------------------------------
- do j=1,naa/nprows+1
- q(j) = 0.0d0
- z(j) = 0.0d0
- r(j) = x(j)
- p(j) = r(j)
- w(j) = 0.0d0
- enddo
-
-
-c---------------------------------------------------------------------
-c rho = r.r
-c Now, obtain the norm of r: First, sum squares of r elements locally...
-c---------------------------------------------------------------------
- sum = 0.0d0
- do j=1, lastcol-firstcol+1
- sum = sum + r(j)*r(j)
- enddo
-
-c---------------------------------------------------------------------
-c Exchange and sum with procs identified in reduce_exch_proc
-c (This is equivalent to mpi_allreduce.)
-c Sum the partial sums of rho, leaving rho on all processors
-c---------------------------------------------------------------------
- do i = 1, l2npcols
- call mpi_irecv( rho,
- > 1,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > request,
- > ierr )
- call mpi_send( sum,
- > 1,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > ierr )
- call mpi_wait( request, status, ierr )
-
- sum = sum + rho
- enddo
- rho = sum
-
-
-
-c---------------------------------------------------------------------
-c---->
-c The conj grad iteration loop
-c---->
-c---------------------------------------------------------------------
- do cgit = 1, cgitmax
-
-
-c---------------------------------------------------------------------
-c q = A.p
-c The partition submatrix-vector multiply: use workspace w
-c---------------------------------------------------------------------
- do j=1,lastrow-firstrow+1
- sum = 0.d0
- do k=rowstr(j),rowstr(j+1)-1
- sum = sum + a(k)*p(colidx(k))
- enddo
- w(j) = sum
- enddo
-
-c---------------------------------------------------------------------
-c Sum the partition submatrix-vec A.p's across rows
-c Exchange and sum piece of w with procs identified in reduce_exch_proc
-c---------------------------------------------------------------------
- do i = l2npcols, 1, -1
- call mpi_irecv( q(reduce_recv_starts(i)),
- > reduce_recv_lengths(i),
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > request,
- > ierr )
- call mpi_send( w(reduce_send_starts(i)),
- > reduce_send_lengths(i),
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > ierr )
- call mpi_wait( request, status, ierr )
- do j=send_start,send_start + reduce_recv_lengths(i) - 1
- w(j) = w(j) + q(j)
- enddo
- enddo
-
-
-c---------------------------------------------------------------------
-c Exchange piece of q with transpose processor:
-c---------------------------------------------------------------------
- if( l2npcols .ne. 0 )then
- call mpi_irecv( q,
- > exch_recv_length,
- > dp_type,
- > exch_proc,
- > 1,
- > mpi_comm_world,
- > request,
- > ierr )
-
- call mpi_send( w(send_start),
- > send_len,
- > dp_type,
- > exch_proc,
- > 1,
- > mpi_comm_world,
- > ierr )
- call mpi_wait( request, status, ierr )
- else
- do j=1,exch_recv_length
- q(j) = w(j)
- enddo
- endif
-
-
-c---------------------------------------------------------------------
-c Clear w for reuse...
-c---------------------------------------------------------------------
- do j=1, max( lastrow-firstrow+1, lastcol-firstcol+1 )
- w(j) = 0.0d0
- enddo
-
-
-c---------------------------------------------------------------------
-c Obtain p.q
-c---------------------------------------------------------------------
- sum = 0.0d0
- do j=1, lastcol-firstcol+1
- sum = sum + p(j)*q(j)
- enddo
-
-c---------------------------------------------------------------------
-c Obtain d with a sum-reduce
-c---------------------------------------------------------------------
- do i = 1, l2npcols
- call mpi_irecv( d,
- > 1,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > request,
- > ierr )
- call mpi_send( sum,
- > 1,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > ierr )
-
- call mpi_wait( request, status, ierr )
-
- sum = sum + d
- enddo
- d = sum
-
-
-c---------------------------------------------------------------------
-c Obtain alpha = rho / (p.q)
-c---------------------------------------------------------------------
- alpha = rho / d
-
-c---------------------------------------------------------------------
-c Save a temporary of rho
-c---------------------------------------------------------------------
- rho0 = rho
-
-c---------------------------------------------------------------------
-c Obtain z = z + alpha*p
-c and r = r - alpha*q
-c---------------------------------------------------------------------
- do j=1, lastcol-firstcol+1
- z(j) = z(j) + alpha*p(j)
- r(j) = r(j) - alpha*q(j)
- enddo
-
-c---------------------------------------------------------------------
-c rho = r.r
-c Now, obtain the norm of r: First, sum squares of r elements locally...
-c---------------------------------------------------------------------
- sum = 0.0d0
- do j=1, lastcol-firstcol+1
- sum = sum + r(j)*r(j)
- enddo
-
-c---------------------------------------------------------------------
-c Obtain rho with a sum-reduce
-c---------------------------------------------------------------------
- do i = 1, l2npcols
- call mpi_irecv( rho,
- > 1,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > request,
- > ierr )
- call mpi_send( sum,
- > 1,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > ierr )
- call mpi_wait( request, status, ierr )
-
- sum = sum + rho
- enddo
- rho = sum
-
-c---------------------------------------------------------------------
-c Obtain beta:
-c---------------------------------------------------------------------
- beta = rho / rho0
-
-c---------------------------------------------------------------------
-c p = r + beta*p
-c---------------------------------------------------------------------
- do j=1, lastcol-firstcol+1
- p(j) = r(j) + beta*p(j)
- enddo
-
-
-
- enddo ! end of do cgit=1,cgitmax
-
-
-
-c---------------------------------------------------------------------
-c Compute residual norm explicitly: ||r|| = ||x - A.z||
-c First, form A.z
-c The partition submatrix-vector multiply
-c---------------------------------------------------------------------
- do j=1,lastrow-firstrow+1
- sum = 0.d0
- do k=rowstr(j),rowstr(j+1)-1
- sum = sum + a(k)*z(colidx(k))
- enddo
- w(j) = sum
- enddo
-
-
-
-c---------------------------------------------------------------------
-c Sum the partition submatrix-vec A.z's across rows
-c---------------------------------------------------------------------
- do i = l2npcols, 1, -1
- call mpi_irecv( r(reduce_recv_starts(i)),
- > reduce_recv_lengths(i),
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > request,
- > ierr )
- call mpi_send( w(reduce_send_starts(i)),
- > reduce_send_lengths(i),
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > ierr )
- call mpi_wait( request, status, ierr )
-
- do j=send_start,send_start + reduce_recv_lengths(i) - 1
- w(j) = w(j) + r(j)
- enddo
- enddo
-
-
-c---------------------------------------------------------------------
-c Exchange piece of q with transpose processor:
-c---------------------------------------------------------------------
- if( l2npcols .ne. 0 )then
- call mpi_irecv( r,
- > exch_recv_length,
- > dp_type,
- > exch_proc,
- > 1,
- > mpi_comm_world,
- > request,
- > ierr )
-
- call mpi_send( w(send_start),
- > send_len,
- > dp_type,
- > exch_proc,
- > 1,
- > mpi_comm_world,
- > ierr )
- call mpi_wait( request, status, ierr )
- else
- do j=1,exch_recv_length
- r(j) = w(j)
- enddo
- endif
-
-
-c---------------------------------------------------------------------
-c At this point, r contains A.z
-c---------------------------------------------------------------------
- sum = 0.0d0
- do j=1, lastcol-firstcol+1
- d = x(j) - r(j)
- sum = sum + d*d
- enddo
-
-c---------------------------------------------------------------------
-c Obtain d with a sum-reduce
-c---------------------------------------------------------------------
- do i = 1, l2npcols
- call mpi_irecv( d,
- > 1,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > request,
- > ierr )
- call mpi_send( sum,
- > 1,
- > dp_type,
- > reduce_exch_proc(i),
- > i,
- > mpi_comm_world,
- > ierr )
- call mpi_wait( request, status, ierr )
-
- sum = sum + d
- enddo
- d = sum
-
-
- if( me .eq. root ) rnorm = sqrt( d )
-
-
-
- return
- end ! end of routine conj_grad
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- subroutine makea( n, nz, a, colidx, rowstr, nonzer,
- > firstrow, lastrow, firstcol, lastcol,
- > rcond, arow, acol, aelt, v, iv, shift )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- integer n, nz
- integer firstrow, lastrow, firstcol, lastcol
- integer colidx(nz), rowstr(n+1)
- integer iv(2*n+1), arow(nz), acol(nz)
- double precision v(n+1), aelt(nz)
- double precision rcond, a(nz), shift
-
-c---------------------------------------------------------------------
-c generate the test problem for benchmark 6
-c makea generates a sparse matrix with a
-c prescribed sparsity distribution
-c
-c parameter type usage
-c
-c input
-c
-c n i number of cols/rows of matrix
-c nz i nonzeros as declared array size
-c rcond r*8 condition number
-c shift r*8 main diagonal shift
-c
-c output
-c
-c a r*8 array for nonzeros
-c colidx i col indices
-c rowstr i row pointers
-c
-c workspace
-c
-c iv, arow, acol i
-c v, aelt r*8
-c---------------------------------------------------------------------
-
- integer i, nnza, iouter, ivelt, ivelt1, irow, nzv, NONZER
-
-c---------------------------------------------------------------------
-c nonzer is approximately (int(sqrt(nnza /n)));
-c---------------------------------------------------------------------
-
- double precision size, ratio, scale
- external sparse, sprnvc, vecset
-
- size = 1.0D0
- ratio = rcond ** (1.0D0 / dfloat(n))
- nnza = 0
-
-c---------------------------------------------------------------------
-c Initialize iv(n+1 .. 2n) to zero.
-c Used by sprnvc to mark nonzero positions
-c---------------------------------------------------------------------
-
- do i = 1, n
- iv(n+i) = 0
- enddo
- do iouter = 1, n
- nzv = nonzer
- call sprnvc( n, nzv, v, colidx, iv(1), iv(n+1) )
- call vecset( n, v, colidx, nzv, iouter, .5D0 )
- do ivelt = 1, nzv
- jcol = colidx(ivelt)
- if (jcol.ge.firstcol .and. jcol.le.lastcol) then
- scale = size * v(ivelt)
- do ivelt1 = 1, nzv
- irow = colidx(ivelt1)
- if (irow.ge.firstrow .and. irow.le.lastrow) then
- nnza = nnza + 1
- if (nnza .gt. nz) goto 9999
- acol(nnza) = jcol
- arow(nnza) = irow
- aelt(nnza) = v(ivelt1) * scale
- endif
- enddo
- endif
- enddo
- size = size * ratio
- enddo
-
-
-c---------------------------------------------------------------------
-c ... add the identity * rcond to the generated matrix to bound
-c the smallest eigenvalue from below by rcond
-c---------------------------------------------------------------------
- do i = firstrow, lastrow
- if (i.ge.firstcol .and. i.le.lastcol) then
- iouter = n + i
- nnza = nnza + 1
- if (nnza .gt. nz) goto 9999
- acol(nnza) = i
- arow(nnza) = i
- aelt(nnza) = rcond - shift
- endif
- enddo
-
-
-c---------------------------------------------------------------------
-c ... make the sparse matrix from list of elements with duplicates
-c (v and iv are used as workspace)
-c---------------------------------------------------------------------
- call sparse( a, colidx, rowstr, n, arow, acol, aelt,
- > firstrow, lastrow,
- > v, iv(1), iv(n+1), nnza )
- return
-
- 9999 continue
- write(*,*) 'Space for matrix elements exceeded in makea'
- write(*,*) 'nnza, nzmax = ',nnza, nz
- write(*,*) ' iouter = ',iouter
-
- stop
- end
-c-------end of makea------------------------------
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- subroutine sparse( a, colidx, rowstr, n, arow, acol, aelt,
- > firstrow, lastrow,
- > x, mark, nzloc, nnza )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit logical (a-z)
- integer colidx(*), rowstr(*)
- integer firstrow, lastrow
- integer n, arow(*), acol(*), nnza
- double precision a(*), aelt(*)
-
-c---------------------------------------------------------------------
-c rows range from firstrow to lastrow
-c the rowstr pointers are defined for nrows = lastrow-firstrow+1 values
-c---------------------------------------------------------------------
- integer nzloc(n), nrows
- double precision x(n)
- logical mark(n)
-
-c---------------------------------------------------
-c generate a sparse matrix from a list of
-c [col, row, element] tri
-c---------------------------------------------------
-
- integer i, j, jajp1, nza, k, nzrow
- double precision xi
-
-c---------------------------------------------------------------------
-c how many rows of result
-c---------------------------------------------------------------------
- nrows = lastrow - firstrow + 1
-
-c---------------------------------------------------------------------
-c ...count the number of triples in each row
-c---------------------------------------------------------------------
- do j = 1, n
- rowstr(j) = 0
- mark(j) = .false.
- enddo
- rowstr(n+1) = 0
-
- do nza = 1, nnza
- j = (arow(nza) - firstrow + 1) + 1
- rowstr(j) = rowstr(j) + 1
- enddo
-
- rowstr(1) = 1
- do j = 2, nrows+1
- rowstr(j) = rowstr(j) + rowstr(j-1)
- enddo
-
-
-c---------------------------------------------------------------------
-c ... rowstr(j) now is the location of the first nonzero
-c of row j of a
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-c ... do a bucket sort of the triples on the row index
-c---------------------------------------------------------------------
- do nza = 1, nnza
- j = arow(nza) - firstrow + 1
- k = rowstr(j)
- a(k) = aelt(nza)
- colidx(k) = acol(nza)
- rowstr(j) = rowstr(j) + 1
- enddo
-
-
-c---------------------------------------------------------------------
-c ... rowstr(j) now points to the first element of row j+1
-c---------------------------------------------------------------------
- do j = nrows, 1, -1
- rowstr(j+1) = rowstr(j)
- enddo
- rowstr(1) = 1
-
-
-c---------------------------------------------------------------------
-c ... generate the actual output rows by adding elements
-c---------------------------------------------------------------------
- nza = 0
- do i = 1, n
- x(i) = 0.0
- mark(i) = .false.
- enddo
-
- jajp1 = rowstr(1)
- do j = 1, nrows
- nzrow = 0
-
-c---------------------------------------------------------------------
-c ...loop over the jth row of a
-c---------------------------------------------------------------------
- do k = jajp1 , rowstr(j+1)-1
- i = colidx(k)
- x(i) = x(i) + a(k)
- if ( (.not. mark(i)) .and. (x(i) .ne. 0.D0)) then
- mark(i) = .true.
- nzrow = nzrow + 1
- nzloc(nzrow) = i
- endif
- enddo
-
-c---------------------------------------------------------------------
-c ... extract the nonzeros of this row
-c---------------------------------------------------------------------
- do k = 1, nzrow
- i = nzloc(k)
- mark(i) = .false.
- xi = x(i)
- x(i) = 0.D0
- if (xi .ne. 0.D0) then
- nza = nza + 1
- a(nza) = xi
- colidx(nza) = i
- endif
- enddo
- jajp1 = rowstr(j+1)
- rowstr(j+1) = nza + rowstr(1)
- enddo
-CC write (*, 11000) nza
- return
-11000 format ( //,'final nonzero count in sparse ',
- 1 /,'number of nonzeros = ', i16 )
- end
-c-------end of sparse-----------------------------
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- subroutine sprnvc( n, nz, v, iv, nzloc, mark )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit logical (a-z)
- double precision v(*)
- integer n, nz, iv(*), nzloc(n), nn1
- integer mark(n)
- common /urando/ amult, tran
- double precision amult, tran
-
-
-c---------------------------------------------------------------------
-c generate a sparse n-vector (v, iv)
-c having nzv nonzeros
-c
-c mark(i) is set to 1 if position i is nonzero.
-c mark is all zero on entry and is reset to all zero before exit
-c this corrects a performance bug found by John G. Lewis, caused by
-c reinitialization of mark on every one of the n calls to sprnvc
-c---------------------------------------------------------------------
-
- integer nzrow, nzv, ii, i, icnvrt
-
- external randlc, icnvrt
- double precision randlc, vecelt, vecloc
-
-
- nzv = 0
- nzrow = 0
- nn1 = 1
- 50 continue
- nn1 = 2 * nn1
- if (nn1 .lt. n) goto 50
-
-c---------------------------------------------------------------------
-c nn1 is the smallest power of two not less than n
-c---------------------------------------------------------------------
-
-100 continue
- if (nzv .ge. nz) goto 110
- vecelt = randlc( tran, amult )
-
-c---------------------------------------------------------------------
-c generate an integer between 1 and n in a portable manner
-c---------------------------------------------------------------------
- vecloc = randlc(tran, amult)
- i = icnvrt(vecloc, nn1) + 1
- if (i .gt. n) goto 100
-
-c---------------------------------------------------------------------
-c was this integer generated already?
-c---------------------------------------------------------------------
- if (mark(i) .eq. 0) then
- mark(i) = 1
- nzrow = nzrow + 1
- nzloc(nzrow) = i
- nzv = nzv + 1
- v(nzv) = vecelt
- iv(nzv) = i
- endif
- goto 100
-110 continue
- do ii = 1, nzrow
- i = nzloc(ii)
- mark(i) = 0
- enddo
- return
- end
-c-------end of sprnvc-----------------------------
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- function icnvrt(x, ipwr2)
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit logical (a-z)
- double precision x
- integer ipwr2, icnvrt
-
-c---------------------------------------------------------------------
-c scale a double precision number x in (0,1) by a power of 2 and chop it
-c---------------------------------------------------------------------
- icnvrt = int(ipwr2 * x)
-
- return
- end
-c-------end of icnvrt-----------------------------
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- subroutine vecset(n, v, iv, nzv, i, val)
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit logical (a-z)
- integer n, iv(*), nzv, i, k
- double precision v(*), val
-
-c---------------------------------------------------------------------
-c set ith element of sparse vector (v, iv) with
-c nzv nonzeros to val
-c---------------------------------------------------------------------
-
- logical set
-
- set = .false.
- do k = 1, nzv
- if (iv(k) .eq. i) then
- v(k) = val
- set = .true.
- endif
- enddo
- if (.not. set) then
- nzv = nzv + 1
- v(nzv) = val
- iv(nzv) = i
- endif
- return
- end
-c-------end of vecset-----------------------------
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'mpif.h'
-
- integer me, nprocs, root, dp_type
- common /mpistuff/ me, nprocs, root, dp_type
-
+++ /dev/null
-SHELL=/bin/sh
-BENCHMARK=ft
-BENCHMARKU=FT
-
-include ../config/make.def
-
-include ../sys/make.common
-
-OBJS = ft.o ${COMMON}/${RAND}.o ${COMMON}/print_results.o ${COMMON}/timers.o
-
-${PROGRAM}: config ${OBJS}
- ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
-
-
-
-.f.o:
- ${FCOMPILE} $<
-
-ft.o: ft.f global.h mpinpb.h npbparams.h
-
-clean:
- - rm -f *.o *~ mputil*
- - rm -f ft npbparams.h core
+++ /dev/null
-This code implements the time integration of a three-dimensional
-partial differential equation using the Fast Fourier Transform.
-Some of the dimension statements are not F77 conforming and will
-not work using the g77 compiler. All dimension statements,
-however, are legal F90.
\ No newline at end of file
+++ /dev/null
-!-------------------------------------------------------------------------!
-! !
-! N A S P A R A L L E L B E N C H M A R K S 3.3 !
-! !
-! F T !
-! !
-!-------------------------------------------------------------------------!
-! !
-! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
-! It is described in NAS Technical Reports 95-020 and 02-007 !
-! !
-! Permission to use, copy, distribute and modify this software !
-! for any purpose with or without fee is hereby granted. We !
-! request, however, that all derived work reference the NAS !
-! Parallel Benchmarks 3.3. This software is provided "as is" !
-! without express or implied warranty. !
-! !
-! Information on NPB 3.3, including the technical report, the !
-! original specifications, source code, results and information !
-! on how to submit new results, is available at: !
-! !
-! http://www.nas.nasa.gov/Software/NPB/ !
-! !
-! Send comments or suggestions to npb@nas.nasa.gov !
-! !
-! NAS Parallel Benchmarks Group !
-! NASA Ames Research Center !
-! Mail Stop: T27A-1 !
-! Moffett Field, CA 94035-1000 !
-! !
-! E-mail: npb@nas.nasa.gov !
-! Fax: (650) 604-3957 !
-! !
-!-------------------------------------------------------------------------!
-
-!TO REDUCE THE AMOUNT OF MEMORY REQUIRED BY THE BENCHMARK WE NO LONGER
-!STORE THE ENTIRE TIME EVOLUTION ARRAY "EX" FOR ALL TIME STEPS, BUT
-!JUST FOR THE FIRST. ALSO, IT IS STORED ONLY FOR THE PART OF THE GRID
-!FOR WHICH THE CALLING PROCESSOR IS RESPONSIBLE, SO THAT THE MEMORY
-!USAGE BECOMES SCALABLE. THIS NEW ARRAY IS CALLED "TWIDDLE" (SEE
-!NPB3.0-SER)
-
-!TO AVOID PROBLEMS WITH VERY LARGE ARRAY SIZES THAT ARE COMPUTED BY
-!MULTIPLYING GRID DIMENSIONS (CAUSING INTEGER OVERFLOW IN THE VARIABLE
-!NTOTAL) AND SUBSEQUENTLY DIVIDING BY THE NUMBER OF PROCESSORS, WE
-!COMPUTE THE SIZE OF ARRAY PARTITIONS MORE CONSERVATIVELY AS
-!((NX*NY)/NP)*NZ, WHERE NX, NY, AND NZ ARE GRID DIMENSIONS AND NP IS
-!THE NUMBER OF PROCESSORS, THE RESULT IS STORED IN "NTDIVNP". FOR THE
-!PERFORMANCE CALCULATION WE STORE THE TOTAL NUMBER OF GRID POINTS IN A
-!FLOATING POINT NUMBER "NTOTAL_F" INSTEAD OF AN INTEGER.
-!THIS FIX WILL FAIL IF THE NUMBER OF PROCESSORS IS SMALL.
-
-!UGLY HACK OF SUBROUTINE IPOW46: FOR VERY LARGE GRIDS THE SINGLE EXPONENT
-!FROM NPB2.3 MAY NOT FIT IN A 32-BIT INTEGER. HOWEVER, WE KNOW THAT THE
-!"EXPONENT" ARGUMENT OF THIS ROUTINE CAN ALWAYS BE FACTORED INTO A TERM
-!DIVISIBLE BY NX (EXP_1) AND ANOTHER TERM (EXP_2). NX IS USUALLY A POWER
-!OF TWO, SO WE CAN KEEP HALVING IT UNTIL THE PRODUCT OF EXP_1
-!AND EXP_2 IS SMALL ENOUGH (NAMELY EXP_2 ITSELF). THIS UPDATED VERSION
-!OF IPWO46, WHICH NOW TAKES THE TWO FACTORS OF "EXPONENT" AS SEPARATE
-!ARGUMENTS, MAY BREAK DOWN IF EXP_1 DOES NOT CONTAIN A LARGE POWER OF TWO.
-
-c---------------------------------------------------------------------
-c
-c Authors: D. Bailey
-c W. Saphir
-c R. F. Van der Wijngaart
-c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c FT benchmark
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- program ft
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpif.h'
- include 'global.h'
- integer i, ierr
-
-c---------------------------------------------------------------------
-c u0, u1, u2 are the main arrays in the problem.
-c Depending on the decomposition, these arrays will have different
-c dimensions. To accomodate all possibilities, we allocate them as
-c one-dimensional arrays and pass them to subroutines for different
-c views
-c - u0 contains the initial (transformed) initial condition
-c - u1 and u2 are working arrays
-c---------------------------------------------------------------------
-
- double complex u0(ntdivnp),
- > u1(ntdivnp),
- > u2(ntdivnp)
- double precision twiddle(ntdivnp)
-c---------------------------------------------------------------------
-c Large arrays are in common so that they are allocated on the
-c heap rather than the stack. This common block is not
-c referenced directly anywhere else. Padding is to avoid accidental
-c cache problems, since all array sizes are powers of two.
-c---------------------------------------------------------------------
-
- double complex pad1(3), pad2(3), pad3(3)
- common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle
-
- integer iter
- double precision total_time, mflops
- logical verified
- character class
-
- call MPI_Init(ierr)
-
-c---------------------------------------------------------------------
-c Run the entire problem once to make sure all data is touched.
-c This reduces variable startup costs, which is important for such a
-c short benchmark. The other NPB 2 implementations are similar.
-c---------------------------------------------------------------------
- do i = 1, t_max
- call timer_clear(i)
- end do
-
- call setup()
- call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
- call compute_initial_conditions(u1, dims(1,1), dims(2,1),
- > dims(3,1))
- call fft_init (dims(1,1))
- call fft(1, u1, u0)
-
-c---------------------------------------------------------------------
-c Start over from the beginning. Note that all operations must
-c be timed, in contrast to other benchmarks.
-c---------------------------------------------------------------------
- do i = 1, t_max
- call timer_clear(i)
- end do
- call MPI_Barrier(MPI_COMM_WORLD, ierr)
-
- call timer_start(T_total)
- if (timers_enabled) call timer_start(T_setup)
-
- call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3))
- call compute_initial_conditions(u1, dims(1,1), dims(2,1),
- > dims(3,1))
- call fft_init (dims(1,1))
-
- if (timers_enabled) call synchup()
- if (timers_enabled) call timer_stop(T_setup)
-
- if (timers_enabled) call timer_start(T_fft)
- call fft(1, u1, u0)
- if (timers_enabled) call timer_stop(T_fft)
-
- do iter = 1, niter
- if (timers_enabled) call timer_start(T_evolve)
- call evolve(u0, u1, twiddle, dims(1,1), dims(2,1), dims(3,1))
- if (timers_enabled) call timer_stop(T_evolve)
- if (timers_enabled) call timer_start(T_fft)
- call fft(-1, u1, u2)
- if (timers_enabled) call timer_stop(T_fft)
- if (timers_enabled) call synchup()
- if (timers_enabled) call timer_start(T_checksum)
- call checksum(iter, u2, dims(1,1), dims(2,1), dims(3,1))
- if (timers_enabled) call timer_stop(T_checksum)
- end do
-
- call verify(nx, ny, nz, niter, verified, class)
- call timer_stop(t_total)
- if (np .ne. np_min) verified = .false.
- total_time = timer_read(t_total)
-
- if( total_time .ne. 0. ) then
- mflops = 1.0d-6*ntotal_f *
- > (14.8157+7.19641*log(ntotal_f)
- > + (5.23518+7.21113*log(ntotal_f))*niter)
- > /total_time
- else
- mflops = 0.0
- endif
- if (me .eq. 0) then
- call print_results('FT', class, nx, ny, nz, niter, np_min, np,
- > total_time, mflops, ' floating point', verified,
- > npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7)
- endif
- if (timers_enabled) call print_timers()
- call MPI_Finalize(ierr)
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine evolve(u0, u1, twiddle, d1, d2, d3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c evolve u0 -> u1 (t time steps) in fourier space
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- integer d1, d2, d3
- double precision exi
- double complex u0(d1,d2,d3)
- double complex u1(d1,d2,d3)
- double precision twiddle(d1,d2,d3)
- integer i, j, k
-
- do k = 1, d3
- do j = 1, d2
- do i = 1, d1
- u0(i,j,k) = u0(i,j,k)*(twiddle(i,j,k))
- u1(i,j,k) = u0(i,j,k)
- end do
- end do
- end do
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine compute_initial_conditions(u0, d1, d2, d3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Fill in array u0 with initial conditions from
-c random number generator
-c---------------------------------------------------------------------
- implicit none
- include 'global.h'
- integer d1, d2, d3
- double complex u0(d1, d2, d3)
- integer k
- double precision x0, start, an, dummy
-
-c---------------------------------------------------------------------
-c 0-D and 1-D layouts are easy because each processor gets a contiguous
-c chunk of the array, in the Fortran ordering sense.
-c For a 2-D layout, it's a bit more complicated. We always
-c have entire x-lines (contiguous) in processor.
-c We can do ny/np1 of them at a time since we have
-c ny/np1 contiguous in y-direction. But then we jump
-c by z-planes (nz/np2 of them, total).
-c For the 0-D and 1-D layouts we could do larger chunks, but
-c this turns out to have no measurable impact on performance.
-c---------------------------------------------------------------------
-
-
- start = seed
-c---------------------------------------------------------------------
-c Jump to the starting element for our first plane.
-c---------------------------------------------------------------------
- call ipow46(a, 2*nx, (zstart(1)-1)*ny + (ystart(1)-1), an)
- dummy = randlc(start, an)
- call ipow46(a, 2*nx, ny, an)
-
-c---------------------------------------------------------------------
-c Go through by z planes filling in one square at a time.
-c---------------------------------------------------------------------
- do k = 1, dims(3, 1) ! nz/np2
- x0 = start
- call vranlc(2*nx*dims(2, 1), x0, a, u0(1, 1, k))
- if (k .ne. dims(3, 1)) dummy = randlc(start, an)
- end do
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine ipow46(a, exp_1, exp_2, result)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute a^exponent mod 2^46
-c---------------------------------------------------------------------
-
- implicit none
- double precision a, result, dummy, q, r
- integer exp_1, exp_2, n, n2, ierr
- external randlc
- double precision randlc
- logical two_pow
-c---------------------------------------------------------------------
-c Use
-c a^n = a^(n/2)*a^(n/2) if n even else
-c a^n = a*a^(n-1) if n odd
-c---------------------------------------------------------------------
- result = 1
- if (exp_2 .eq. 0 .or. exp_1 .eq. 0) return
- q = a
- r = 1
- n = exp_1
- two_pow = .true.
-
- do while (two_pow)
- n2 = n/2
- if (n2 * 2 .eq. n) then
- dummy = randlc(q, q)
- n = n2
- else
- n = n * exp_2
- two_pow = .false.
- endif
- end do
-
- do while (n .gt. 1)
- n2 = n/2
- if (n2 * 2 .eq. n) then
- dummy = randlc(q, q)
- n = n2
- else
- dummy = randlc(r, q)
- n = n-1
- endif
- end do
- dummy = randlc(r, q)
- result = r
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine setup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'mpinpb.h'
- include 'global.h'
-
- integer ierr, i, j, fstatus
- debug = .FALSE.
-
- call MPI_Comm_size(MPI_COMM_WORLD, np, ierr)
- call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr)
-
- if (.not. convertdouble) then
- dc_type = MPI_DOUBLE_COMPLEX
- else
- dc_type = MPI_COMPLEX
- endif
-
-
- if (me .eq. 0) then
- write(*, 1000)
- open (unit=2,file='inputft.data',status='old', iostat=fstatus)
-
- if (fstatus .eq. 0) then
- write(*,233)
- 233 format(' Reading from input file inputft.data')
- read (2,*) niter
- read (2,*) layout_type
- read (2,*) np1, np2
- close(2)
-
-c---------------------------------------------------------------------
-c check to make sure input data is consistent
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-c 1. product of processor grid dims must equal number of processors
-c---------------------------------------------------------------------
-
- if (np1 * np2 .ne. np) then
- write(*, 238)
- 238 format(' np1 and np2 given in input file are not valid.')
- write(*, 239) np1*np2, np
- 239 format(' Product is ', i5, ' and should be ', i5)
- call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
- stop
- endif
-
-c---------------------------------------------------------------------
-c 2. layout type must be valid
-c---------------------------------------------------------------------
-
- if (layout_type .ne. layout_0D .and.
- > layout_type .ne. layout_1D .and.
- > layout_type .ne. layout_2D) then
- write(*, 240)
- 240 format(' Layout type specified in inputft.data is
- > invalid ')
- call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
- stop
- endif
-
-c---------------------------------------------------------------------
-c 3. 0D layout must be 1x1 grid
-c---------------------------------------------------------------------
-
- if (layout_type .eq. layout_0D .and.
- > (np1 .ne.1 .or. np2 .ne. 1)) then
- write(*, 241)
- 241 format(' For 0D layout, both np1 and np2 must be 1 ')
- call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
- stop
- endif
-c---------------------------------------------------------------------
-c 4. 1D layout must be 1xN grid
-c---------------------------------------------------------------------
-
- if (layout_type .eq. layout_1D .and. np1 .ne. 1) then
- write(*, 242)
- 242 format(' For 1D layout, np1 must be 1 ')
- call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
- stop
- endif
-
- else
- write(*,234)
- niter = niter_default
- if (np .eq. 1) then
- np1 = 1
- np2 = 1
- layout_type = layout_0D
- else if (np .le. nz) then
- np1 = 1
- np2 = np
- layout_type = layout_1D
- else
- np1 = nz
- np2 = np/nz
- layout_type = layout_2D
- endif
- endif
-
- if (np .lt. np_min) then
- write(*, 10) np_min
- 10 format(' Error: Compiled for ', I5, ' processors. ')
- write(*, 11) np
- 11 format(' Only ', i5, ' processors found ')
- call MPI_Abort(MPI_COMM_WORLD, 1, ierr)
- stop
- endif
-
- 234 format(' No input file inputft.data. Using compiled defaults')
- write(*, 1001) nx, ny, nz
- write(*, 1002) niter
- write(*, 1004) np
- write(*, 1005) np1, np2
- if (np .ne. np_min) write(*, 1006) np_min
-
- if (layout_type .eq. layout_0D) then
- write(*, 1010) '0D'
- else if (layout_type .eq. layout_1D) then
- write(*, 1010) '1D'
- else
- write(*, 1010) '2D'
- endif
-
- 1000 format(//,' NAS Parallel Benchmarks 3.3 -- FT Benchmark',/)
- 1001 format(' Size : ', i4, 'x', i4, 'x', i4)
- 1002 format(' Iterations : ', 7x, i7)
- 1004 format(' Number of processes : ', 7x, i7)
- 1005 format(' Processor array : ', 5x, i4, 'x', i4)
- 1006 format(' WARNING: compiled for ', i5, ' processes. ',
- > ' Will not verify. ')
- 1010 format(' Layout type : ', 9x, A5)
- endif
-
-
-c---------------------------------------------------------------------
-c Since np1, np2 and layout_type are in a common block,
-c this sends all three.
-c---------------------------------------------------------------------
- call MPI_BCAST(np1, 3, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
- call MPI_BCAST(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
-
- if (np1 .eq. 1 .and. np2 .eq. 1) then
- layout_type = layout_0D
- else if (np1 .eq. 1) then
- layout_type = layout_1D
- else
- layout_type = layout_2D
- endif
-
- if (layout_type .eq. layout_0D) then
- do i = 1, 3
- dims(1, i) = nx
- dims(2, i) = ny
- dims(3, i) = nz
- end do
- else if (layout_type .eq. layout_1D) then
- dims(1, 1) = nx
- dims(2, 1) = ny
- dims(3, 1) = nz
-
- dims(1, 2) = nx
- dims(2, 2) = ny
- dims(3, 2) = nz
-
- dims(1, 3) = nz
- dims(2, 3) = nx
- dims(3, 3) = ny
- else if (layout_type .eq. layout_2D) then
- dims(1, 1) = nx
- dims(2, 1) = ny
- dims(3, 1) = nz
-
- dims(1, 2) = ny
- dims(2, 2) = nx
- dims(3, 2) = nz
-
- dims(1, 3) = nz
- dims(2, 3) = nx
- dims(3, 3) = ny
-
- endif
- do i = 1, 3
- dims(2, i) = dims(2, i) / np1
- dims(3, i) = dims(3, i) / np2
- end do
-
-
-c---------------------------------------------------------------------
-c Determine processor coordinates of this processor
-c Processor grid is np1xnp2.
-c Arrays are always (n1, n2/np1, n3/np2)
-c Processor coords are zero-based.
-c---------------------------------------------------------------------
- me2 = mod(me, np2) ! goes from 0...np2-1
- me1 = me/np2 ! goes from 0...np1-1
-c---------------------------------------------------------------------
-c Communicators for rows/columns of processor grid.
-c commslice1 is communicator of all procs with same me1, ranked as me2
-c commslice2 is communicator of all procs with same me2, ranked as me1
-c mpi_comm_split(comm, color, key, ...)
-c---------------------------------------------------------------------
- call MPI_Comm_split(MPI_COMM_WORLD, me1, me2, commslice1, ierr)
- call MPI_Comm_split(MPI_COMM_WORLD, me2, me1, commslice2, ierr)
- if (timers_enabled) call synchup()
-
- if (debug) print *, 'proc coords: ', me, me1, me2
-
-c---------------------------------------------------------------------
-c Determine which section of the grid is owned by this
-c processor.
-c---------------------------------------------------------------------
- if (layout_type .eq. layout_0d) then
-
- do i = 1, 3
- xstart(i) = 1
- xend(i) = nx
- ystart(i) = 1
- yend(i) = ny
- zstart(i) = 1
- zend(i) = nz
- end do
-
- else if (layout_type .eq. layout_1d) then
-
- xstart(1) = 1
- xend(1) = nx
- ystart(1) = 1
- yend(1) = ny
- zstart(1) = 1 + me2 * nz/np2
- zend(1) = (me2+1) * nz/np2
-
- xstart(2) = 1
- xend(2) = nx
- ystart(2) = 1
- yend(2) = ny
- zstart(2) = 1 + me2 * nz/np2
- zend(2) = (me2+1) * nz/np2
-
- xstart(3) = 1
- xend(3) = nx
- ystart(3) = 1 + me2 * ny/np2
- yend(3) = (me2+1) * ny/np2
- zstart(3) = 1
- zend(3) = nz
-
- else if (layout_type .eq. layout_2d) then
-
- xstart(1) = 1
- xend(1) = nx
- ystart(1) = 1 + me1 * ny/np1
- yend(1) = (me1+1) * ny/np1
- zstart(1) = 1 + me2 * nz/np2
- zend(1) = (me2+1) * nz/np2
-
- xstart(2) = 1 + me1 * nx/np1
- xend(2) = (me1+1)*nx/np1
- ystart(2) = 1
- yend(2) = ny
- zstart(2) = zstart(1)
- zend(2) = zend(1)
-
- xstart(3) = xstart(2)
- xend(3) = xend(2)
- ystart(3) = 1 + me2 *ny/np2
- yend(3) = (me2+1)*ny/np2
- zstart(3) = 1
- zend(3) = nz
- endif
-
-c---------------------------------------------------------------------
-c Set up info for blocking of ffts and transposes. This improves
-c performance on cache-based systems. Blocking involves
-c working on a chunk of the problem at a time, taking chunks
-c along the first, second, or third dimension.
-c
-c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim)
-c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims)
-
-c Since 1st dim is always in processor, we'll assume it's long enough
-c (default blocking factor is 16 so min size for 1st dim is 16)
-c The only case we have to worry about is cffts1 in a 2d decomposition.
-c so the blocking factor should not be larger than the 2nd dimension.
-c---------------------------------------------------------------------
-
- fftblock = fftblock_default
- fftblockpad = fftblockpad_default
-
- if (layout_type .eq. layout_2d) then
- if (dims(2, 1) .lt. fftblock) fftblock = dims(2, 1)
- if (dims(2, 2) .lt. fftblock) fftblock = dims(2, 2)
- if (dims(2, 3) .lt. fftblock) fftblock = dims(2, 3)
- endif
-
- if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine compute_indexmap(twiddle, d1, d2, d3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2
-c for time evolution exponent.
-c---------------------------------------------------------------------
-
- implicit none
- include 'mpinpb.h'
- include 'global.h'
- integer d1, d2, d3
- integer i, j, k, ii, ii2, jj, ij2, kk
- double precision ap, twiddle(d1, d2, d3)
-
-c---------------------------------------------------------------------
-c this function is very different depending on whether
-c we are in the 0d, 1d or 2d layout. Compute separately.
-c basically we want to convert the fortran indices
-c 1 2 3 4 5 6 7 8
-c to
-c 0 1 2 3 -4 -3 -2 -1
-c The following magic formula does the trick:
-c mod(i-1+n/2, n) - n/2
-c---------------------------------------------------------------------
-
- ap = - 4.d0 * alpha * pi *pi
-
- if (layout_type .eq. layout_0d) then ! xyz layout
- do i = 1, dims(1,3)
- ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
- ii2 = ii*ii
- do j = 1, dims(2,3)
- jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
- ij2 = jj*jj+ii2
- do k = 1, dims(3,3)
- kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
- twiddle(i,j,k) = dexp(ap*dfloat(kk*kk+ij2))
- end do
- end do
- end do
- else if (layout_type .eq. layout_1d) then ! zxy layout
- do i = 1,dims(2,3)
- ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
- ii2 = ii*ii
- do j = 1,dims(3,3)
- jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
- ij2 = jj*jj+ii2
- do k = 1,dims(1,3)
- kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
- twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
- end do
- end do
- end do
- else if (layout_type .eq. layout_2d) then ! zxy layout
- do i = 1,dims(2,3)
- ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2
- ii2 = ii*ii
- do j = 1, dims(3,3)
- jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2
- ij2 = jj*jj+ii2
- do k =1,dims(1,3)
- kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2
- twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2))
- end do
- end do
- end do
- else
- print *, ' Unknown layout type ', layout_type
- stop
- endif
-
- return
- end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine print_timers()
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- integer i
- include 'global.h'
- character*25 tstrings(T_max)
- data tstrings / ' total ',
- > ' setup ',
- > ' fft ',
- > ' evolve ',
- > ' checksum ',
- > ' fftlow ',
- > ' fftcopy ',
- > ' transpose ',
- > ' transpose1_loc ',
- > ' transpose1_glo ',
- > ' transpose1_fin ',
- > ' transpose2_loc ',
- > ' transpose2_glo ',
- > ' transpose2_fin ',
- > ' sync ' /
-
- if (me .ne. 0) return
- do i = 1, t_max
- if (timer_read(i) .ne. 0.0d0) then
- write(*, 100) i, tstrings(i), timer_read(i)
- endif
- end do
- 100 format(' timer ', i2, '(', A16, ') :', F10.6)
- return
- end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine fft(dir, x1, x2)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- integer dir
- double complex x1(ntdivnp), x2(ntdivnp)
-
- double complex scratch(fftblockpad_default*maxdim*2)
-
-c---------------------------------------------------------------------
-c note: args x1, x2 must be different arrays
-c note: args for cfftsx are (direction, layout, xin, xout, scratch)
-c xin/xout may be the same and it can be somewhat faster
-c if they are
-c note: args for transpose are (layout1, layout2, xin, xout)
-c xin/xout must be different
-c---------------------------------------------------------------------
-
- if (dir .eq. 1) then
- if (layout_type .eq. layout_0d) then
- call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
- > x1, x1, scratch)
- call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
- > x1, x1, scratch)
- call cffts3(1, dims(1,3), dims(2,3), dims(3,3),
- > x1, x2, scratch)
- else if (layout_type .eq. layout_1d) then
- call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
- > x1, x1, scratch)
- call cffts2(1, dims(1,2), dims(2,2), dims(3,2),
- > x1, x1, scratch)
- if (timers_enabled) call timer_start(T_transpose)
- call transpose_xy_z(2, 3, x1, x2)
- if (timers_enabled) call timer_stop(T_transpose)
- call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
- > x2, x2, scratch)
- else if (layout_type .eq. layout_2d) then
- call cffts1(1, dims(1,1), dims(2,1), dims(3,1),
- > x1, x1, scratch)
- if (timers_enabled) call timer_start(T_transpose)
- call transpose_x_y(1, 2, x1, x2)
- if (timers_enabled) call timer_stop(T_transpose)
- call cffts1(1, dims(1,2), dims(2,2), dims(3,2),
- > x2, x2, scratch)
- if (timers_enabled) call timer_start(T_transpose)
- call transpose_x_z(2, 3, x2, x1)
- if (timers_enabled) call timer_stop(T_transpose)
- call cffts1(1, dims(1,3), dims(2,3), dims(3,3),
- > x1, x2, scratch)
- endif
- else
- if (layout_type .eq. layout_0d) then
- call cffts3(-1, dims(1,3), dims(2,3), dims(3,3),
- > x1, x1, scratch)
- call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
- > x1, x1, scratch)
- call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
- > x1, x2, scratch)
- else if (layout_type .eq. layout_1d) then
- call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
- > x1, x1, scratch)
- if (timers_enabled) call timer_start(T_transpose)
- call transpose_x_yz(3, 2, x1, x2)
- if (timers_enabled) call timer_stop(T_transpose)
- call cffts2(-1, dims(1,2), dims(2,2), dims(3,2),
- > x2, x2, scratch)
- call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
- > x2, x2, scratch)
- else if (layout_type .eq. layout_2d) then
- call cffts1(-1, dims(1,3), dims(2,3), dims(3,3),
- > x1, x1, scratch)
- if (timers_enabled) call timer_start(T_transpose)
- call transpose_x_z(3, 2, x1, x2)
- if (timers_enabled) call timer_stop(T_transpose)
- call cffts1(-1, dims(1,2), dims(2,2), dims(3,2),
- > x2, x2, scratch)
- if (timers_enabled) call timer_start(T_transpose)
- call transpose_x_y(2, 1, x2, x1)
- if (timers_enabled) call timer_stop(T_transpose)
- call cffts1(-1, dims(1,1), dims(2,1), dims(3,1),
- > x1, x2, scratch)
- endif
- endif
- return
- end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine cffts1(is, d1, d2, d3, x, xout, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'global.h'
- integer is, d1, d2, d3, logd1
- double complex x(d1,d2,d3)
- double complex xout(d1,d2,d3)
- double complex y(fftblockpad, d1, 2)
- integer i, j, k, jj
-
- logd1 = ilog2(d1)
-
- do k = 1, d3
- do jj = 0, d2 - fftblock, fftblock
- if (timers_enabled) call timer_start(T_fftcopy)
- do j = 1, fftblock
- do i = 1, d1
- y(j,i,1) = x(i,j+jj,k)
- enddo
- enddo
- if (timers_enabled) call timer_stop(T_fftcopy)
-
- if (timers_enabled) call timer_start(T_fftlow)
- call cfftz (is, logd1, d1, y, y(1,1,2))
- if (timers_enabled) call timer_stop(T_fftlow)
-
- if (timers_enabled) call timer_start(T_fftcopy)
- do j = 1, fftblock
- do i = 1, d1
- xout(i,j+jj,k) = y(j,i,1)
- enddo
- enddo
- if (timers_enabled) call timer_stop(T_fftcopy)
- enddo
- enddo
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine cffts2(is, d1, d2, d3, x, xout, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'global.h'
- integer is, d1, d2, d3, logd2
- double complex x(d1,d2,d3)
- double complex xout(d1,d2,d3)
- double complex y(fftblockpad, d2, 2)
- integer i, j, k, ii
-
- logd2 = ilog2(d2)
-
- do k = 1, d3
- do ii = 0, d1 - fftblock, fftblock
- if (timers_enabled) call timer_start(T_fftcopy)
- do j = 1, d2
- do i = 1, fftblock
- y(i,j,1) = x(i+ii,j,k)
- enddo
- enddo
- if (timers_enabled) call timer_stop(T_fftcopy)
-
- if (timers_enabled) call timer_start(T_fftlow)
- call cfftz (is, logd2, d2, y, y(1, 1, 2))
- if (timers_enabled) call timer_stop(T_fftlow)
-
- if (timers_enabled) call timer_start(T_fftcopy)
- do j = 1, d2
- do i = 1, fftblock
- xout(i+ii,j,k) = y(i,j,1)
- enddo
- enddo
- if (timers_enabled) call timer_stop(T_fftcopy)
- enddo
- enddo
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine cffts3(is, d1, d2, d3, x, xout, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'global.h'
- integer is, d1, d2, d3, logd3
- double complex x(d1,d2,d3)
- double complex xout(d1,d2,d3)
- double complex y(fftblockpad, d3, 2)
- integer i, j, k, ii
-
- logd3 = ilog2(d3)
-
- do j = 1, d2
- do ii = 0, d1 - fftblock, fftblock
- if (timers_enabled) call timer_start(T_fftcopy)
- do k = 1, d3
- do i = 1, fftblock
- y(i,k,1) = x(i+ii,j,k)
- enddo
- enddo
- if (timers_enabled) call timer_stop(T_fftcopy)
-
- if (timers_enabled) call timer_start(T_fftlow)
- call cfftz (is, logd3, d3, y, y(1, 1, 2))
- if (timers_enabled) call timer_stop(T_fftlow)
-
- if (timers_enabled) call timer_start(T_fftcopy)
- do k = 1, d3
- do i = 1, fftblock
- xout(i+ii,j,k) = y(i,k,1)
- enddo
- enddo
- if (timers_enabled) call timer_stop(T_fftcopy)
- enddo
- enddo
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine fft_init (n)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute the roots-of-unity array that will be used for subsequent FFTs.
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
-
- integer m,n,nu,ku,i,j,ln
- double precision t, ti
-
-
-c---------------------------------------------------------------------
-c Initialize the U array with sines and cosines in a manner that permits
-c stride one access at each FFT iteration.
-c---------------------------------------------------------------------
- nu = n
- m = ilog2(n)
- u(1) = m
- ku = 2
- ln = 1
-
- do j = 1, m
- t = pi / ln
-
- do i = 0, ln - 1
- ti = i * t
- u(i+ku) = dcmplx (cos (ti), sin(ti))
- enddo
-
- ku = ku + ln
- ln = 2 * ln
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine cfftz (is, m, n, x, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Computes NY N-point complex-to-complex FFTs of X using an algorithm due
-c to Swarztrauber. X is both the input and the output array, while Y is a
-c scratch array. It is assumed that N = 2^M. Before calling CFFTZ to
-c perform FFTs, the array U must be initialized by calling CFFTZ with IS
-c set to 0 and M set to MX, where MX is the maximum value of M for any
-c subsequent call.
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
-
- integer is,m,n,i,j,l,mx
- double complex x, y
-
- dimension x(fftblockpad,n), y(fftblockpad,n)
-
-c---------------------------------------------------------------------
-c Check if input parameters are invalid.
-c---------------------------------------------------------------------
- mx = u(1)
- if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx)
- > then
- write (*, 1) is, m, mx
- 1 format ('CFFTZ: Either U has not been initialized, or else'/
- > 'one of the input parameters is invalid', 3I5)
- stop
- endif
-
-c---------------------------------------------------------------------
-c Perform one variant of the Stockham FFT.
-c---------------------------------------------------------------------
- do l = 1, m, 2
- call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y)
- if (l .eq. m) goto 160
- call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x)
- enddo
-
- goto 180
-
-c---------------------------------------------------------------------
-c Copy Y to X.
-c---------------------------------------------------------------------
- 160 do j = 1, n
- do i = 1, fftblock
- x(i,j) = y(i,j)
- enddo
- enddo
-
- 180 continue
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Performs the L-th iteration of the second variant of the Stockham FFT.
-c---------------------------------------------------------------------
-
- implicit none
-
- integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22
- double complex u,x,y,u1,x11,x21
- dimension u(n), x(ny1,n), y(ny1,n)
-
-
-c---------------------------------------------------------------------
-c Set initial parameters.
-c---------------------------------------------------------------------
-
- n1 = n / 2
- lk = 2 ** (l - 1)
- li = 2 ** (m - l)
- lj = 2 * lk
- ku = li + 1
-
- do i = 0, li - 1
- i11 = i * lk + 1
- i12 = i11 + n1
- i21 = i * lj + 1
- i22 = i21 + lk
- if (is .ge. 1) then
- u1 = u(ku+i)
- else
- u1 = dconjg (u(ku+i))
- endif
-
-c---------------------------------------------------------------------
-c This loop is vectorizable.
-c---------------------------------------------------------------------
- do k = 0, lk - 1
- do j = 1, ny
- x11 = x(j,i11+k)
- x21 = x(j,i12+k)
- y(j,i21+k) = x11 + x21
- y(j,i22+k) = u1 * (x11 - x21)
- enddo
- enddo
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- integer function ilog2(n)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- integer n, nn, lg
- if (n .eq. 1) then
- ilog2=0
- return
- endif
- lg = 1
- nn = 2
- do while (nn .lt. n)
- nn = nn*2
- lg = lg+1
- end do
- ilog2 = lg
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose_x_yz(l1, l2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- integer l1, l2
- double complex xin(ntdivnp), xout(ntdivnp)
-
- call transpose2_local(dims(1,l1),dims(2, l1)*dims(3, l1),
- > xin, xout)
-
- call transpose2_global(xout, xin)
-
- call transpose2_finish(dims(1,l1),dims(2, l1)*dims(3, l1),
- > xin, xout)
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose_xy_z(l1, l2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- integer l1, l2
- double complex xin(ntdivnp), xout(ntdivnp)
-
- call transpose2_local(dims(1,l1)*dims(2, l1),dims(3, l1),
- > xin, xout)
- call transpose2_global(xout, xin)
- call transpose2_finish(dims(1,l1)*dims(2, l1),dims(3, l1),
- > xin, xout)
-
- return
- end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose2_local(n1, n2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'mpinpb.h'
- include 'global.h'
- integer n1, n2
- double complex xin(n1, n2), xout(n2, n1)
-
- double complex z(transblockpad, transblock)
-
- integer i, j, ii, jj
-
- if (timers_enabled) call timer_start(T_transxzloc)
-
-c---------------------------------------------------------------------
-c If possible, block the transpose for cache memory systems.
-c How much does this help? Example: R8000 Power Challenge (90 MHz)
-c Blocked version decreases time spend in this routine
-c from 14 seconds to 5.2 seconds on 8 nodes class A.
-c---------------------------------------------------------------------
-
- if (n1 .lt. transblock .or. n2 .lt. transblock) then
- if (n1 .ge. n2) then
- do j = 1, n2
- do i = 1, n1
- xout(j, i) = xin(i, j)
- end do
- end do
- else
- do i = 1, n1
- do j = 1, n2
- xout(j, i) = xin(i, j)
- end do
- end do
- endif
- else
- do j = 0, n2-1, transblock
- do i = 0, n1-1, transblock
-
-c---------------------------------------------------------------------
-c Note: compiler should be able to take j+jj out of inner loop
-c---------------------------------------------------------------------
- do jj = 1, transblock
- do ii = 1, transblock
- z(jj,ii) = xin(i+ii, j+jj)
- end do
- end do
-
- do ii = 1, transblock
- do jj = 1, transblock
- xout(j+jj, i+ii) = z(jj,ii)
- end do
- end do
-
- end do
- end do
- endif
- if (timers_enabled) call timer_stop(T_transxzloc)
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose2_global(xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- include 'mpinpb.h'
- double complex xin(ntdivnp)
- double complex xout(ntdivnp)
- integer ierr
-
- if (timers_enabled) call synchup()
-
- if (timers_enabled) call timer_start(T_transxzglo)
- call mpi_alltoall(xin, ntdivnp/np, dc_type,
- > xout, ntdivnp/np, dc_type,
- > commslice1, ierr)
- if (timers_enabled) call timer_stop(T_transxzglo)
-
- return
- end
-
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose2_finish(n1, n2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- integer n1, n2, ioff
- double complex xin(n2, n1/np2, 0:np2-1), xout(n2*np2, n1/np2)
-
- integer i, j, p
-
- if (timers_enabled) call timer_start(T_transxzfin)
- do p = 0, np2-1
- ioff = p*n2
- do j = 1, n1/np2
- do i = 1, n2
- xout(i+ioff, j) = xin(i, j, p)
- end do
- end do
- end do
- if (timers_enabled) call timer_stop(T_transxzfin)
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose_x_z(l1, l2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- integer l1, l2
- double complex xin(ntdivnp), xout(ntdivnp)
-
- call transpose_x_z_local(dims(1,l1),dims(2,l1),dims(3,l1),
- > xin, xout)
- call transpose_x_z_global(dims(1,l1),dims(2,l1),dims(3,l1),
- > xout, xin)
- call transpose_x_z_finish(dims(1,l2),dims(2,l2),dims(3,l2),
- > xin, xout)
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose_x_z_local(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- integer d1, d2, d3
- double complex xin(d1,d2,d3)
- double complex xout(d3,d2,d1)
- integer block1, block3
- integer i, j, k, kk, ii, i1, k1
-
- double complex buf(transblockpad, maxdim)
- if (timers_enabled) call timer_start(T_transxzloc)
- if (d1 .lt. 32) goto 100
- block3 = d3
- if (block3 .eq. 1) goto 100
- if (block3 .gt. transblock) block3 = transblock
- block1 = d1
- if (block1*block3 .gt. transblock*transblock)
- > block1 = transblock*transblock/block3
-c---------------------------------------------------------------------
-c blocked transpose
-c---------------------------------------------------------------------
- do j = 1, d2
- do kk = 0, d3-block3, block3
- do ii = 0, d1-block1, block1
-
- do k = 1, block3
- k1 = k + kk
- do i = 1, block1
- buf(k, i) = xin(i+ii, j, k1)
- end do
- end do
-
- do i = 1, block1
- i1 = i + ii
- do k = 1, block3
- xout(k+kk, j, i1) = buf(k, i)
- end do
- end do
-
- end do
- end do
- end do
- goto 200
-
-
-c---------------------------------------------------------------------
-c basic transpose
-c---------------------------------------------------------------------
- 100 continue
-
- do j = 1, d2
- do k = 1, d3
- do i = 1, d1
- xout(k, j, i) = xin(i, j, k)
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c all done
-c---------------------------------------------------------------------
- 200 continue
-
- if (timers_enabled) call timer_stop(T_transxzloc)
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose_x_z_global(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- include 'mpinpb.h'
- integer d1, d2, d3
- double complex xin(d3,d2,d1)
- double complex xout(d3,d2,d1) ! not real layout, but right size
- integer ierr
-
- if (timers_enabled) call synchup()
-
-c---------------------------------------------------------------------
-c do transpose among all processes with same 1-coord (me1)
-c---------------------------------------------------------------------
- if (timers_enabled)call timer_start(T_transxzglo)
- call mpi_alltoall(xin, d1*d2*d3/np2, dc_type,
- > xout, d1*d2*d3/np2, dc_type,
- > commslice1, ierr)
- if (timers_enabled) call timer_stop(T_transxzglo)
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose_x_z_finish(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- integer d1, d2, d3
- double complex xin(d1/np2, d2, d3, 0:np2-1)
- double complex xout(d1,d2,d3)
- integer i, j, k, p, ioff
- if (timers_enabled) call timer_start(T_transxzfin)
-c---------------------------------------------------------------------
-c this is the most straightforward way of doing it. the
-c calculation in the inner loop doesn't help.
-c do i = 1, d1/np2
-c do j = 1, d2
-c do k = 1, d3
-c do p = 0, np2-1
-c ii = i + p*d1/np2
-c xout(ii, j, k) = xin(i, j, k, p)
-c end do
-c end do
-c end do
-c end do
-c---------------------------------------------------------------------
-
- do p = 0, np2-1
- ioff = p*d1/np2
- do k = 1, d3
- do j = 1, d2
- do i = 1, d1/np2
- xout(i+ioff, j, k) = xin(i, j, k, p)
- end do
- end do
- end do
- end do
- if (timers_enabled) call timer_stop(T_transxzfin)
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose_x_y(l1, l2, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- integer l1, l2
- double complex xin(ntdivnp), xout(ntdivnp)
-
-c---------------------------------------------------------------------
-c xy transpose is a little tricky, since we don't want
-c to touch 3rd axis. But alltoall must involve 3rd axis (most
-c slowly varying) to be efficient. So we do
-c (nx, ny/np1, nz/np2) -> (ny/np1, nz/np2, nx) (local)
-c (ny/np1, nz/np2, nx) -> ((ny/np1*nz/np2)*np1, nx/np1) (global)
-c then local finish.
-c---------------------------------------------------------------------
-
-
- call transpose_x_y_local(dims(1,l1),dims(2,l1),dims(3,l1),
- > xin, xout)
- call transpose_x_y_global(dims(1,l1),dims(2,l1),dims(3,l1),
- > xout, xin)
- call transpose_x_y_finish(dims(1,l2),dims(2,l2),dims(3,l2),
- > xin, xout)
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose_x_y_local(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- integer d1, d2, d3
- double complex xin(d1, d2, d3)
- double complex xout(d2, d3, d1)
- integer i, j, k
- if (timers_enabled) call timer_start(T_transxyloc)
-
- do k = 1, d3
- do i = 1, d1
- do j = 1, d2
- xout(j,k,i)=xin(i,j,k)
- end do
- end do
- end do
- if (timers_enabled) call timer_stop(T_transxyloc)
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose_x_y_global(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- include 'mpinpb.h'
- integer d1, d2, d3
-c---------------------------------------------------------------------
-c array is in form (ny/np1, nz/np2, nx)
-c---------------------------------------------------------------------
- double complex xin(d2,d3,d1)
- double complex xout(d2,d3,d1) ! not real layout but right size
- integer ierr
-
- if (timers_enabled) call synchup()
-
-c---------------------------------------------------------------------
-c do transpose among all processes with same 1-coord (me1)
-c---------------------------------------------------------------------
- if (timers_enabled) call timer_start(T_transxyglo)
- call mpi_alltoall(xin, d1*d2*d3/np1, dc_type,
- > xout, d1*d2*d3/np1, dc_type,
- > commslice2, ierr)
- if (timers_enabled) call timer_stop(T_transxyglo)
-
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine transpose_x_y_finish(d1, d2, d3, xin, xout)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- integer d1, d2, d3
- double complex xin(d1/np1, d3, d2, 0:np1-1)
- double complex xout(d1,d2,d3)
- integer i, j, k, p, ioff
- if (timers_enabled) call timer_start(T_transxyfin)
-c---------------------------------------------------------------------
-c this is the most straightforward way of doing it. the
-c calculation in the inner loop doesn't help.
-c do i = 1, d1/np1
-c do j = 1, d2
-c do k = 1, d3
-c do p = 0, np1-1
-c ii = i + p*d1/np1
-c note order is screwy bcz we have (ny/np1, nz/np2, nx) -> (ny, nx/np1, nz/np2)
-c xout(ii, j, k) = xin(i, k, j, p)
-c end do
-c end do
-c end do
-c end do
-c---------------------------------------------------------------------
-
- do p = 0, np1-1
- ioff = p*d1/np1
- do k = 1, d3
- do j = 1, d2
- do i = 1, d1/np1
- xout(i+ioff, j, k) = xin(i, k, j, p)
- end do
- end do
- end do
- end do
- if (timers_enabled) call timer_stop(T_transxyfin)
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine checksum(i, u1, d1, d2, d3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- include 'mpinpb.h'
- integer i, d1, d2, d3
- double complex u1(d1, d2, d3)
- integer j, q,r,s, ierr
- double complex chk,allchk
- chk = (0.0,0.0)
-
- do j=1,1024
- q = mod(j, nx)+1
- if (q .ge. xstart(1) .and. q .le. xend(1)) then
- r = mod(3*j,ny)+1
- if (r .ge. ystart(1) .and. r .le. yend(1)) then
- s = mod(5*j,nz)+1
- if (s .ge. zstart(1) .and. s .le. zend(1)) then
- chk=chk+u1(q-xstart(1)+1,r-ystart(1)+1,s-zstart(1)+1)
- end if
- end if
- end if
- end do
- chk = chk/ntotal_f
-
- call MPI_Reduce(chk, allchk, 1, dc_type, MPI_SUM,
- > 0, MPI_COMM_WORLD, ierr)
- if (me .eq. 0) then
- write (*, 30) i, allchk
- 30 format (' T =',I5,5X,'Checksum =',1P2D22.12)
- endif
-
-c sums(i) = allchk
-c If we compute the checksum for diagnostic purposes, we let i be
-c negative, so the result will not be stored in an array
- if (i .gt. 0) sums(i) = allchk
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine synchup
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- include 'mpinpb.h'
- integer ierr
- call timer_start(T_synch)
- call mpi_barrier(MPI_COMM_WORLD, ierr)
- call timer_stop(T_synch)
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine verify (d1, d2, d3, nt, verified, class)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
- include 'global.h'
- include 'mpinpb.h'
- integer d1, d2, d3, nt
- character class
- logical verified
- integer ierr, size, i
- double precision err, epsilon
-
-c---------------------------------------------------------------------
-c Reference checksums
-c---------------------------------------------------------------------
- double complex csum_ref(25)
-
-
- class = 'U'
-
- if (me .ne. 0) return
-
- epsilon = 1.0d-12
- verified = .FALSE.
-
- if (d1 .eq. 64 .and.
- > d2 .eq. 64 .and.
- > d3 .eq. 64 .and.
- > nt .eq. 6) then
-c---------------------------------------------------------------------
-c Sample size reference checksums
-c---------------------------------------------------------------------
- class = 'S'
- csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02)
- csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02)
- csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02)
- csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02)
- csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02)
- csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02)
-
- else if (d1 .eq. 128 .and.
- > d2 .eq. 128 .and.
- > d3 .eq. 32 .and.
- > nt .eq. 6) then
-c---------------------------------------------------------------------
-c Class W size reference checksums
-c---------------------------------------------------------------------
- class = 'W'
- csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02)
- csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02)
- csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02)
- csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02)
- csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02)
- csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02)
-
- else if (d1 .eq. 256 .and.
- > d2 .eq. 256 .and.
- > d3 .eq. 128 .and.
- > nt .eq. 6) then
-c---------------------------------------------------------------------
-c Class A size reference checksums
-c---------------------------------------------------------------------
- class = 'A'
- csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02)
- csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02)
- csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02)
- csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02)
- csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02)
- csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02)
-
- else if (d1 .eq. 512 .and.
- > d2 .eq. 256 .and.
- > d3 .eq. 256 .and.
- > nt .eq. 20) then
-c---------------------------------------------------------------------
-c Class B size reference checksums
-c---------------------------------------------------------------------
- class = 'B'
- csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02)
- csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02)
- csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02)
- csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02)
- csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02)
- csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02)
- csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02)
- csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02)
- csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02)
- csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02)
- csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02)
- csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02)
- csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02)
- csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02)
- csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02)
- csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02)
- csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02)
- csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02)
- csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02)
- csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02)
-
- else if (d1 .eq. 512 .and.
- > d2 .eq. 512 .and.
- > d3 .eq. 512 .and.
- > nt .eq. 20) then
-c---------------------------------------------------------------------
-c Class C size reference checksums
-c---------------------------------------------------------------------
- class = 'C'
- csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02)
- csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02)
- csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02)
- csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02)
- csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02)
- csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02)
- csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02)
- csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02)
- csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02)
- csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02)
- csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02)
- csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02)
- csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02)
- csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02)
- csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02)
- csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02)
- csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02)
- csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02)
- csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02)
- csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02)
-
- else if (d1 .eq. 2048 .and.
- > d2 .eq. 1024 .and.
- > d3 .eq. 1024 .and.
- > nt .eq. 25) then
-c---------------------------------------------------------------------
-c Class D size reference checksums
-c---------------------------------------------------------------------
- class = 'D'
- csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02)
- csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02)
- csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02)
- csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02)
- csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02)
- csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02)
- csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02)
- csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02)
- csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02)
- csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02)
- csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02)
- csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02)
- csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02)
- csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02)
- csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02)
- csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02)
- csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02)
- csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02)
- csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02)
- csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02)
- csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02)
- csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02)
- csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02)
- csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02)
- csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02)
-
- else if (d1 .eq. 4096 .and.
- > d2 .eq. 2048 .and.
- > d3 .eq. 2048 .and.
- > nt .eq. 25) then
-c---------------------------------------------------------------------
-c Class E size reference checksums
-c---------------------------------------------------------------------
- class = 'E'
- csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02)
- csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02)
- csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02)
- csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02)
- csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02)
- csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02)
- csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02)
- csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02)
- csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02)
- csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02)
- csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02)
- csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02)
- csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02)
- csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02)
- csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02)
- csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02)
- csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02)
- csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02)
- csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02)
- csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02)
- csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02)
- csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02)
- csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02)
- csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02)
- csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02)
-
- endif
-
-
- if (class .ne. 'U') then
-
- do i = 1, nt
- err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) )
- if (.not.(err .le. epsilon)) goto 100
- end do
- verified = .TRUE.
- 100 continue
-
- endif
-
- call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr)
- if (size .ne. np) then
- write(*, 4010) np
- write(*, 4011)
- write(*, 4012)
-c---------------------------------------------------------------------
-c multiple statements because some Fortran compilers have
-c problems with long strings.
-c---------------------------------------------------------------------
- 4010 format( ' Warning: benchmark was compiled for ', i5,
- > 'processors')
- 4011 format( ' Must be run on this many processors for official',
- > ' verification')
- 4012 format( ' so memory access is repeatable')
- verified = .false.
- endif
-
- if (class .ne. 'U') then
- if (verified) then
- write(*,2000)
- 2000 format(' Result verification successful')
- else
- write(*,2001)
- 2001 format(' Result verification failed')
- endif
- endif
- print *, 'class = ', class
-
- return
- end
-
-
+++ /dev/null
- include 'npbparams.h'
-
-c 2D processor array -> 2D grid decomposition (by pencils)
-c If processor array is 1xN or -> 1D grid decomposition (by planes)
-c If processor array is 1x1 -> 0D grid decomposition
-c For simplicity, do not treat Nx1 (np2 = 1) specially
- integer np1, np2, np
-
-c basic decomposition strategy
- integer layout_type
- integer layout_0D, layout_1D, layout_2D
- parameter (layout_0D = 0, layout_1D = 1, layout_2D = 2)
-
- common /procgrid/ np1, np2, layout_type, np
-
-
-c Cache blocking params. These values are good for most
-c RISC processors.
-c FFT parameters:
-c fftblock controls how many ffts are done at a time.
-c The default is appropriate for most cache-based machines
-c On vector machines, the FFT can be vectorized with vector
-c length equal to the block size, so the block size should
-c be as large as possible. This is the size of the smallest
-c dimension of the problem: 128 for class A, 256 for class B and
-c 512 for class C.
-c Transpose parameters:
-c transblock is the blocking factor for the transposes when there
-c is a 1-D layout. On vector machines it should probably be
-c large (largest dimension of the problem).
-
-
- integer fftblock_default, fftblockpad_default
- parameter (fftblock_default=16, fftblockpad_default=18)
- integer transblock, transblockpad
- parameter(transblock=32, transblockpad=34)
-
- integer fftblock, fftblockpad
- common /blockinfo/ fftblock, fftblockpad
-
-c we need a bunch of logic to keep track of how
-c arrays are laid out.
-c coords of this processor
- integer me, me1, me2
- common /coords/ me, me1, me2
-c need a communicator for row/col in processor grid
- integer commslice1, commslice2
- common /comms/ commslice1, commslice2
-
-
-
-c There are basically three stages
-c 1: x-y-z layout
-c 2: after x-transform (before y)
-c 3: after y-transform (before z)
-c The computation proceeds logically as
-
-c set up initial conditions
-c fftx(1)
-c transpose (1->2)
-c ffty(2)
-c transpose (2->3)
-c fftz(3)
-c time evolution
-c fftz(3)
-c transpose (3->2)
-c ffty(2)
-c transpose (2->1)
-c fftx(1)
-c compute residual(1)
-
-c for the 0D, 1D, 2D strategies, the layouts look like xxx
-c
-c 0D 1D 2D
-c 1: xyz xyz xyz
-c 2: xyz xyz yxz
-c 3: xyz zyx zxy
-
-c the array dimensions are stored in dims(coord, phase)
- integer dims(3, 3)
- integer xstart(3), ystart(3), zstart(3)
- integer xend(3), yend(3), zend(3)
- common /layout/ dims,
- > xstart, ystart, zstart,
- > xend, yend, zend
-
- integer T_total, T_setup, T_fft, T_evolve, T_checksum,
- > T_fftlow, T_fftcopy, T_transpose,
- > T_transxzloc, T_transxzglo, T_transxzfin,
- > T_transxyloc, T_transxyglo, T_transxyfin,
- > T_synch, T_max
- parameter (T_total = 1, T_setup = 2, T_fft = 3,
- > T_evolve = 4, T_checksum = 5,
- > T_fftlow = 6, T_fftcopy = 7, T_transpose = 8,
- > T_transxzloc = 9, T_transxzglo = 10, T_transxzfin = 11,
- > T_transxyloc = 12, T_transxyglo = 13,
- > T_transxyfin = 14, T_synch = 15, T_max = 15)
-
-
-
- logical timers_enabled
- parameter (timers_enabled = .false.)
-
-
- external timer_read
- double precision timer_read
- external ilog2
- integer ilog2
-
- external randlc
- double precision randlc
-
-
-c other stuff
- logical debug, debugsynch
- common /dbg/ debug, debugsynch
-
- double precision seed, a, pi, alpha
- parameter (seed = 314159265.d0, a = 1220703125.d0,
- > pi = 3.141592653589793238d0, alpha=1.0d-6)
-
-c roots of unity array
-c relies on x being largest dimension?
- double complex u(nx)
- common /ucomm/ u
-
-
-c for checksum data
- double complex sums(0:niter_default)
- common /sumcomm/ sums
-
-c number of iterations
- integer niter
- common /iter/ niter
+++ /dev/null
-6 ! number of iterations
-2 ! layout type. 0 = 0d, 1 = 1d, 2 = 2d
-2 4 ! processor layout. 0d must be "1 1"; 1d must be "1 N"
+++ /dev/null
- include 'mpif.h'
-c mpi data types
- integer dc_type
- common /mpistuff/ dc_type
+++ /dev/null
-SHELL=/bin/sh
-BENCHMARK=lu
-BENCHMARKU=LU
-VEC=
-
-include ../config/make.def
-
-OBJS = lu.o init_comm.o read_input.o bcast_inputs.o proc_grid.o neighbors.o \
- nodedim.o subdomain.o setcoeff.o sethyper.o setbv.o exact.o setiv.o \
- erhs.o ssor.o exchange_1.o exchange_3.o exchange_4.o exchange_5.o \
- exchange_6.o rhs.o l2norm.o jacld.o blts$(VEC).o jacu.o buts$(VEC).o \
- error.o pintgr.o verify.o ${COMMON}/print_results.o ${COMMON}/timers.o
-
-include ../sys/make.common
-
-
-# npbparams.h is included by applu.incl
-# The following rule should do the trick but many make programs (not gmake)
-# will do the wrong thing and rebuild the world every time (because the
-# mod time on header.h is not changed. One solution would be to
-# touch header.h but this might cause confusion if someone has
-# accidentally deleted it. Instead, make the dependency on npbparams.h
-# explicit in all the lines below (even though dependence is indirect).
-
-# applu.incl: npbparams.h
-
-${PROGRAM}: config
- @if [ x$(VERSION) = xvec ] ; then \
- ${MAKE} VEC=_vec exec; \
- elif [ x$(VERSION) = xVEC ] ; then \
- ${MAKE} VEC=_vec exec; \
- else \
- ${MAKE} exec; \
- fi
-
-exec: $(OBJS)
- ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
-
-.f.o :
- ${FCOMPILE} $<
-
-lu.o: lu.f applu.incl npbparams.h
-bcast_inputs.o: bcast_inputs.f applu.incl npbparams.h mpinpb.h
-blts$(VEC).o: blts$(VEC).f
-buts$(VEC).o: buts$(VEC).f
-erhs.o: erhs.f applu.incl npbparams.h
-error.o: error.f applu.incl npbparams.h mpinpb.h
-exact.o: exact.f applu.incl npbparams.h
-exchange_1.o: exchange_1.f applu.incl npbparams.h mpinpb.h
-exchange_3.o: exchange_3.f applu.incl npbparams.h mpinpb.h
-exchange_4.o: exchange_4.f applu.incl npbparams.h mpinpb.h
-exchange_5.o: exchange_5.f applu.incl npbparams.h mpinpb.h
-exchange_6.o: exchange_6.f applu.incl npbparams.h mpinpb.h
-init_comm.o: init_comm.f applu.incl npbparams.h mpinpb.h
-jacld.o: jacld.f applu.incl npbparams.h
-jacu.o: jacu.f applu.incl npbparams.h
-l2norm.o: l2norm.f mpinpb.h
-neighbors.o: neighbors.f applu.incl npbparams.h
-nodedim.o: nodedim.f
-pintgr.o: pintgr.f applu.incl npbparams.h mpinpb.h
-proc_grid.o: proc_grid.f applu.incl npbparams.h
-read_input.o: read_input.f applu.incl npbparams.h mpinpb.h
-rhs.o: rhs.f applu.incl npbparams.h
-setbv.o: setbv.f applu.incl npbparams.h
-setiv.o: setiv.f applu.incl npbparams.h
-setcoeff.o: setcoeff.f applu.incl npbparams.h
-sethyper.o: sethyper.f applu.incl npbparams.h
-ssor.o: ssor.f applu.incl npbparams.h mpinpb.h
-subdomain.o: subdomain.f applu.incl npbparams.h mpinpb.h
-verify.o: verify.f applu.incl npbparams.h
-
-clean:
- - /bin/rm -f npbparams.h
- - /bin/rm -f *.o *~
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-c--- applu.incl
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c npbparams.h defines parameters that depend on the class and
-c number of nodes
-c---------------------------------------------------------------------
-
- include 'npbparams.h'
-
-c---------------------------------------------------------------------
-c parameters which can be overridden in runtime config file
-c (in addition to size of problem - isiz01,02,03 give the maximum size)
-c ipr = 1 to print out verbose information
-c omega = 2.0 is correct for all classes
-c tolrsd is tolerance levels for steady state residuals
-c---------------------------------------------------------------------
- integer ipr_default
- parameter (ipr_default = 1)
- double precision omega_default
- parameter (omega_default = 1.2d0)
- double precision tolrsd1_def, tolrsd2_def, tolrsd3_def,
- > tolrsd4_def, tolrsd5_def
- parameter (tolrsd1_def=1.0e-08,
- > tolrsd2_def=1.0e-08, tolrsd3_def=1.0e-08,
- > tolrsd4_def=1.0e-08, tolrsd5_def=1.0e-08)
-
- double precision c1, c2, c3, c4, c5
- parameter( c1 = 1.40d+00, c2 = 0.40d+00,
- > c3 = 1.00d-01, c4 = 1.00d+00,
- > c5 = 1.40d+00 )
-
-c---------------------------------------------------------------------
-c grid
-c---------------------------------------------------------------------
- integer nx, ny, nz
- integer nx0, ny0, nz0
- integer ipt, ist, iend
- integer jpt, jst, jend
- integer ii1, ii2
- integer ji1, ji2
- integer ki1, ki2
- double precision dxi, deta, dzeta
- double precision tx1, tx2, tx3
- double precision ty1, ty2, ty3
- double precision tz1, tz2, tz3
-
- common/cgcon/ dxi, deta, dzeta,
- > tx1, tx2, tx3,
- > ty1, ty2, ty3,
- > tz1, tz2, tz3,
- > nx, ny, nz,
- > nx0, ny0, nz0,
- > ipt, ist, iend,
- > jpt, jst, jend,
- > ii1, ii2,
- > ji1, ji2,
- > ki1, ki2
-
-c---------------------------------------------------------------------
-c dissipation
-c---------------------------------------------------------------------
- double precision dx1, dx2, dx3, dx4, dx5
- double precision dy1, dy2, dy3, dy4, dy5
- double precision dz1, dz2, dz3, dz4, dz5
- double precision dssp
-
- common/disp/ dx1,dx2,dx3,dx4,dx5,
- > dy1,dy2,dy3,dy4,dy5,
- > dz1,dz2,dz3,dz4,dz5,
- > dssp
-
-c---------------------------------------------------------------------
-c field variables and residuals
-c---------------------------------------------------------------------
- double precision u(5,-1:isiz1+2,-1:isiz2+2,isiz3),
- > rsd(5,-1:isiz1+2,-1:isiz2+2,isiz3),
- > frct(5,-1:isiz1+2,-1:isiz2+2,isiz3),
- > flux(5,0:isiz1+1,0:isiz2+1,isiz3)
-
- common/cvar/ u,
- > rsd,
- > frct,
- > flux
-
-
-c---------------------------------------------------------------------
-c output control parameters
-c---------------------------------------------------------------------
- integer ipr, inorm
-
- common/cprcon/ ipr, inorm
-
-c---------------------------------------------------------------------
-c newton-raphson iteration control parameters
-c---------------------------------------------------------------------
- integer itmax, invert
- double precision dt, omega, tolrsd(5),
- > rsdnm(5), errnm(5), frc, ttotal
-
- common/ctscon/ dt, omega, tolrsd,
- > rsdnm, errnm, frc, ttotal,
- > itmax, invert
-
- double precision a(5,5,isiz1,isiz2),
- > b(5,5,isiz1,isiz2),
- > c(5,5,isiz1,isiz2),
- > d(5,5,isiz1,isiz2)
-
- common/cjac/ a, b, c, d
-
-c---------------------------------------------------------------------
-c coefficients of the exact solution
-c---------------------------------------------------------------------
- double precision ce(5,13)
-
- common/cexact/ ce
-
-c---------------------------------------------------------------------
-c multi-processor common blocks
-c---------------------------------------------------------------------
- integer id, ndim, num, xdim, ydim, row, col
- common/dim/ id,ndim,num,xdim,ydim,row,col
-
- integer north,south,east,west
- common/neigh/ north,south,east, west
-
- integer from_s,from_n,from_e,from_w
- parameter (from_s=1,from_n=2,from_e=3,from_w=4)
-
- integer npmax
- parameter (npmax=isiz01+isiz02)
-
- logical icommn(npmax+1),icomms(npmax+1),
- > icomme(npmax+1),icommw(npmax+1)
- double precision buf(5,2*isiz2*isiz3),
- > buf1(5,2*isiz2*isiz3)
-
- common/comm/ buf, buf1,
- > icommn,icomms,
- > icomme,icommw
-
- double precision maxtime
- common/timer/maxtime
-
-
-c---------------------------------------------------------------------
-c end of include file
-c---------------------------------------------------------------------
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine bcast_inputs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer ierr
-
-c---------------------------------------------------------------------
-c root broadcasts the data
-c The data isn't contiguous or of the same type, so it's not
-c clear how to send it in the "MPI" way.
-c We could pack the info into a buffer or we could create
-c an obscene datatype to handle it all at once. Since we only
-c broadcast the data once, just use a separate broadcast for
-c each piece.
-c---------------------------------------------------------------------
- call MPI_BCAST(ipr, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
- call MPI_BCAST(inorm, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
- call MPI_BCAST(itmax, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
- call MPI_BCAST(dt, 1, dp_type, root, MPI_COMM_WORLD, ierr)
- call MPI_BCAST(omega, 1, dp_type, root, MPI_COMM_WORLD, ierr)
- call MPI_BCAST(tolrsd, 5, dp_type, root, MPI_COMM_WORLD, ierr)
- call MPI_BCAST(nx0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
- call MPI_BCAST(ny0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
- call MPI_BCAST(nz0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr)
-
- return
- end
-
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine blts ( ldmx, ldmy, ldmz,
- > nx, ny, nz, k,
- > omega,
- > v,
- > ldz, ldy, ldx, d,
- > ist, iend, jst, jend,
- > nx0, ny0, ipt, jpt)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c compute the regular-sparse, block lower triangular solution:
-c
-c v <-- ( L-inv ) * v
-c
-c---------------------------------------------------------------------
-
- implicit none
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- integer ldmx, ldmy, ldmz
- integer nx, ny, nz
- integer k
- double precision omega
- double precision v( 5, -1:ldmx+2, -1:ldmy+2, *),
- > ldz( 5, 5, ldmx, ldmy),
- > ldy( 5, 5, ldmx, ldmy),
- > ldx( 5, 5, ldmx, ldmy),
- > d( 5, 5, ldmx, ldmy)
- integer ist, iend
- integer jst, jend
- integer nx0, ny0
- integer ipt, jpt
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, m
- integer iex
- double precision tmp, tmp1
- double precision tmat(5,5)
-
-
-c---------------------------------------------------------------------
-c receive data from north and west
-c---------------------------------------------------------------------
- iex = 0
- call exchange_1( v,k,iex )
-
-
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
-
- v( m, i, j, k ) = v( m, i, j, k )
- > - omega * ( ldz( m, 1, i, j ) * v( 1, i, j, k-1 )
- > + ldz( m, 2, i, j ) * v( 2, i, j, k-1 )
- > + ldz( m, 3, i, j ) * v( 3, i, j, k-1 )
- > + ldz( m, 4, i, j ) * v( 4, i, j, k-1 )
- > + ldz( m, 5, i, j ) * v( 5, i, j, k-1 ) )
-
- end do
- end do
- end do
-
-
- do j=jst,jend
- do i = ist, iend
-
- do m = 1, 5
-
- v( m, i, j, k ) = v( m, i, j, k )
- > - omega * ( ldy( m, 1, i, j ) * v( 1, i, j-1, k )
- > + ldx( m, 1, i, j ) * v( 1, i-1, j, k )
- > + ldy( m, 2, i, j ) * v( 2, i, j-1, k )
- > + ldx( m, 2, i, j ) * v( 2, i-1, j, k )
- > + ldy( m, 3, i, j ) * v( 3, i, j-1, k )
- > + ldx( m, 3, i, j ) * v( 3, i-1, j, k )
- > + ldy( m, 4, i, j ) * v( 4, i, j-1, k )
- > + ldx( m, 4, i, j ) * v( 4, i-1, j, k )
- > + ldy( m, 5, i, j ) * v( 5, i, j-1, k )
- > + ldx( m, 5, i, j ) * v( 5, i-1, j, k ) )
-
- end do
-
-c---------------------------------------------------------------------
-c diagonal block inversion
-c
-c forward elimination
-c---------------------------------------------------------------------
- do m = 1, 5
- tmat( m, 1 ) = d( m, 1, i, j )
- tmat( m, 2 ) = d( m, 2, i, j )
- tmat( m, 3 ) = d( m, 3, i, j )
- tmat( m, 4 ) = d( m, 4, i, j )
- tmat( m, 5 ) = d( m, 5, i, j )
- end do
-
- tmp1 = 1.0d+00 / tmat( 1, 1 )
- tmp = tmp1 * tmat( 2, 1 )
- tmat( 2, 2 ) = tmat( 2, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 2, 3 ) = tmat( 2, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 2, 4 ) = tmat( 2, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 2, 5 ) = tmat( 2, 5 )
- > - tmp * tmat( 1, 5 )
- v( 2, i, j, k ) = v( 2, i, j, k )
- > - v( 1, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 3, 1 )
- tmat( 3, 2 ) = tmat( 3, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 3, 3 ) = tmat( 3, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 3, 4 ) = tmat( 3, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 3, 5 ) = tmat( 3, 5 )
- > - tmp * tmat( 1, 5 )
- v( 3, i, j, k ) = v( 3, i, j, k )
- > - v( 1, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 4, 1 )
- tmat( 4, 2 ) = tmat( 4, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 4, 3 ) = tmat( 4, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 1, 5 )
- v( 4, i, j, k ) = v( 4, i, j, k )
- > - v( 1, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 5, 1 )
- tmat( 5, 2 ) = tmat( 5, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 5, 3 ) = tmat( 5, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 1, 5 )
- v( 5, i, j, k ) = v( 5, i, j, k )
- > - v( 1, i, j, k ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 2, 2 )
- tmp = tmp1 * tmat( 3, 2 )
- tmat( 3, 3 ) = tmat( 3, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 3, 4 ) = tmat( 3, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 3, 5 ) = tmat( 3, 5 )
- > - tmp * tmat( 2, 5 )
- v( 3, i, j, k ) = v( 3, i, j, k )
- > - v( 2, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 4, 2 )
- tmat( 4, 3 ) = tmat( 4, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 2, 5 )
- v( 4, i, j, k ) = v( 4, i, j, k )
- > - v( 2, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 5, 2 )
- tmat( 5, 3 ) = tmat( 5, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 2, 5 )
- v( 5, i, j, k ) = v( 5, i, j, k )
- > - v( 2, i, j, k ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 3, 3 )
- tmp = tmp1 * tmat( 4, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 3, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 3, 5 )
- v( 4, i, j, k ) = v( 4, i, j, k )
- > - v( 3, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 5, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 3, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 3, 5 )
- v( 5, i, j, k ) = v( 5, i, j, k )
- > - v( 3, i, j, k ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 4, 4 )
- tmp = tmp1 * tmat( 5, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 4, 5 )
- v( 5, i, j, k ) = v( 5, i, j, k )
- > - v( 4, i, j, k ) * tmp
-
-c---------------------------------------------------------------------
-c back substitution
-c---------------------------------------------------------------------
- v( 5, i, j, k ) = v( 5, i, j, k )
- > / tmat( 5, 5 )
-
- v( 4, i, j, k ) = v( 4, i, j, k )
- > - tmat( 4, 5 ) * v( 5, i, j, k )
- v( 4, i, j, k ) = v( 4, i, j, k )
- > / tmat( 4, 4 )
-
- v( 3, i, j, k ) = v( 3, i, j, k )
- > - tmat( 3, 4 ) * v( 4, i, j, k )
- > - tmat( 3, 5 ) * v( 5, i, j, k )
- v( 3, i, j, k ) = v( 3, i, j, k )
- > / tmat( 3, 3 )
-
- v( 2, i, j, k ) = v( 2, i, j, k )
- > - tmat( 2, 3 ) * v( 3, i, j, k )
- > - tmat( 2, 4 ) * v( 4, i, j, k )
- > - tmat( 2, 5 ) * v( 5, i, j, k )
- v( 2, i, j, k ) = v( 2, i, j, k )
- > / tmat( 2, 2 )
-
- v( 1, i, j, k ) = v( 1, i, j, k )
- > - tmat( 1, 2 ) * v( 2, i, j, k )
- > - tmat( 1, 3 ) * v( 3, i, j, k )
- > - tmat( 1, 4 ) * v( 4, i, j, k )
- > - tmat( 1, 5 ) * v( 5, i, j, k )
- v( 1, i, j, k ) = v( 1, i, j, k )
- > / tmat( 1, 1 )
-
-
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c send data to east and south
-c---------------------------------------------------------------------
- iex = 2
- call exchange_1( v,k,iex )
-
- return
- end
-
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine blts ( ldmx, ldmy, ldmz,
- > nx, ny, nz, k,
- > omega,
- > v,
- > ldz, ldy, ldx, d,
- > ist, iend, jst, jend,
- > nx0, ny0, ipt, jpt)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c compute the regular-sparse, block lower triangular solution:
-c
-c v <-- ( L-inv ) * v
-c
-c---------------------------------------------------------------------
-
- implicit none
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- integer ldmx, ldmy, ldmz
- integer nx, ny, nz
- integer k
- double precision omega
- double precision v( 5, -1:ldmx+2, -1:ldmy+2, *),
- > ldz( 5, 5, ldmx, ldmy),
- > ldy( 5, 5, ldmx, ldmy),
- > ldx( 5, 5, ldmx, ldmy),
- > d( 5, 5, ldmx, ldmy)
- integer ist, iend
- integer jst, jend
- integer nx0, ny0
- integer ipt, jpt
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, m, l, istp, iendp
- integer iex
- double precision tmp, tmp1
- double precision tmat(5,5)
-
-
-c---------------------------------------------------------------------
-c receive data from north and west
-c---------------------------------------------------------------------
- iex = 0
- call exchange_1( v,k,iex )
-
-
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
-
- v( m, i, j, k ) = v( m, i, j, k )
- > - omega * ( ldz( m, 1, i, j ) * v( 1, i, j, k-1 )
- > + ldz( m, 2, i, j ) * v( 2, i, j, k-1 )
- > + ldz( m, 3, i, j ) * v( 3, i, j, k-1 )
- > + ldz( m, 4, i, j ) * v( 4, i, j, k-1 )
- > + ldz( m, 5, i, j ) * v( 5, i, j, k-1 ) )
-
- end do
- end do
- end do
-
-
- do l = ist+jst, iend+jend
- istp = max(l - jend, ist)
- iendp = min(l - jst, iend)
-
-!dir$ ivdep
- do i = istp, iendp
- j = l - i
-
-!!dir$ unroll 5
-! manually unroll the loop
-! do m = 1, 5
-
- v( 1, i, j, k ) = v( 1, i, j, k )
- > - omega * ( ldy( 1, 1, i, j ) * v( 1, i, j-1, k )
- > + ldx( 1, 1, i, j ) * v( 1, i-1, j, k )
- > + ldy( 1, 2, i, j ) * v( 2, i, j-1, k )
- > + ldx( 1, 2, i, j ) * v( 2, i-1, j, k )
- > + ldy( 1, 3, i, j ) * v( 3, i, j-1, k )
- > + ldx( 1, 3, i, j ) * v( 3, i-1, j, k )
- > + ldy( 1, 4, i, j ) * v( 4, i, j-1, k )
- > + ldx( 1, 4, i, j ) * v( 4, i-1, j, k )
- > + ldy( 1, 5, i, j ) * v( 5, i, j-1, k )
- > + ldx( 1, 5, i, j ) * v( 5, i-1, j, k ) )
- v( 2, i, j, k ) = v( 2, i, j, k )
- > - omega * ( ldy( 2, 1, i, j ) * v( 1, i, j-1, k )
- > + ldx( 2, 1, i, j ) * v( 1, i-1, j, k )
- > + ldy( 2, 2, i, j ) * v( 2, i, j-1, k )
- > + ldx( 2, 2, i, j ) * v( 2, i-1, j, k )
- > + ldy( 2, 3, i, j ) * v( 3, i, j-1, k )
- > + ldx( 2, 3, i, j ) * v( 3, i-1, j, k )
- > + ldy( 2, 4, i, j ) * v( 4, i, j-1, k )
- > + ldx( 2, 4, i, j ) * v( 4, i-1, j, k )
- > + ldy( 2, 5, i, j ) * v( 5, i, j-1, k )
- > + ldx( 2, 5, i, j ) * v( 5, i-1, j, k ) )
- v( 3, i, j, k ) = v( 3, i, j, k )
- > - omega * ( ldy( 3, 1, i, j ) * v( 1, i, j-1, k )
- > + ldx( 3, 1, i, j ) * v( 1, i-1, j, k )
- > + ldy( 3, 2, i, j ) * v( 2, i, j-1, k )
- > + ldx( 3, 2, i, j ) * v( 2, i-1, j, k )
- > + ldy( 3, 3, i, j ) * v( 3, i, j-1, k )
- > + ldx( 3, 3, i, j ) * v( 3, i-1, j, k )
- > + ldy( 3, 4, i, j ) * v( 4, i, j-1, k )
- > + ldx( 3, 4, i, j ) * v( 4, i-1, j, k )
- > + ldy( 3, 5, i, j ) * v( 5, i, j-1, k )
- > + ldx( 3, 5, i, j ) * v( 5, i-1, j, k ) )
- v( 4, i, j, k ) = v( 4, i, j, k )
- > - omega * ( ldy( 4, 1, i, j ) * v( 1, i, j-1, k )
- > + ldx( 4, 1, i, j ) * v( 1, i-1, j, k )
- > + ldy( 4, 2, i, j ) * v( 2, i, j-1, k )
- > + ldx( 4, 2, i, j ) * v( 2, i-1, j, k )
- > + ldy( 4, 3, i, j ) * v( 3, i, j-1, k )
- > + ldx( 4, 3, i, j ) * v( 3, i-1, j, k )
- > + ldy( 4, 4, i, j ) * v( 4, i, j-1, k )
- > + ldx( 4, 4, i, j ) * v( 4, i-1, j, k )
- > + ldy( 4, 5, i, j ) * v( 5, i, j-1, k )
- > + ldx( 4, 5, i, j ) * v( 5, i-1, j, k ) )
- v( 5, i, j, k ) = v( 5, i, j, k )
- > - omega * ( ldy( 5, 1, i, j ) * v( 1, i, j-1, k )
- > + ldx( 5, 1, i, j ) * v( 1, i-1, j, k )
- > + ldy( 5, 2, i, j ) * v( 2, i, j-1, k )
- > + ldx( 5, 2, i, j ) * v( 2, i-1, j, k )
- > + ldy( 5, 3, i, j ) * v( 3, i, j-1, k )
- > + ldx( 5, 3, i, j ) * v( 3, i-1, j, k )
- > + ldy( 5, 4, i, j ) * v( 4, i, j-1, k )
- > + ldx( 5, 4, i, j ) * v( 4, i-1, j, k )
- > + ldy( 5, 5, i, j ) * v( 5, i, j-1, k )
- > + ldx( 5, 5, i, j ) * v( 5, i-1, j, k ) )
-
-! end do
-
-c---------------------------------------------------------------------
-c diagonal block inversion
-c
-c forward elimination
-c---------------------------------------------------------------------
-!!dir$ unroll 5
-! manually unroll the loop
-! do m = 1, 5
- tmat( 1, 1 ) = d( 1, 1, i, j )
- tmat( 1, 2 ) = d( 1, 2, i, j )
- tmat( 1, 3 ) = d( 1, 3, i, j )
- tmat( 1, 4 ) = d( 1, 4, i, j )
- tmat( 1, 5 ) = d( 1, 5, i, j )
- tmat( 2, 1 ) = d( 2, 1, i, j )
- tmat( 2, 2 ) = d( 2, 2, i, j )
- tmat( 2, 3 ) = d( 2, 3, i, j )
- tmat( 2, 4 ) = d( 2, 4, i, j )
- tmat( 2, 5 ) = d( 2, 5, i, j )
- tmat( 3, 1 ) = d( 3, 1, i, j )
- tmat( 3, 2 ) = d( 3, 2, i, j )
- tmat( 3, 3 ) = d( 3, 3, i, j )
- tmat( 3, 4 ) = d( 3, 4, i, j )
- tmat( 3, 5 ) = d( 3, 5, i, j )
- tmat( 4, 1 ) = d( 4, 1, i, j )
- tmat( 4, 2 ) = d( 4, 2, i, j )
- tmat( 4, 3 ) = d( 4, 3, i, j )
- tmat( 4, 4 ) = d( 4, 4, i, j )
- tmat( 4, 5 ) = d( 4, 5, i, j )
- tmat( 5, 1 ) = d( 5, 1, i, j )
- tmat( 5, 2 ) = d( 5, 2, i, j )
- tmat( 5, 3 ) = d( 5, 3, i, j )
- tmat( 5, 4 ) = d( 5, 4, i, j )
- tmat( 5, 5 ) = d( 5, 5, i, j )
-! end do
-
- tmp1 = 1.0d+00 / tmat( 1, 1 )
- tmp = tmp1 * tmat( 2, 1 )
- tmat( 2, 2 ) = tmat( 2, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 2, 3 ) = tmat( 2, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 2, 4 ) = tmat( 2, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 2, 5 ) = tmat( 2, 5 )
- > - tmp * tmat( 1, 5 )
- v( 2, i, j, k ) = v( 2, i, j, k )
- > - v( 1, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 3, 1 )
- tmat( 3, 2 ) = tmat( 3, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 3, 3 ) = tmat( 3, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 3, 4 ) = tmat( 3, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 3, 5 ) = tmat( 3, 5 )
- > - tmp * tmat( 1, 5 )
- v( 3, i, j, k ) = v( 3, i, j, k )
- > - v( 1, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 4, 1 )
- tmat( 4, 2 ) = tmat( 4, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 4, 3 ) = tmat( 4, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 1, 5 )
- v( 4, i, j, k ) = v( 4, i, j, k )
- > - v( 1, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 5, 1 )
- tmat( 5, 2 ) = tmat( 5, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 5, 3 ) = tmat( 5, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 1, 5 )
- v( 5, i, j, k ) = v( 5, i, j, k )
- > - v( 1, i, j, k ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 2, 2 )
- tmp = tmp1 * tmat( 3, 2 )
- tmat( 3, 3 ) = tmat( 3, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 3, 4 ) = tmat( 3, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 3, 5 ) = tmat( 3, 5 )
- > - tmp * tmat( 2, 5 )
- v( 3, i, j, k ) = v( 3, i, j, k )
- > - v( 2, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 4, 2 )
- tmat( 4, 3 ) = tmat( 4, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 2, 5 )
- v( 4, i, j, k ) = v( 4, i, j, k )
- > - v( 2, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 5, 2 )
- tmat( 5, 3 ) = tmat( 5, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 2, 5 )
- v( 5, i, j, k ) = v( 5, i, j, k )
- > - v( 2, i, j, k ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 3, 3 )
- tmp = tmp1 * tmat( 4, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 3, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 3, 5 )
- v( 4, i, j, k ) = v( 4, i, j, k )
- > - v( 3, i, j, k ) * tmp
-
- tmp = tmp1 * tmat( 5, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 3, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 3, 5 )
- v( 5, i, j, k ) = v( 5, i, j, k )
- > - v( 3, i, j, k ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 4, 4 )
- tmp = tmp1 * tmat( 5, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 4, 5 )
- v( 5, i, j, k ) = v( 5, i, j, k )
- > - v( 4, i, j, k ) * tmp
-
-c---------------------------------------------------------------------
-c back substitution
-c---------------------------------------------------------------------
- v( 5, i, j, k ) = v( 5, i, j, k )
- > / tmat( 5, 5 )
-
- v( 4, i, j, k ) = v( 4, i, j, k )
- > - tmat( 4, 5 ) * v( 5, i, j, k )
- v( 4, i, j, k ) = v( 4, i, j, k )
- > / tmat( 4, 4 )
-
- v( 3, i, j, k ) = v( 3, i, j, k )
- > - tmat( 3, 4 ) * v( 4, i, j, k )
- > - tmat( 3, 5 ) * v( 5, i, j, k )
- v( 3, i, j, k ) = v( 3, i, j, k )
- > / tmat( 3, 3 )
-
- v( 2, i, j, k ) = v( 2, i, j, k )
- > - tmat( 2, 3 ) * v( 3, i, j, k )
- > - tmat( 2, 4 ) * v( 4, i, j, k )
- > - tmat( 2, 5 ) * v( 5, i, j, k )
- v( 2, i, j, k ) = v( 2, i, j, k )
- > / tmat( 2, 2 )
-
- v( 1, i, j, k ) = v( 1, i, j, k )
- > - tmat( 1, 2 ) * v( 2, i, j, k )
- > - tmat( 1, 3 ) * v( 3, i, j, k )
- > - tmat( 1, 4 ) * v( 4, i, j, k )
- > - tmat( 1, 5 ) * v( 5, i, j, k )
- v( 1, i, j, k ) = v( 1, i, j, k )
- > / tmat( 1, 1 )
-
-
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c send data to east and south
-c---------------------------------------------------------------------
- iex = 2
- call exchange_1( v,k,iex )
-
- return
- end
-
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine buts( ldmx, ldmy, ldmz,
- > nx, ny, nz, k,
- > omega,
- > v, tv,
- > d, udx, udy, udz,
- > ist, iend, jst, jend,
- > nx0, ny0, ipt, jpt )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c compute the regular-sparse, block upper triangular solution:
-c
-c v <-- ( U-inv ) * v
-c
-c---------------------------------------------------------------------
-
- implicit none
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- integer ldmx, ldmy, ldmz
- integer nx, ny, nz
- integer k
- double precision omega
- double precision v( 5, -1:ldmx+2, -1:ldmy+2, *),
- > tv(5, ldmx, ldmy),
- > d( 5, 5, ldmx, ldmy),
- > udx( 5, 5, ldmx, ldmy),
- > udy( 5, 5, ldmx, ldmy),
- > udz( 5, 5, ldmx, ldmy )
- integer ist, iend
- integer jst, jend
- integer nx0, ny0
- integer ipt, jpt
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, m
- integer iex
- double precision tmp, tmp1
- double precision tmat(5,5)
-
-
-c---------------------------------------------------------------------
-c receive data from south and east
-c---------------------------------------------------------------------
- iex = 1
- call exchange_1( v,k,iex )
-
- do j = jend, jst, -1
- do i = iend, ist, -1
- do m = 1, 5
- tv( m, i, j ) =
- > omega * ( udz( m, 1, i, j ) * v( 1, i, j, k+1 )
- > + udz( m, 2, i, j ) * v( 2, i, j, k+1 )
- > + udz( m, 3, i, j ) * v( 3, i, j, k+1 )
- > + udz( m, 4, i, j ) * v( 4, i, j, k+1 )
- > + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) )
- end do
- end do
- end do
-
-
- do j = jend,jst,-1
- do i = iend,ist,-1
-
- do m = 1, 5
- tv( m, i, j ) = tv( m, i, j )
- > + omega * ( udy( m, 1, i, j ) * v( 1, i, j+1, k )
- > + udx( m, 1, i, j ) * v( 1, i+1, j, k )
- > + udy( m, 2, i, j ) * v( 2, i, j+1, k )
- > + udx( m, 2, i, j ) * v( 2, i+1, j, k )
- > + udy( m, 3, i, j ) * v( 3, i, j+1, k )
- > + udx( m, 3, i, j ) * v( 3, i+1, j, k )
- > + udy( m, 4, i, j ) * v( 4, i, j+1, k )
- > + udx( m, 4, i, j ) * v( 4, i+1, j, k )
- > + udy( m, 5, i, j ) * v( 5, i, j+1, k )
- > + udx( m, 5, i, j ) * v( 5, i+1, j, k ) )
- end do
-
-c---------------------------------------------------------------------
-c diagonal block inversion
-c---------------------------------------------------------------------
- do m = 1, 5
- tmat( m, 1 ) = d( m, 1, i, j )
- tmat( m, 2 ) = d( m, 2, i, j )
- tmat( m, 3 ) = d( m, 3, i, j )
- tmat( m, 4 ) = d( m, 4, i, j )
- tmat( m, 5 ) = d( m, 5, i, j )
- end do
-
- tmp1 = 1.0d+00 / tmat( 1, 1 )
- tmp = tmp1 * tmat( 2, 1 )
- tmat( 2, 2 ) = tmat( 2, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 2, 3 ) = tmat( 2, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 2, 4 ) = tmat( 2, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 2, 5 ) = tmat( 2, 5 )
- > - tmp * tmat( 1, 5 )
- tv( 2, i, j ) = tv( 2, i, j )
- > - tv( 1, i, j ) * tmp
-
- tmp = tmp1 * tmat( 3, 1 )
- tmat( 3, 2 ) = tmat( 3, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 3, 3 ) = tmat( 3, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 3, 4 ) = tmat( 3, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 3, 5 ) = tmat( 3, 5 )
- > - tmp * tmat( 1, 5 )
- tv( 3, i, j ) = tv( 3, i, j )
- > - tv( 1, i, j ) * tmp
-
- tmp = tmp1 * tmat( 4, 1 )
- tmat( 4, 2 ) = tmat( 4, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 4, 3 ) = tmat( 4, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 1, 5 )
- tv( 4, i, j ) = tv( 4, i, j )
- > - tv( 1, i, j ) * tmp
-
- tmp = tmp1 * tmat( 5, 1 )
- tmat( 5, 2 ) = tmat( 5, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 5, 3 ) = tmat( 5, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 1, 5 )
- tv( 5, i, j ) = tv( 5, i, j )
- > - tv( 1, i, j ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 2, 2 )
- tmp = tmp1 * tmat( 3, 2 )
- tmat( 3, 3 ) = tmat( 3, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 3, 4 ) = tmat( 3, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 3, 5 ) = tmat( 3, 5 )
- > - tmp * tmat( 2, 5 )
- tv( 3, i, j ) = tv( 3, i, j )
- > - tv( 2, i, j ) * tmp
-
- tmp = tmp1 * tmat( 4, 2 )
- tmat( 4, 3 ) = tmat( 4, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 2, 5 )
- tv( 4, i, j ) = tv( 4, i, j )
- > - tv( 2, i, j ) * tmp
-
- tmp = tmp1 * tmat( 5, 2 )
- tmat( 5, 3 ) = tmat( 5, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 2, 5 )
- tv( 5, i, j ) = tv( 5, i, j )
- > - tv( 2, i, j ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 3, 3 )
- tmp = tmp1 * tmat( 4, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 3, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 3, 5 )
- tv( 4, i, j ) = tv( 4, i, j )
- > - tv( 3, i, j ) * tmp
-
- tmp = tmp1 * tmat( 5, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 3, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 3, 5 )
- tv( 5, i, j ) = tv( 5, i, j )
- > - tv( 3, i, j ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 4, 4 )
- tmp = tmp1 * tmat( 5, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 4, 5 )
- tv( 5, i, j ) = tv( 5, i, j )
- > - tv( 4, i, j ) * tmp
-
-c---------------------------------------------------------------------
-c back substitution
-c---------------------------------------------------------------------
- tv( 5, i, j ) = tv( 5, i, j )
- > / tmat( 5, 5 )
-
- tv( 4, i, j ) = tv( 4, i, j )
- > - tmat( 4, 5 ) * tv( 5, i, j )
- tv( 4, i, j ) = tv( 4, i, j )
- > / tmat( 4, 4 )
-
- tv( 3, i, j ) = tv( 3, i, j )
- > - tmat( 3, 4 ) * tv( 4, i, j )
- > - tmat( 3, 5 ) * tv( 5, i, j )
- tv( 3, i, j ) = tv( 3, i, j )
- > / tmat( 3, 3 )
-
- tv( 2, i, j ) = tv( 2, i, j )
- > - tmat( 2, 3 ) * tv( 3, i, j )
- > - tmat( 2, 4 ) * tv( 4, i, j )
- > - tmat( 2, 5 ) * tv( 5, i, j )
- tv( 2, i, j ) = tv( 2, i, j )
- > / tmat( 2, 2 )
-
- tv( 1, i, j ) = tv( 1, i, j )
- > - tmat( 1, 2 ) * tv( 2, i, j )
- > - tmat( 1, 3 ) * tv( 3, i, j )
- > - tmat( 1, 4 ) * tv( 4, i, j )
- > - tmat( 1, 5 ) * tv( 5, i, j )
- tv( 1, i, j ) = tv( 1, i, j )
- > / tmat( 1, 1 )
-
- v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j )
- v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j )
- v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j )
- v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j )
- v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j )
-
-
- enddo
- end do
-
-c---------------------------------------------------------------------
-c send data to north and west
-c---------------------------------------------------------------------
- iex = 3
- call exchange_1( v,k,iex )
-
- return
- end
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine buts( ldmx, ldmy, ldmz,
- > nx, ny, nz, k,
- > omega,
- > v, tv,
- > d, udx, udy, udz,
- > ist, iend, jst, jend,
- > nx0, ny0, ipt, jpt )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c compute the regular-sparse, block upper triangular solution:
-c
-c v <-- ( U-inv ) * v
-c
-c---------------------------------------------------------------------
-
- implicit none
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- integer ldmx, ldmy, ldmz
- integer nx, ny, nz
- integer k
- double precision omega
- double precision v( 5, -1:ldmx+2, -1:ldmy+2, *),
- > tv(5, ldmx, ldmy),
- > d( 5, 5, ldmx, ldmy),
- > udx( 5, 5, ldmx, ldmy),
- > udy( 5, 5, ldmx, ldmy),
- > udz( 5, 5, ldmx, ldmy )
- integer ist, iend
- integer jst, jend
- integer nx0, ny0
- integer ipt, jpt
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, m, l, istp, iendp
- integer iex
- double precision tmp, tmp1
- double precision tmat(5,5)
-
-
-c---------------------------------------------------------------------
-c receive data from south and east
-c---------------------------------------------------------------------
- iex = 1
- call exchange_1( v,k,iex )
-
- do j = jend, jst, -1
- do i = iend, ist, -1
- do m = 1, 5
- tv( m, i, j ) =
- > omega * ( udz( m, 1, i, j ) * v( 1, i, j, k+1 )
- > + udz( m, 2, i, j ) * v( 2, i, j, k+1 )
- > + udz( m, 3, i, j ) * v( 3, i, j, k+1 )
- > + udz( m, 4, i, j ) * v( 4, i, j, k+1 )
- > + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) )
- end do
- end do
- end do
-
-
- do l = iend+jend, ist+jst, -1
- istp = max(l - jend, ist)
- iendp = min(l - jst, iend)
-
-!dir$ ivdep
- do i = istp, iendp
- j = l - i
-
-!!dir$ unroll 5
-! manually unroll the loop
-! do m = 1, 5
- tv( 1, i, j ) = tv( 1, i, j )
- > + omega * ( udy( 1, 1, i, j ) * v( 1, i, j+1, k )
- > + udx( 1, 1, i, j ) * v( 1, i+1, j, k )
- > + udy( 1, 2, i, j ) * v( 2, i, j+1, k )
- > + udx( 1, 2, i, j ) * v( 2, i+1, j, k )
- > + udy( 1, 3, i, j ) * v( 3, i, j+1, k )
- > + udx( 1, 3, i, j ) * v( 3, i+1, j, k )
- > + udy( 1, 4, i, j ) * v( 4, i, j+1, k )
- > + udx( 1, 4, i, j ) * v( 4, i+1, j, k )
- > + udy( 1, 5, i, j ) * v( 5, i, j+1, k )
- > + udx( 1, 5, i, j ) * v( 5, i+1, j, k ) )
- tv( 2, i, j ) = tv( 2, i, j )
- > + omega * ( udy( 2, 1, i, j ) * v( 1, i, j+1, k )
- > + udx( 2, 1, i, j ) * v( 1, i+1, j, k )
- > + udy( 2, 2, i, j ) * v( 2, i, j+1, k )
- > + udx( 2, 2, i, j ) * v( 2, i+1, j, k )
- > + udy( 2, 3, i, j ) * v( 3, i, j+1, k )
- > + udx( 2, 3, i, j ) * v( 3, i+1, j, k )
- > + udy( 2, 4, i, j ) * v( 4, i, j+1, k )
- > + udx( 2, 4, i, j ) * v( 4, i+1, j, k )
- > + udy( 2, 5, i, j ) * v( 5, i, j+1, k )
- > + udx( 2, 5, i, j ) * v( 5, i+1, j, k ) )
- tv( 3, i, j ) = tv( 3, i, j )
- > + omega * ( udy( 3, 1, i, j ) * v( 1, i, j+1, k )
- > + udx( 3, 1, i, j ) * v( 1, i+1, j, k )
- > + udy( 3, 2, i, j ) * v( 2, i, j+1, k )
- > + udx( 3, 2, i, j ) * v( 2, i+1, j, k )
- > + udy( 3, 3, i, j ) * v( 3, i, j+1, k )
- > + udx( 3, 3, i, j ) * v( 3, i+1, j, k )
- > + udy( 3, 4, i, j ) * v( 4, i, j+1, k )
- > + udx( 3, 4, i, j ) * v( 4, i+1, j, k )
- > + udy( 3, 5, i, j ) * v( 5, i, j+1, k )
- > + udx( 3, 5, i, j ) * v( 5, i+1, j, k ) )
- tv( 4, i, j ) = tv( 4, i, j )
- > + omega * ( udy( 4, 1, i, j ) * v( 1, i, j+1, k )
- > + udx( 4, 1, i, j ) * v( 1, i+1, j, k )
- > + udy( 4, 2, i, j ) * v( 2, i, j+1, k )
- > + udx( 4, 2, i, j ) * v( 2, i+1, j, k )
- > + udy( 4, 3, i, j ) * v( 3, i, j+1, k )
- > + udx( 4, 3, i, j ) * v( 3, i+1, j, k )
- > + udy( 4, 4, i, j ) * v( 4, i, j+1, k )
- > + udx( 4, 4, i, j ) * v( 4, i+1, j, k )
- > + udy( 4, 5, i, j ) * v( 5, i, j+1, k )
- > + udx( 4, 5, i, j ) * v( 5, i+1, j, k ) )
- tv( 5, i, j ) = tv( 5, i, j )
- > + omega * ( udy( 5, 1, i, j ) * v( 1, i, j+1, k )
- > + udx( 5, 1, i, j ) * v( 1, i+1, j, k )
- > + udy( 5, 2, i, j ) * v( 2, i, j+1, k )
- > + udx( 5, 2, i, j ) * v( 2, i+1, j, k )
- > + udy( 5, 3, i, j ) * v( 3, i, j+1, k )
- > + udx( 5, 3, i, j ) * v( 3, i+1, j, k )
- > + udy( 5, 4, i, j ) * v( 4, i, j+1, k )
- > + udx( 5, 4, i, j ) * v( 4, i+1, j, k )
- > + udy( 5, 5, i, j ) * v( 5, i, j+1, k )
- > + udx( 5, 5, i, j ) * v( 5, i+1, j, k ) )
-! end do
-
-c---------------------------------------------------------------------
-c diagonal block inversion
-c---------------------------------------------------------------------
-!!dir$ unroll 5
-! manually unroll the loop
-! do m = 1, 5
- tmat( 1, 1 ) = d( 1, 1, i, j )
- tmat( 1, 2 ) = d( 1, 2, i, j )
- tmat( 1, 3 ) = d( 1, 3, i, j )
- tmat( 1, 4 ) = d( 1, 4, i, j )
- tmat( 1, 5 ) = d( 1, 5, i, j )
- tmat( 2, 1 ) = d( 2, 1, i, j )
- tmat( 2, 2 ) = d( 2, 2, i, j )
- tmat( 2, 3 ) = d( 2, 3, i, j )
- tmat( 2, 4 ) = d( 2, 4, i, j )
- tmat( 2, 5 ) = d( 2, 5, i, j )
- tmat( 3, 1 ) = d( 3, 1, i, j )
- tmat( 3, 2 ) = d( 3, 2, i, j )
- tmat( 3, 3 ) = d( 3, 3, i, j )
- tmat( 3, 4 ) = d( 3, 4, i, j )
- tmat( 3, 5 ) = d( 3, 5, i, j )
- tmat( 4, 1 ) = d( 4, 1, i, j )
- tmat( 4, 2 ) = d( 4, 2, i, j )
- tmat( 4, 3 ) = d( 4, 3, i, j )
- tmat( 4, 4 ) = d( 4, 4, i, j )
- tmat( 4, 5 ) = d( 4, 5, i, j )
- tmat( 5, 1 ) = d( 5, 1, i, j )
- tmat( 5, 2 ) = d( 5, 2, i, j )
- tmat( 5, 3 ) = d( 5, 3, i, j )
- tmat( 5, 4 ) = d( 5, 4, i, j )
- tmat( 5, 5 ) = d( 5, 5, i, j )
-! end do
-
- tmp1 = 1.0d+00 / tmat( 1, 1 )
- tmp = tmp1 * tmat( 2, 1 )
- tmat( 2, 2 ) = tmat( 2, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 2, 3 ) = tmat( 2, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 2, 4 ) = tmat( 2, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 2, 5 ) = tmat( 2, 5 )
- > - tmp * tmat( 1, 5 )
- tv( 2, i, j ) = tv( 2, i, j )
- > - tv( 1, i, j ) * tmp
-
- tmp = tmp1 * tmat( 3, 1 )
- tmat( 3, 2 ) = tmat( 3, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 3, 3 ) = tmat( 3, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 3, 4 ) = tmat( 3, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 3, 5 ) = tmat( 3, 5 )
- > - tmp * tmat( 1, 5 )
- tv( 3, i, j ) = tv( 3, i, j )
- > - tv( 1, i, j ) * tmp
-
- tmp = tmp1 * tmat( 4, 1 )
- tmat( 4, 2 ) = tmat( 4, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 4, 3 ) = tmat( 4, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 1, 5 )
- tv( 4, i, j ) = tv( 4, i, j )
- > - tv( 1, i, j ) * tmp
-
- tmp = tmp1 * tmat( 5, 1 )
- tmat( 5, 2 ) = tmat( 5, 2 )
- > - tmp * tmat( 1, 2 )
- tmat( 5, 3 ) = tmat( 5, 3 )
- > - tmp * tmat( 1, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 1, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 1, 5 )
- tv( 5, i, j ) = tv( 5, i, j )
- > - tv( 1, i, j ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 2, 2 )
- tmp = tmp1 * tmat( 3, 2 )
- tmat( 3, 3 ) = tmat( 3, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 3, 4 ) = tmat( 3, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 3, 5 ) = tmat( 3, 5 )
- > - tmp * tmat( 2, 5 )
- tv( 3, i, j ) = tv( 3, i, j )
- > - tv( 2, i, j ) * tmp
-
- tmp = tmp1 * tmat( 4, 2 )
- tmat( 4, 3 ) = tmat( 4, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 2, 5 )
- tv( 4, i, j ) = tv( 4, i, j )
- > - tv( 2, i, j ) * tmp
-
- tmp = tmp1 * tmat( 5, 2 )
- tmat( 5, 3 ) = tmat( 5, 3 )
- > - tmp * tmat( 2, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 2, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 2, 5 )
- tv( 5, i, j ) = tv( 5, i, j )
- > - tv( 2, i, j ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 3, 3 )
- tmp = tmp1 * tmat( 4, 3 )
- tmat( 4, 4 ) = tmat( 4, 4 )
- > - tmp * tmat( 3, 4 )
- tmat( 4, 5 ) = tmat( 4, 5 )
- > - tmp * tmat( 3, 5 )
- tv( 4, i, j ) = tv( 4, i, j )
- > - tv( 3, i, j ) * tmp
-
- tmp = tmp1 * tmat( 5, 3 )
- tmat( 5, 4 ) = tmat( 5, 4 )
- > - tmp * tmat( 3, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 3, 5 )
- tv( 5, i, j ) = tv( 5, i, j )
- > - tv( 3, i, j ) * tmp
-
-
-
- tmp1 = 1.0d+00 / tmat( 4, 4 )
- tmp = tmp1 * tmat( 5, 4 )
- tmat( 5, 5 ) = tmat( 5, 5 )
- > - tmp * tmat( 4, 5 )
- tv( 5, i, j ) = tv( 5, i, j )
- > - tv( 4, i, j ) * tmp
-
-c---------------------------------------------------------------------
-c back substitution
-c---------------------------------------------------------------------
- tv( 5, i, j ) = tv( 5, i, j )
- > / tmat( 5, 5 )
-
- tv( 4, i, j ) = tv( 4, i, j )
- > - tmat( 4, 5 ) * tv( 5, i, j )
- tv( 4, i, j ) = tv( 4, i, j )
- > / tmat( 4, 4 )
-
- tv( 3, i, j ) = tv( 3, i, j )
- > - tmat( 3, 4 ) * tv( 4, i, j )
- > - tmat( 3, 5 ) * tv( 5, i, j )
- tv( 3, i, j ) = tv( 3, i, j )
- > / tmat( 3, 3 )
-
- tv( 2, i, j ) = tv( 2, i, j )
- > - tmat( 2, 3 ) * tv( 3, i, j )
- > - tmat( 2, 4 ) * tv( 4, i, j )
- > - tmat( 2, 5 ) * tv( 5, i, j )
- tv( 2, i, j ) = tv( 2, i, j )
- > / tmat( 2, 2 )
-
- tv( 1, i, j ) = tv( 1, i, j )
- > - tmat( 1, 2 ) * tv( 2, i, j )
- > - tmat( 1, 3 ) * tv( 3, i, j )
- > - tmat( 1, 4 ) * tv( 4, i, j )
- > - tmat( 1, 5 ) * tv( 5, i, j )
- tv( 1, i, j ) = tv( 1, i, j )
- > / tmat( 1, 1 )
-
- v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j )
- v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j )
- v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j )
- v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j )
- v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j )
-
-
- enddo
- end do
-
-c---------------------------------------------------------------------
-c send data to north and west
-c---------------------------------------------------------------------
- iex = 3
- call exchange_1( v,k,iex )
-
- return
- end
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine erhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c compute the right hand side based on exact solution
-c
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, k, m
- integer iglob, jglob
- integer iex
- integer L1, L2
- integer ist1, iend1
- integer jst1, jend1
- double precision dsspm
- double precision xi, eta, zeta
- double precision q
- double precision u21, u31, u41
- double precision tmp
- double precision u21i, u31i, u41i, u51i
- double precision u21j, u31j, u41j, u51j
- double precision u21k, u31k, u41k, u51k
- double precision u21im1, u31im1, u41im1, u51im1
- double precision u21jm1, u31jm1, u41jm1, u51jm1
- double precision u21km1, u31km1, u41km1, u51km1
-
- dsspm = dssp
-
-
- do k = 1, nz
- do j = 1, ny
- do i = 1, nx
- do m = 1, 5
- frct( m, i, j, k ) = 0.0d+00
- end do
- end do
- end do
- end do
-
- do k = 1, nz
- zeta = ( dble(k-1) ) / ( nz - 1 )
- do j = 1, ny
- jglob = jpt + j
- eta = ( dble(jglob-1) ) / ( ny0 - 1 )
- do i = 1, nx
- iglob = ipt + i
- xi = ( dble(iglob-1) ) / ( nx0 - 1 )
- do m = 1, 5
- rsd(m,i,j,k) = ce(m,1)
- > + ce(m,2) * xi
- > + ce(m,3) * eta
- > + ce(m,4) * zeta
- > + ce(m,5) * xi * xi
- > + ce(m,6) * eta * eta
- > + ce(m,7) * zeta * zeta
- > + ce(m,8) * xi * xi * xi
- > + ce(m,9) * eta * eta * eta
- > + ce(m,10) * zeta * zeta * zeta
- > + ce(m,11) * xi * xi * xi * xi
- > + ce(m,12) * eta * eta * eta * eta
- > + ce(m,13) * zeta * zeta * zeta * zeta
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c xi-direction flux differences
-c---------------------------------------------------------------------
-c
-c iex = flag : iex = 0 north/south communication
-c : iex = 1 east/west communication
-c
-c---------------------------------------------------------------------
- iex = 0
-
-c---------------------------------------------------------------------
-c communicate and receive/send two rows of data
-c---------------------------------------------------------------------
- call exchange_3 (rsd,iex)
-
- L1 = 0
- if (north.eq.-1) L1 = 1
- L2 = nx + 1
- if (south.eq.-1) L2 = nx
-
- ist1 = 1
- iend1 = nx
- if (north.eq.-1) ist1 = 4
- if (south.eq.-1) iend1 = nx - 3
-
- do k = 2, nz - 1
- do j = jst, jend
- do i = L1, L2
- flux(1,i,j,k) = rsd(2,i,j,k)
- u21 = rsd(2,i,j,k) / rsd(1,i,j,k)
- q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k)
- > + rsd(3,i,j,k) * rsd(3,i,j,k)
- > + rsd(4,i,j,k) * rsd(4,i,j,k) )
- > / rsd(1,i,j,k)
- flux(2,i,j,k) = rsd(2,i,j,k) * u21 + c2 *
- > ( rsd(5,i,j,k) - q )
- flux(3,i,j,k) = rsd(3,i,j,k) * u21
- flux(4,i,j,k) = rsd(4,i,j,k) * u21
- flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u21
- end do
- end do
- end do
-
- do k = 2, nz - 1
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- frct(m,i,j,k) = frct(m,i,j,k)
- > - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) )
- end do
- end do
- do i = ist, L2
- tmp = 1.0d+00 / rsd(1,i,j,k)
-
- u21i = tmp * rsd(2,i,j,k)
- u31i = tmp * rsd(3,i,j,k)
- u41i = tmp * rsd(4,i,j,k)
- u51i = tmp * rsd(5,i,j,k)
-
- tmp = 1.0d+00 / rsd(1,i-1,j,k)
-
- u21im1 = tmp * rsd(2,i-1,j,k)
- u31im1 = tmp * rsd(3,i-1,j,k)
- u41im1 = tmp * rsd(4,i-1,j,k)
- u51im1 = tmp * rsd(5,i-1,j,k)
-
- flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 *
- > ( u21i - u21im1 )
- flux(3,i,j,k) = tx3 * ( u31i - u31im1 )
- flux(4,i,j,k) = tx3 * ( u41i - u41im1 )
- flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
- > * tx3 * ( ( u21i **2 + u31i **2 + u41i **2 )
- > - ( u21im1**2 + u31im1**2 + u41im1**2 ) )
- > + (1.0d+00/6.0d+00)
- > * tx3 * ( u21i**2 - u21im1**2 )
- > + c1 * c5 * tx3 * ( u51i - u51im1 )
- end do
-
- do i = ist, iend
- frct(1,i,j,k) = frct(1,i,j,k)
- > + dx1 * tx1 * ( rsd(1,i-1,j,k)
- > - 2.0d+00 * rsd(1,i,j,k)
- > + rsd(1,i+1,j,k) )
- frct(2,i,j,k) = frct(2,i,j,k)
- > + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) )
- > + dx2 * tx1 * ( rsd(2,i-1,j,k)
- > - 2.0d+00 * rsd(2,i,j,k)
- > + rsd(2,i+1,j,k) )
- frct(3,i,j,k) = frct(3,i,j,k)
- > + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) )
- > + dx3 * tx1 * ( rsd(3,i-1,j,k)
- > - 2.0d+00 * rsd(3,i,j,k)
- > + rsd(3,i+1,j,k) )
- frct(4,i,j,k) = frct(4,i,j,k)
- > + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) )
- > + dx4 * tx1 * ( rsd(4,i-1,j,k)
- > - 2.0d+00 * rsd(4,i,j,k)
- > + rsd(4,i+1,j,k) )
- frct(5,i,j,k) = frct(5,i,j,k)
- > + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) )
- > + dx5 * tx1 * ( rsd(5,i-1,j,k)
- > - 2.0d+00 * rsd(5,i,j,k)
- > + rsd(5,i+1,j,k) )
- end do
-
-c---------------------------------------------------------------------
-c Fourth-order dissipation
-c---------------------------------------------------------------------
- IF (north.eq.-1) then
- do m = 1, 5
- frct(m,2,j,k) = frct(m,2,j,k)
- > - dsspm * ( + 5.0d+00 * rsd(m,2,j,k)
- > - 4.0d+00 * rsd(m,3,j,k)
- > + rsd(m,4,j,k) )
- frct(m,3,j,k) = frct(m,3,j,k)
- > - dsspm * ( - 4.0d+00 * rsd(m,2,j,k)
- > + 6.0d+00 * rsd(m,3,j,k)
- > - 4.0d+00 * rsd(m,4,j,k)
- > + rsd(m,5,j,k) )
- end do
- END IF
-
- do i = ist1,iend1
- do m = 1, 5
- frct(m,i,j,k) = frct(m,i,j,k)
- > - dsspm * ( rsd(m,i-2,j,k)
- > - 4.0d+00 * rsd(m,i-1,j,k)
- > + 6.0d+00 * rsd(m,i,j,k)
- > - 4.0d+00 * rsd(m,i+1,j,k)
- > + rsd(m,i+2,j,k) )
- end do
- end do
-
- IF (south.eq.-1) then
- do m = 1, 5
- frct(m,nx-2,j,k) = frct(m,nx-2,j,k)
- > - dsspm * ( rsd(m,nx-4,j,k)
- > - 4.0d+00 * rsd(m,nx-3,j,k)
- > + 6.0d+00 * rsd(m,nx-2,j,k)
- > - 4.0d+00 * rsd(m,nx-1,j,k) )
- frct(m,nx-1,j,k) = frct(m,nx-1,j,k)
- > - dsspm * ( rsd(m,nx-3,j,k)
- > - 4.0d+00 * rsd(m,nx-2,j,k)
- > + 5.0d+00 * rsd(m,nx-1,j,k) )
- end do
- END IF
-
- end do
- end do
-
-c---------------------------------------------------------------------
-c eta-direction flux differences
-c---------------------------------------------------------------------
-c
-c iex = flag : iex = 0 north/south communication
-c : iex = 1 east/west communication
-c
-c---------------------------------------------------------------------
- iex = 1
-
-c---------------------------------------------------------------------
-c communicate and receive/send two rows of data
-c---------------------------------------------------------------------
- call exchange_3 (rsd,iex)
-
- L1 = 0
- if (west.eq.-1) L1 = 1
- L2 = ny + 1
- if (east.eq.-1) L2 = ny
-
- jst1 = 1
- jend1 = ny
- if (west.eq.-1) jst1 = 4
- if (east.eq.-1) jend1 = ny - 3
-
- do k = 2, nz - 1
- do j = L1, L2
- do i = ist, iend
- flux(1,i,j,k) = rsd(3,i,j,k)
- u31 = rsd(3,i,j,k) / rsd(1,i,j,k)
- q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k)
- > + rsd(3,i,j,k) * rsd(3,i,j,k)
- > + rsd(4,i,j,k) * rsd(4,i,j,k) )
- > / rsd(1,i,j,k)
- flux(2,i,j,k) = rsd(2,i,j,k) * u31
- flux(3,i,j,k) = rsd(3,i,j,k) * u31 + c2 *
- > ( rsd(5,i,j,k) - q )
- flux(4,i,j,k) = rsd(4,i,j,k) * u31
- flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u31
- end do
- end do
- end do
-
- do k = 2, nz - 1
- do i = ist, iend
- do j = jst, jend
- do m = 1, 5
- frct(m,i,j,k) = frct(m,i,j,k)
- > - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) )
- end do
- end do
- end do
-
- do j = jst, L2
- do i = ist, iend
- tmp = 1.0d+00 / rsd(1,i,j,k)
-
- u21j = tmp * rsd(2,i,j,k)
- u31j = tmp * rsd(3,i,j,k)
- u41j = tmp * rsd(4,i,j,k)
- u51j = tmp * rsd(5,i,j,k)
-
- tmp = 1.0d+00 / rsd(1,i,j-1,k)
-
- u21jm1 = tmp * rsd(2,i,j-1,k)
- u31jm1 = tmp * rsd(3,i,j-1,k)
- u41jm1 = tmp * rsd(4,i,j-1,k)
- u51jm1 = tmp * rsd(5,i,j-1,k)
-
- flux(2,i,j,k) = ty3 * ( u21j - u21jm1 )
- flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 *
- > ( u31j - u31jm1 )
- flux(4,i,j,k) = ty3 * ( u41j - u41jm1 )
- flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
- > * ty3 * ( ( u21j **2 + u31j **2 + u41j **2 )
- > - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) )
- > + (1.0d+00/6.0d+00)
- > * ty3 * ( u31j**2 - u31jm1**2 )
- > + c1 * c5 * ty3 * ( u51j - u51jm1 )
- end do
- end do
-
- do j = jst, jend
- do i = ist, iend
- frct(1,i,j,k) = frct(1,i,j,k)
- > + dy1 * ty1 * ( rsd(1,i,j-1,k)
- > - 2.0d+00 * rsd(1,i,j,k)
- > + rsd(1,i,j+1,k) )
- frct(2,i,j,k) = frct(2,i,j,k)
- > + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) )
- > + dy2 * ty1 * ( rsd(2,i,j-1,k)
- > - 2.0d+00 * rsd(2,i,j,k)
- > + rsd(2,i,j+1,k) )
- frct(3,i,j,k) = frct(3,i,j,k)
- > + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) )
- > + dy3 * ty1 * ( rsd(3,i,j-1,k)
- > - 2.0d+00 * rsd(3,i,j,k)
- > + rsd(3,i,j+1,k) )
- frct(4,i,j,k) = frct(4,i,j,k)
- > + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) )
- > + dy4 * ty1 * ( rsd(4,i,j-1,k)
- > - 2.0d+00 * rsd(4,i,j,k)
- > + rsd(4,i,j+1,k) )
- frct(5,i,j,k) = frct(5,i,j,k)
- > + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) )
- > + dy5 * ty1 * ( rsd(5,i,j-1,k)
- > - 2.0d+00 * rsd(5,i,j,k)
- > + rsd(5,i,j+1,k) )
- end do
- end do
-
-c---------------------------------------------------------------------
-c fourth-order dissipation
-c---------------------------------------------------------------------
- IF (west.eq.-1) then
- do i = ist, iend
- do m = 1, 5
- frct(m,i,2,k) = frct(m,i,2,k)
- > - dsspm * ( + 5.0d+00 * rsd(m,i,2,k)
- > - 4.0d+00 * rsd(m,i,3,k)
- > + rsd(m,i,4,k) )
- frct(m,i,3,k) = frct(m,i,3,k)
- > - dsspm * ( - 4.0d+00 * rsd(m,i,2,k)
- > + 6.0d+00 * rsd(m,i,3,k)
- > - 4.0d+00 * rsd(m,i,4,k)
- > + rsd(m,i,5,k) )
- end do
- end do
- END IF
-
- do j = jst1, jend1
- do i = ist, iend
- do m = 1, 5
- frct(m,i,j,k) = frct(m,i,j,k)
- > - dsspm * ( rsd(m,i,j-2,k)
- > - 4.0d+00 * rsd(m,i,j-1,k)
- > + 6.0d+00 * rsd(m,i,j,k)
- > - 4.0d+00 * rsd(m,i,j+1,k)
- > + rsd(m,i,j+2,k) )
- end do
- end do
- end do
-
- IF (east.eq.-1) then
- do i = ist, iend
- do m = 1, 5
- frct(m,i,ny-2,k) = frct(m,i,ny-2,k)
- > - dsspm * ( rsd(m,i,ny-4,k)
- > - 4.0d+00 * rsd(m,i,ny-3,k)
- > + 6.0d+00 * rsd(m,i,ny-2,k)
- > - 4.0d+00 * rsd(m,i,ny-1,k) )
- frct(m,i,ny-1,k) = frct(m,i,ny-1,k)
- > - dsspm * ( rsd(m,i,ny-3,k)
- > - 4.0d+00 * rsd(m,i,ny-2,k)
- > + 5.0d+00 * rsd(m,i,ny-1,k) )
- end do
- end do
- END IF
-
- end do
-
-c---------------------------------------------------------------------
-c zeta-direction flux differences
-c---------------------------------------------------------------------
- do k = 1, nz
- do j = jst, jend
- do i = ist, iend
- flux(1,i,j,k) = rsd(4,i,j,k)
- u41 = rsd(4,i,j,k) / rsd(1,i,j,k)
- q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k)
- > + rsd(3,i,j,k) * rsd(3,i,j,k)
- > + rsd(4,i,j,k) * rsd(4,i,j,k) )
- > / rsd(1,i,j,k)
- flux(2,i,j,k) = rsd(2,i,j,k) * u41
- flux(3,i,j,k) = rsd(3,i,j,k) * u41
- flux(4,i,j,k) = rsd(4,i,j,k) * u41 + c2 *
- > ( rsd(5,i,j,k) - q )
- flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u41
- end do
- end do
- end do
-
- do k = 2, nz - 1
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- frct(m,i,j,k) = frct(m,i,j,k)
- > - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) )
- end do
- end do
- end do
- end do
-
- do k = 2, nz
- do j = jst, jend
- do i = ist, iend
- tmp = 1.0d+00 / rsd(1,i,j,k)
-
- u21k = tmp * rsd(2,i,j,k)
- u31k = tmp * rsd(3,i,j,k)
- u41k = tmp * rsd(4,i,j,k)
- u51k = tmp * rsd(5,i,j,k)
-
- tmp = 1.0d+00 / rsd(1,i,j,k-1)
-
- u21km1 = tmp * rsd(2,i,j,k-1)
- u31km1 = tmp * rsd(3,i,j,k-1)
- u41km1 = tmp * rsd(4,i,j,k-1)
- u51km1 = tmp * rsd(5,i,j,k-1)
-
- flux(2,i,j,k) = tz3 * ( u21k - u21km1 )
- flux(3,i,j,k) = tz3 * ( u31k - u31km1 )
- flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * ( u41k
- > - u41km1 )
- flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
- > * tz3 * ( ( u21k **2 + u31k **2 + u41k **2 )
- > - ( u21km1**2 + u31km1**2 + u41km1**2 ) )
- > + (1.0d+00/6.0d+00)
- > * tz3 * ( u41k**2 - u41km1**2 )
- > + c1 * c5 * tz3 * ( u51k - u51km1 )
- end do
- end do
- end do
-
- do k = 2, nz - 1
- do j = jst, jend
- do i = ist, iend
- frct(1,i,j,k) = frct(1,i,j,k)
- > + dz1 * tz1 * ( rsd(1,i,j,k+1)
- > - 2.0d+00 * rsd(1,i,j,k)
- > + rsd(1,i,j,k-1) )
- frct(2,i,j,k) = frct(2,i,j,k)
- > + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) )
- > + dz2 * tz1 * ( rsd(2,i,j,k+1)
- > - 2.0d+00 * rsd(2,i,j,k)
- > + rsd(2,i,j,k-1) )
- frct(3,i,j,k) = frct(3,i,j,k)
- > + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) )
- > + dz3 * tz1 * ( rsd(3,i,j,k+1)
- > - 2.0d+00 * rsd(3,i,j,k)
- > + rsd(3,i,j,k-1) )
- frct(4,i,j,k) = frct(4,i,j,k)
- > + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) )
- > + dz4 * tz1 * ( rsd(4,i,j,k+1)
- > - 2.0d+00 * rsd(4,i,j,k)
- > + rsd(4,i,j,k-1) )
- frct(5,i,j,k) = frct(5,i,j,k)
- > + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) )
- > + dz5 * tz1 * ( rsd(5,i,j,k+1)
- > - 2.0d+00 * rsd(5,i,j,k)
- > + rsd(5,i,j,k-1) )
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c fourth-order dissipation
-c---------------------------------------------------------------------
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- frct(m,i,j,2) = frct(m,i,j,2)
- > - dsspm * ( + 5.0d+00 * rsd(m,i,j,2)
- > - 4.0d+00 * rsd(m,i,j,3)
- > + rsd(m,i,j,4) )
- frct(m,i,j,3) = frct(m,i,j,3)
- > - dsspm * (- 4.0d+00 * rsd(m,i,j,2)
- > + 6.0d+00 * rsd(m,i,j,3)
- > - 4.0d+00 * rsd(m,i,j,4)
- > + rsd(m,i,j,5) )
- end do
- end do
- end do
-
- do k = 4, nz - 3
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- frct(m,i,j,k) = frct(m,i,j,k)
- > - dsspm * ( rsd(m,i,j,k-2)
- > - 4.0d+00 * rsd(m,i,j,k-1)
- > + 6.0d+00 * rsd(m,i,j,k)
- > - 4.0d+00 * rsd(m,i,j,k+1)
- > + rsd(m,i,j,k+2) )
- end do
- end do
- end do
- end do
-
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- frct(m,i,j,nz-2) = frct(m,i,j,nz-2)
- > - dsspm * ( rsd(m,i,j,nz-4)
- > - 4.0d+00 * rsd(m,i,j,nz-3)
- > + 6.0d+00 * rsd(m,i,j,nz-2)
- > - 4.0d+00 * rsd(m,i,j,nz-1) )
- frct(m,i,j,nz-1) = frct(m,i,j,nz-1)
- > - dsspm * ( rsd(m,i,j,nz-3)
- > - 4.0d+00 * rsd(m,i,j,nz-2)
- > + 5.0d+00 * rsd(m,i,j,nz-1) )
- end do
- end do
- end do
-
- return
- end
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine error
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c compute the solution error
-c
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, k, m
- integer iglob, jglob
- double precision tmp
- double precision u000ijk(5), dummy(5)
-
- integer IERROR
-
-
- do m = 1, 5
- errnm(m) = 0.0d+00
- dummy(m) = 0.0d+00
- end do
-
- do k = 2, nz-1
- do j = jst, jend
- jglob = jpt + j
- do i = ist, iend
- iglob = ipt + i
- call exact( iglob, jglob, k, u000ijk )
- do m = 1, 5
- tmp = ( u000ijk(m) - u(m,i,j,k) )
- dummy(m) = dummy(m) + tmp ** 2
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c compute the global sum of individual contributions to dot product.
-c---------------------------------------------------------------------
- call MPI_ALLREDUCE( dummy,
- > errnm,
- > 5,
- > dp_type,
- > MPI_SUM,
- > MPI_COMM_WORLD,
- > IERROR )
-
- do m = 1, 5
- errnm(m) = sqrt ( errnm(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) )
- end do
-
-c if (id.eq.0) then
-c write (*,1002) ( errnm(m), m = 1, 5 )
-c end if
-
- 1002 format (1x/1x,'RMS-norm of error in soln. to ',
- > 'first pde = ',1pe12.5/,
- > 1x,'RMS-norm of error in soln. to ',
- > 'second pde = ',1pe12.5/,
- > 1x,'RMS-norm of error in soln. to ',
- > 'third pde = ',1pe12.5/,
- > 1x,'RMS-norm of error in soln. to ',
- > 'fourth pde = ',1pe12.5/,
- > 1x,'RMS-norm of error in soln. to ',
- > 'fifth pde = ',1pe12.5)
-
- return
- end
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine exact( i, j, k, u000ijk )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c compute the exact solution at (i,j,k)
-c
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- integer i, j, k
- double precision u000ijk(*)
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer m
- double precision xi, eta, zeta
-
- xi = ( dble ( i - 1 ) ) / ( nx0 - 1 )
- eta = ( dble ( j - 1 ) ) / ( ny0 - 1 )
- zeta = ( dble ( k - 1 ) ) / ( nz - 1 )
-
-
- do m = 1, 5
- u000ijk(m) = ce(m,1)
- > + ce(m,2) * xi
- > + ce(m,3) * eta
- > + ce(m,4) * zeta
- > + ce(m,5) * xi * xi
- > + ce(m,6) * eta * eta
- > + ce(m,7) * zeta * zeta
- > + ce(m,8) * xi * xi * xi
- > + ce(m,9) * eta * eta * eta
- > + ce(m,10) * zeta * zeta * zeta
- > + ce(m,11) * xi * xi * xi * xi
- > + ce(m,12) * eta * eta * eta * eta
- > + ce(m,13) * zeta * zeta * zeta * zeta
- end do
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine exchange_1( g,k,iex )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
- implicit none
-
- include 'mpinpb.h'
- include 'applu.incl'
-
- double precision g(5,-1:isiz1+2,-1:isiz2+2,isiz3)
- integer k
- integer iex
- integer i, j
- double precision dum(5,isiz1+isiz2), dum1(5,isiz1+isiz2)
-
- integer STATUS(MPI_STATUS_SIZE)
- integer IERROR
-
-
-
- if( iex .eq. 0 ) then
-
- if( north .ne. -1 ) then
- call MPI_RECV( dum1(1,jst),
- > 5*(jend-jst+1),
- > dp_type,
- > north,
- > from_n,
- > MPI_COMM_WORLD,
- > status,
- > IERROR )
- do j=jst,jend
- g(1,0,j,k) = dum1(1,j)
- g(2,0,j,k) = dum1(2,j)
- g(3,0,j,k) = dum1(3,j)
- g(4,0,j,k) = dum1(4,j)
- g(5,0,j,k) = dum1(5,j)
- enddo
- endif
-
- if( west .ne. -1 ) then
- call MPI_RECV( dum1(1,ist),
- > 5*(iend-ist+1),
- > dp_type,
- > west,
- > from_w,
- > MPI_COMM_WORLD,
- > status,
- > IERROR )
- do i=ist,iend
- g(1,i,0,k) = dum1(1,i)
- g(2,i,0,k) = dum1(2,i)
- g(3,i,0,k) = dum1(3,i)
- g(4,i,0,k) = dum1(4,i)
- g(5,i,0,k) = dum1(5,i)
- enddo
- endif
-
- else if( iex .eq. 1 ) then
-
- if( south .ne. -1 ) then
- call MPI_RECV( dum1(1,jst),
- > 5*(jend-jst+1),
- > dp_type,
- > south,
- > from_s,
- > MPI_COMM_WORLD,
- > status,
- > IERROR )
- do j=jst,jend
- g(1,nx+1,j,k) = dum1(1,j)
- g(2,nx+1,j,k) = dum1(2,j)
- g(3,nx+1,j,k) = dum1(3,j)
- g(4,nx+1,j,k) = dum1(4,j)
- g(5,nx+1,j,k) = dum1(5,j)
- enddo
- endif
-
- if( east .ne. -1 ) then
- call MPI_RECV( dum1(1,ist),
- > 5*(iend-ist+1),
- > dp_type,
- > east,
- > from_e,
- > MPI_COMM_WORLD,
- > status,
- > IERROR )
- do i=ist,iend
- g(1,i,ny+1,k) = dum1(1,i)
- g(2,i,ny+1,k) = dum1(2,i)
- g(3,i,ny+1,k) = dum1(3,i)
- g(4,i,ny+1,k) = dum1(4,i)
- g(5,i,ny+1,k) = dum1(5,i)
- enddo
- endif
-
- else if( iex .eq. 2 ) then
-
- if( south .ne. -1 ) then
- do j=jst,jend
- dum(1,j) = g(1,nx,j,k)
- dum(2,j) = g(2,nx,j,k)
- dum(3,j) = g(3,nx,j,k)
- dum(4,j) = g(4,nx,j,k)
- dum(5,j) = g(5,nx,j,k)
- enddo
- call MPI_SEND( dum(1,jst),
- > 5*(jend-jst+1),
- > dp_type,
- > south,
- > from_n,
- > MPI_COMM_WORLD,
- > IERROR )
- endif
-
- if( east .ne. -1 ) then
- do i=ist,iend
- dum(1,i) = g(1,i,ny,k)
- dum(2,i) = g(2,i,ny,k)
- dum(3,i) = g(3,i,ny,k)
- dum(4,i) = g(4,i,ny,k)
- dum(5,i) = g(5,i,ny,k)
- enddo
- call MPI_SEND( dum(1,ist),
- > 5*(iend-ist+1),
- > dp_type,
- > east,
- > from_w,
- > MPI_COMM_WORLD,
- > IERROR )
- endif
-
- else
-
- if( north .ne. -1 ) then
- do j=jst,jend
- dum(1,j) = g(1,1,j,k)
- dum(2,j) = g(2,1,j,k)
- dum(3,j) = g(3,1,j,k)
- dum(4,j) = g(4,1,j,k)
- dum(5,j) = g(5,1,j,k)
- enddo
- call MPI_SEND( dum(1,jst),
- > 5*(jend-jst+1),
- > dp_type,
- > north,
- > from_s,
- > MPI_COMM_WORLD,
- > IERROR )
- endif
-
- if( west .ne. -1 ) then
- do i=ist,iend
- dum(1,i) = g(1,i,1,k)
- dum(2,i) = g(2,i,1,k)
- dum(3,i) = g(3,i,1,k)
- dum(4,i) = g(4,i,1,k)
- dum(5,i) = g(5,i,1,k)
- enddo
- call MPI_SEND( dum(1,ist),
- > 5*(iend-ist+1),
- > dp_type,
- > west,
- > from_e,
- > MPI_COMM_WORLD,
- > IERROR )
- endif
-
- endif
-
- end
-
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine exchange_3(g,iex)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- double precision g(5,-1:isiz1+2,-1:isiz2+2,isiz3)
- integer iex
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, k
- integer ipos1, ipos2
-
- integer mid
- integer STATUS(MPI_STATUS_SIZE)
- integer IERROR
-
-
-
- if (iex.eq.0) then
-c---------------------------------------------------------------------
-c communicate in the south and north directions
-c---------------------------------------------------------------------
- if (north.ne.-1) then
- call MPI_IRECV( buf1,
- > 10*ny*nz,
- > dp_type,
- > MPI_ANY_SOURCE,
- > from_n,
- > MPI_COMM_WORLD,
- > mid,
- > IERROR )
- end if
-
-c---------------------------------------------------------------------
-c send south
-c---------------------------------------------------------------------
- if (south.ne.-1) then
- do k = 1,nz
- do j = 1,ny
- ipos1 = (k-1)*ny + j
- ipos2 = ipos1 + ny*nz
- buf(1,ipos1) = g(1,nx-1,j,k)
- buf(2,ipos1) = g(2,nx-1,j,k)
- buf(3,ipos1) = g(3,nx-1,j,k)
- buf(4,ipos1) = g(4,nx-1,j,k)
- buf(5,ipos1) = g(5,nx-1,j,k)
- buf(1,ipos2) = g(1,nx,j,k)
- buf(2,ipos2) = g(2,nx,j,k)
- buf(3,ipos2) = g(3,nx,j,k)
- buf(4,ipos2) = g(4,nx,j,k)
- buf(5,ipos2) = g(5,nx,j,k)
- end do
- end do
-
- call MPI_SEND( buf,
- > 10*ny*nz,
- > dp_type,
- > south,
- > from_n,
- > MPI_COMM_WORLD,
- > IERROR )
- end if
-
-c---------------------------------------------------------------------
-c receive from north
-c---------------------------------------------------------------------
- if (north.ne.-1) then
- call MPI_WAIT( mid, STATUS, IERROR )
-
- do k = 1,nz
- do j = 1,ny
- ipos1 = (k-1)*ny + j
- ipos2 = ipos1 + ny*nz
- g(1,-1,j,k) = buf1(1,ipos1)
- g(2,-1,j,k) = buf1(2,ipos1)
- g(3,-1,j,k) = buf1(3,ipos1)
- g(4,-1,j,k) = buf1(4,ipos1)
- g(5,-1,j,k) = buf1(5,ipos1)
- g(1,0,j,k) = buf1(1,ipos2)
- g(2,0,j,k) = buf1(2,ipos2)
- g(3,0,j,k) = buf1(3,ipos2)
- g(4,0,j,k) = buf1(4,ipos2)
- g(5,0,j,k) = buf1(5,ipos2)
- end do
- end do
-
- end if
-
- if (south.ne.-1) then
- call MPI_IRECV( buf1,
- > 10*ny*nz,
- > dp_type,
- > MPI_ANY_SOURCE,
- > from_s,
- > MPI_COMM_WORLD,
- > mid,
- > IERROR )
- end if
-
-c---------------------------------------------------------------------
-c send north
-c---------------------------------------------------------------------
- if (north.ne.-1) then
- do k = 1,nz
- do j = 1,ny
- ipos1 = (k-1)*ny + j
- ipos2 = ipos1 + ny*nz
- buf(1,ipos1) = g(1,2,j,k)
- buf(2,ipos1) = g(2,2,j,k)
- buf(3,ipos1) = g(3,2,j,k)
- buf(4,ipos1) = g(4,2,j,k)
- buf(5,ipos1) = g(5,2,j,k)
- buf(1,ipos2) = g(1,1,j,k)
- buf(2,ipos2) = g(2,1,j,k)
- buf(3,ipos2) = g(3,1,j,k)
- buf(4,ipos2) = g(4,1,j,k)
- buf(5,ipos2) = g(5,1,j,k)
- end do
- end do
-
- call MPI_SEND( buf,
- > 10*ny*nz,
- > dp_type,
- > north,
- > from_s,
- > MPI_COMM_WORLD,
- > IERROR )
- end if
-
-c---------------------------------------------------------------------
-c receive from south
-c---------------------------------------------------------------------
- if (south.ne.-1) then
- call MPI_WAIT( mid, STATUS, IERROR )
-
- do k = 1,nz
- do j = 1,ny
- ipos1 = (k-1)*ny + j
- ipos2 = ipos1 + ny*nz
- g(1,nx+2,j,k) = buf1(1,ipos1)
- g(2,nx+2,j,k) = buf1(2,ipos1)
- g(3,nx+2,j,k) = buf1(3,ipos1)
- g(4,nx+2,j,k) = buf1(4,ipos1)
- g(5,nx+2,j,k) = buf1(5,ipos1)
- g(1,nx+1,j,k) = buf1(1,ipos2)
- g(2,nx+1,j,k) = buf1(2,ipos2)
- g(3,nx+1,j,k) = buf1(3,ipos2)
- g(4,nx+1,j,k) = buf1(4,ipos2)
- g(5,nx+1,j,k) = buf1(5,ipos2)
- end do
- end do
- end if
-
- else
-
-c---------------------------------------------------------------------
-c communicate in the east and west directions
-c---------------------------------------------------------------------
- if (west.ne.-1) then
- call MPI_IRECV( buf1,
- > 10*nx*nz,
- > dp_type,
- > MPI_ANY_SOURCE,
- > from_w,
- > MPI_COMM_WORLD,
- > mid,
- > IERROR )
- end if
-
-c---------------------------------------------------------------------
-c send east
-c---------------------------------------------------------------------
- if (east.ne.-1) then
- do k = 1,nz
- do i = 1,nx
- ipos1 = (k-1)*nx + i
- ipos2 = ipos1 + nx*nz
- buf(1,ipos1) = g(1,i,ny-1,k)
- buf(2,ipos1) = g(2,i,ny-1,k)
- buf(3,ipos1) = g(3,i,ny-1,k)
- buf(4,ipos1) = g(4,i,ny-1,k)
- buf(5,ipos1) = g(5,i,ny-1,k)
- buf(1,ipos2) = g(1,i,ny,k)
- buf(2,ipos2) = g(2,i,ny,k)
- buf(3,ipos2) = g(3,i,ny,k)
- buf(4,ipos2) = g(4,i,ny,k)
- buf(5,ipos2) = g(5,i,ny,k)
- end do
- end do
-
- call MPI_SEND( buf,
- > 10*nx*nz,
- > dp_type,
- > east,
- > from_w,
- > MPI_COMM_WORLD,
- > IERROR )
- end if
-
-c---------------------------------------------------------------------
-c receive from west
-c---------------------------------------------------------------------
- if (west.ne.-1) then
- call MPI_WAIT( mid, STATUS, IERROR )
-
- do k = 1,nz
- do i = 1,nx
- ipos1 = (k-1)*nx + i
- ipos2 = ipos1 + nx*nz
- g(1,i,-1,k) = buf1(1,ipos1)
- g(2,i,-1,k) = buf1(2,ipos1)
- g(3,i,-1,k) = buf1(3,ipos1)
- g(4,i,-1,k) = buf1(4,ipos1)
- g(5,i,-1,k) = buf1(5,ipos1)
- g(1,i,0,k) = buf1(1,ipos2)
- g(2,i,0,k) = buf1(2,ipos2)
- g(3,i,0,k) = buf1(3,ipos2)
- g(4,i,0,k) = buf1(4,ipos2)
- g(5,i,0,k) = buf1(5,ipos2)
- end do
- end do
-
- end if
-
- if (east.ne.-1) then
- call MPI_IRECV( buf1,
- > 10*nx*nz,
- > dp_type,
- > MPI_ANY_SOURCE,
- > from_e,
- > MPI_COMM_WORLD,
- > mid,
- > IERROR )
- end if
-
-c---------------------------------------------------------------------
-c send west
-c---------------------------------------------------------------------
- if (west.ne.-1) then
- do k = 1,nz
- do i = 1,nx
- ipos1 = (k-1)*nx + i
- ipos2 = ipos1 + nx*nz
- buf(1,ipos1) = g(1,i,2,k)
- buf(2,ipos1) = g(2,i,2,k)
- buf(3,ipos1) = g(3,i,2,k)
- buf(4,ipos1) = g(4,i,2,k)
- buf(5,ipos1) = g(5,i,2,k)
- buf(1,ipos2) = g(1,i,1,k)
- buf(2,ipos2) = g(2,i,1,k)
- buf(3,ipos2) = g(3,i,1,k)
- buf(4,ipos2) = g(4,i,1,k)
- buf(5,ipos2) = g(5,i,1,k)
- end do
- end do
-
- call MPI_SEND( buf,
- > 10*nx*nz,
- > dp_type,
- > west,
- > from_e,
- > MPI_COMM_WORLD,
- > IERROR )
- end if
-
-c---------------------------------------------------------------------
-c receive from east
-c---------------------------------------------------------------------
- if (east.ne.-1) then
- call MPI_WAIT( mid, STATUS, IERROR )
-
- do k = 1,nz
- do i = 1,nx
- ipos1 = (k-1)*nx + i
- ipos2 = ipos1 + nx*nz
- g(1,i,ny+2,k) = buf1(1,ipos1)
- g(2,i,ny+2,k) = buf1(2,ipos1)
- g(3,i,ny+2,k) = buf1(3,ipos1)
- g(4,i,ny+2,k) = buf1(4,ipos1)
- g(5,i,ny+2,k) = buf1(5,ipos1)
- g(1,i,ny+1,k) = buf1(1,ipos2)
- g(2,i,ny+1,k) = buf1(2,ipos2)
- g(3,i,ny+1,k) = buf1(3,ipos2)
- g(4,i,ny+1,k) = buf1(4,ipos2)
- g(5,i,ny+1,k) = buf1(5,ipos2)
- end do
- end do
-
- end if
-
- end if
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine exchange_4(g,h,ibeg,ifin1,jbeg,jfin1)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- double precision g(0:isiz2+1,0:isiz3+1),
- > h(0:isiz2+1,0:isiz3+1)
- integer ibeg, ifin1
- integer jbeg, jfin1
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j
- integer ny2
- double precision dum(1024)
-
- integer msgid1, msgid3
- integer STATUS(MPI_STATUS_SIZE)
- integer IERROR
-
-
-
- ny2 = ny + 2
-
-c---------------------------------------------------------------------
-c communicate in the east and west directions
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c receive from east
-c---------------------------------------------------------------------
- if (jfin1.eq.ny) then
- call MPI_IRECV( dum,
- > 2*nx,
- > dp_type,
- > MPI_ANY_SOURCE,
- > from_e,
- > MPI_COMM_WORLD,
- > msgid3,
- > IERROR )
-
- call MPI_WAIT( msgid3, STATUS, IERROR )
-
- do i = 1,nx
- g(i,ny+1) = dum(i)
- h(i,ny+1) = dum(i+nx)
- end do
-
- end if
-
-c---------------------------------------------------------------------
-c send west
-c---------------------------------------------------------------------
- if (jbeg.eq.1) then
- do i = 1,nx
- dum(i) = g(i,1)
- dum(i+nx) = h(i,1)
- end do
-
- call MPI_SEND( dum,
- > 2*nx,
- > dp_type,
- > west,
- > from_e,
- > MPI_COMM_WORLD,
- > IERROR )
-
- end if
-
-c---------------------------------------------------------------------
-c communicate in the south and north directions
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c receive from south
-c---------------------------------------------------------------------
- if (ifin1.eq.nx) then
- call MPI_IRECV( dum,
- > 2*ny2,
- > dp_type,
- > MPI_ANY_SOURCE,
- > from_s,
- > MPI_COMM_WORLD,
- > msgid1,
- > IERROR )
-
- call MPI_WAIT( msgid1, STATUS, IERROR )
-
- do j = 0,ny+1
- g(nx+1,j) = dum(j+1)
- h(nx+1,j) = dum(j+ny2+1)
- end do
-
- end if
-
-c---------------------------------------------------------------------
-c send north
-c---------------------------------------------------------------------
- if (ibeg.eq.1) then
- do j = 0,ny+1
- dum(j+1) = g(1,j)
- dum(j+ny2+1) = h(1,j)
- end do
-
- call MPI_SEND( dum,
- > 2*ny2,
- > dp_type,
- > north,
- > from_s,
- > MPI_COMM_WORLD,
- > IERROR )
-
- end if
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine exchange_5(g,ibeg,ifin1)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- double precision g(0:isiz2+1,0:isiz3+1)
- integer ibeg, ifin1
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer k
- double precision dum(1024)
-
- integer msgid1
- integer STATUS(MPI_STATUS_SIZE)
- integer IERROR
-
-
-
-c---------------------------------------------------------------------
-c communicate in the south and north directions
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c receive from south
-c---------------------------------------------------------------------
- if (ifin1.eq.nx) then
- call MPI_IRECV( dum,
- > nz,
- > dp_type,
- > MPI_ANY_SOURCE,
- > from_s,
- > MPI_COMM_WORLD,
- > msgid1,
- > IERROR )
-
- call MPI_WAIT( msgid1, STATUS, IERROR )
-
- do k = 1,nz
- g(nx+1,k) = dum(k)
- end do
-
- end if
-
-c---------------------------------------------------------------------
-c send north
-c---------------------------------------------------------------------
- if (ibeg.eq.1) then
- do k = 1,nz
- dum(k) = g(1,k)
- end do
-
- call MPI_SEND( dum,
- > nz,
- > dp_type,
- > north,
- > from_s,
- > MPI_COMM_WORLD,
- > IERROR )
-
- end if
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine exchange_6(g,jbeg,jfin1)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- double precision g(0:isiz2+1,0:isiz3+1)
- integer jbeg, jfin1
-
-c---------------------------------------------------------------------
-c local parameters
-c---------------------------------------------------------------------
- integer k
- double precision dum(1024)
-
- integer msgid3
- integer STATUS(MPI_STATUS_SIZE)
- integer IERROR
-
-
-
-c---------------------------------------------------------------------
-c communicate in the east and west directions
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c receive from east
-c---------------------------------------------------------------------
- if (jfin1.eq.ny) then
- call MPI_IRECV( dum,
- > nz,
- > dp_type,
- > MPI_ANY_SOURCE,
- > from_e,
- > MPI_COMM_WORLD,
- > msgid3,
- > IERROR )
-
- call MPI_WAIT( msgid3, STATUS, IERROR )
-
- do k = 1,nz
- g(ny+1,k) = dum(k)
- end do
-
- end if
-
-c---------------------------------------------------------------------
-c send west
-c---------------------------------------------------------------------
- if (jbeg.eq.1) then
- do k = 1,nz
- dum(k) = g(1,k)
- end do
-
- call MPI_SEND( dum,
- > nz,
- > dp_type,
- > west,
- > from_e,
- > MPI_COMM_WORLD,
- > IERROR )
-
- end if
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine init_comm
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c initialize MPI and establish rank and size
-c
-c This is a module in the MPI implementation of LUSSOR
-c pseudo application from the NAS Parallel Benchmarks.
-c
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'applu.incl'
-
- integer nodedim
- integer IERROR
-
-
-c---------------------------------------------------------------------
-c initialize MPI communication
-c---------------------------------------------------------------------
- call MPI_INIT( IERROR )
-
-c---------------------------------------------------------------------
-c establish the global rank of this process
-c---------------------------------------------------------------------
- call MPI_COMM_RANK( MPI_COMM_WORLD,
- > id,
- > IERROR )
-
-c---------------------------------------------------------------------
-c establish the size of the global group
-c---------------------------------------------------------------------
- call MPI_COMM_SIZE( MPI_COMM_WORLD,
- > num,
- > IERROR )
-
- ndim = nodedim(num)
-
- if (.not. convertdouble) then
- dp_type = MPI_DOUBLE_PRECISION
- else
- dp_type = MPI_REAL
- endif
-
-
- return
- end
+++ /dev/null
-c
-c***controls printing of the progress of iterations: ipr inorm
- 1 250
-c
-c***the maximum no. of pseudo-time steps to be performed: nitmax
- 250
-c
-c***magnitude of the time step: dt
- 2.0e+00
-c
-c***relaxation factor for SSOR iterations: omega
- 1.2
-c
-c***tolerance levels for steady-state residuals: tolnwt(m),m=1,5
- 1.0e-08 1.0e-08 1.0e-08 1.0e-08 1.0e-08
-c
-c***number of grid points in xi and eta and zeta directions: nx ny nz
- 64 64 64
-c
-
-
-
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine jacld(k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-c compute the lower triangular part of the jacobian matrix
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- integer k
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j
- double precision r43
- double precision c1345
- double precision c34
- double precision tmp1, tmp2, tmp3
-
-
-
- r43 = ( 4.0d+00 / 3.0d+00 )
- c1345 = c1 * c3 * c4 * c5
- c34 = c3 * c4
-
- do j = jst, jend
- do i = ist, iend
-
-c---------------------------------------------------------------------
-c form the block daigonal
-c---------------------------------------------------------------------
- tmp1 = 1.0d+00 / u(1,i,j,k)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- d(1,1,i,j) = 1.0d+00
- > + dt * 2.0d+00 * ( tx1 * dx1
- > + ty1 * dy1
- > + tz1 * dz1 )
- d(1,2,i,j) = 0.0d+00
- d(1,3,i,j) = 0.0d+00
- d(1,4,i,j) = 0.0d+00
- d(1,5,i,j) = 0.0d+00
-
- d(2,1,i,j) = dt * 2.0d+00
- > * ( tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) )
- > + ty1 * ( - c34 * tmp2 * u(2,i,j,k) )
- > + tz1 * ( - c34 * tmp2 * u(2,i,j,k) ) )
- d(2,2,i,j) = 1.0d+00
- > + dt * 2.0d+00
- > * ( tx1 * r43 * c34 * tmp1
- > + ty1 * c34 * tmp1
- > + tz1 * c34 * tmp1 )
- > + dt * 2.0d+00 * ( tx1 * dx2
- > + ty1 * dy2
- > + tz1 * dz2 )
- d(2,3,i,j) = 0.0d+00
- d(2,4,i,j) = 0.0d+00
- d(2,5,i,j) = 0.0d+00
-
- d(3,1,i,j) = dt * 2.0d+00
- > * ( tx1 * ( - c34 * tmp2 * u(3,i,j,k) )
- > + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) )
- > + tz1 * ( - c34 * tmp2 * u(3,i,j,k) ) )
- d(3,2,i,j) = 0.0d+00
- d(3,3,i,j) = 1.0d+00
- > + dt * 2.0d+00
- > * ( tx1 * c34 * tmp1
- > + ty1 * r43 * c34 * tmp1
- > + tz1 * c34 * tmp1 )
- > + dt * 2.0d+00 * ( tx1 * dx3
- > + ty1 * dy3
- > + tz1 * dz3 )
- d(3,4,i,j) = 0.0d+00
- d(3,5,i,j) = 0.0d+00
-
- d(4,1,i,j) = dt * 2.0d+00
- > * ( tx1 * ( - c34 * tmp2 * u(4,i,j,k) )
- > + ty1 * ( - c34 * tmp2 * u(4,i,j,k) )
- > + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) )
- d(4,2,i,j) = 0.0d+00
- d(4,3,i,j) = 0.0d+00
- d(4,4,i,j) = 1.0d+00
- > + dt * 2.0d+00
- > * ( tx1 * c34 * tmp1
- > + ty1 * c34 * tmp1
- > + tz1 * r43 * c34 * tmp1 )
- > + dt * 2.0d+00 * ( tx1 * dx4
- > + ty1 * dy4
- > + tz1 * dz4 )
- d(4,5,i,j) = 0.0d+00
-
- d(5,1,i,j) = dt * 2.0d+00
- > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
- > - ( c1345 ) * tmp2 * u(5,i,j,k) )
- > + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
- > - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
- > - ( c1345 ) * tmp2 * u(5,i,j,k) )
- > + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
- > - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
- > - ( c1345 ) * tmp2 * u(5,i,j,k) ) )
- d(5,2,i,j) = dt * 2.0d+00
- > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k)
- > + ty1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k)
- > + tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) )
- d(5,3,i,j) = dt * 2.0d+00
- > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k)
- > + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k)
- > + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) )
- d(5,4,i,j) = dt * 2.0d+00
- > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
- > + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
- > + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) )
- d(5,5,i,j) = 1.0d+00
- > + dt * 2.0d+00 * ( tx1 * c1345 * tmp1
- > + ty1 * c1345 * tmp1
- > + tz1 * c1345 * tmp1 )
- > + dt * 2.0d+00 * ( tx1 * dx5
- > + ty1 * dy5
- > + tz1 * dz5 )
-
-c---------------------------------------------------------------------
-c form the first block sub-diagonal
-c---------------------------------------------------------------------
- tmp1 = 1.0d+00 / u(1,i,j,k-1)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- a(1,1,i,j) = - dt * tz1 * dz1
- a(1,2,i,j) = 0.0d+00
- a(1,3,i,j) = 0.0d+00
- a(1,4,i,j) = - dt * tz2
- a(1,5,i,j) = 0.0d+00
-
- a(2,1,i,j) = - dt * tz2
- > * ( - ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
- > - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k-1) )
- a(2,2,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 )
- > - dt * tz1 * c34 * tmp1
- > - dt * tz1 * dz2
- a(2,3,i,j) = 0.0d+00
- a(2,4,i,j) = - dt * tz2 * ( u(2,i,j,k-1) * tmp1 )
- a(2,5,i,j) = 0.0d+00
-
- a(3,1,i,j) = - dt * tz2
- > * ( - ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
- > - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k-1) )
- a(3,2,i,j) = 0.0d+00
- a(3,3,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 )
- > - dt * tz1 * ( c34 * tmp1 )
- > - dt * tz1 * dz3
- a(3,4,i,j) = - dt * tz2 * ( u(3,i,j,k-1) * tmp1 )
- a(3,5,i,j) = 0.0d+00
-
- a(4,1,i,j) = - dt * tz2
- > * ( - ( u(4,i,j,k-1) * tmp1 ) ** 2
- > + 0.50d+00 * c2
- > * ( ( u(2,i,j,k-1) * u(2,i,j,k-1)
- > + u(3,i,j,k-1) * u(3,i,j,k-1)
- > + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2 ) )
- > - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k-1) )
- a(4,2,i,j) = - dt * tz2
- > * ( - c2 * ( u(2,i,j,k-1) * tmp1 ) )
- a(4,3,i,j) = - dt * tz2
- > * ( - c2 * ( u(3,i,j,k-1) * tmp1 ) )
- a(4,4,i,j) = - dt * tz2 * ( 2.0d+00 - c2 )
- > * ( u(4,i,j,k-1) * tmp1 )
- > - dt * tz1 * ( r43 * c34 * tmp1 )
- > - dt * tz1 * dz4
- a(4,5,i,j) = - dt * tz2 * c2
-
- a(5,1,i,j) = - dt * tz2
- > * ( ( c2 * ( u(2,i,j,k-1) * u(2,i,j,k-1)
- > + u(3,i,j,k-1) * u(3,i,j,k-1)
- > + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2
- > - c1 * ( u(5,i,j,k-1) * tmp1 ) )
- > * ( u(4,i,j,k-1) * tmp1 ) )
- > - dt * tz1
- > * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k-1)**2)
- > - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k-1)**2)
- > - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k-1)**2)
- > - c1345 * tmp2 * u(5,i,j,k-1) )
- a(5,2,i,j) = - dt * tz2
- > * ( - c2 * ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
- > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k-1)
- a(5,3,i,j) = - dt * tz2
- > * ( - c2 * ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 )
- > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k-1)
- a(5,4,i,j) = - dt * tz2
- > * ( c1 * ( u(5,i,j,k-1) * tmp1 )
- > - 0.50d+00 * c2
- > * ( ( u(2,i,j,k-1)*u(2,i,j,k-1)
- > + u(3,i,j,k-1)*u(3,i,j,k-1)
- > + 3.0d+00*u(4,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) )
- > - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k-1)
- a(5,5,i,j) = - dt * tz2
- > * ( c1 * ( u(4,i,j,k-1) * tmp1 ) )
- > - dt * tz1 * c1345 * tmp1
- > - dt * tz1 * dz5
-
-c---------------------------------------------------------------------
-c form the second block sub-diagonal
-c---------------------------------------------------------------------
- tmp1 = 1.0d+00 / u(1,i,j-1,k)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- b(1,1,i,j) = - dt * ty1 * dy1
- b(1,2,i,j) = 0.0d+00
- b(1,3,i,j) = - dt * ty2
- b(1,4,i,j) = 0.0d+00
- b(1,5,i,j) = 0.0d+00
-
- b(2,1,i,j) = - dt * ty2
- > * ( - ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 )
- > - dt * ty1 * ( - c34 * tmp2 * u(2,i,j-1,k) )
- b(2,2,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 )
- > - dt * ty1 * ( c34 * tmp1 )
- > - dt * ty1 * dy2
- b(2,3,i,j) = - dt * ty2 * ( u(2,i,j-1,k) * tmp1 )
- b(2,4,i,j) = 0.0d+00
- b(2,5,i,j) = 0.0d+00
-
- b(3,1,i,j) = - dt * ty2
- > * ( - ( u(3,i,j-1,k) * tmp1 ) ** 2
- > + 0.50d+00 * c2 * ( ( u(2,i,j-1,k) * u(2,i,j-1,k)
- > + u(3,i,j-1,k) * u(3,i,j-1,k)
- > + u(4,i,j-1,k) * u(4,i,j-1,k) )
- > * tmp2 ) )
- > - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j-1,k) )
- b(3,2,i,j) = - dt * ty2
- > * ( - c2 * ( u(2,i,j-1,k) * tmp1 ) )
- b(3,3,i,j) = - dt * ty2 * ( ( 2.0d+00 - c2 )
- > * ( u(3,i,j-1,k) * tmp1 ) )
- > - dt * ty1 * ( r43 * c34 * tmp1 )
- > - dt * ty1 * dy3
- b(3,4,i,j) = - dt * ty2
- > * ( - c2 * ( u(4,i,j-1,k) * tmp1 ) )
- b(3,5,i,j) = - dt * ty2 * c2
-
- b(4,1,i,j) = - dt * ty2
- > * ( - ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 )
- > - dt * ty1 * ( - c34 * tmp2 * u(4,i,j-1,k) )
- b(4,2,i,j) = 0.0d+00
- b(4,3,i,j) = - dt * ty2 * ( u(4,i,j-1,k) * tmp1 )
- b(4,4,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 )
- > - dt * ty1 * ( c34 * tmp1 )
- > - dt * ty1 * dy4
- b(4,5,i,j) = 0.0d+00
-
- b(5,1,i,j) = - dt * ty2
- > * ( ( c2 * ( u(2,i,j-1,k) * u(2,i,j-1,k)
- > + u(3,i,j-1,k) * u(3,i,j-1,k)
- > + u(4,i,j-1,k) * u(4,i,j-1,k) ) * tmp2
- > - c1 * ( u(5,i,j-1,k) * tmp1 ) )
- > * ( u(3,i,j-1,k) * tmp1 ) )
- > - dt * ty1
- > * ( - ( c34 - c1345 )*tmp3*(u(2,i,j-1,k)**2)
- > - ( r43*c34 - c1345 )*tmp3*(u(3,i,j-1,k)**2)
- > - ( c34 - c1345 )*tmp3*(u(4,i,j-1,k)**2)
- > - c1345*tmp2*u(5,i,j-1,k) )
- b(5,2,i,j) = - dt * ty2
- > * ( - c2 * ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 )
- > - dt * ty1
- > * ( c34 - c1345 ) * tmp2 * u(2,i,j-1,k)
- b(5,3,i,j) = - dt * ty2
- > * ( c1 * ( u(5,i,j-1,k) * tmp1 )
- > - 0.50d+00 * c2
- > * ( ( u(2,i,j-1,k)*u(2,i,j-1,k)
- > + 3.0d+00 * u(3,i,j-1,k)*u(3,i,j-1,k)
- > + u(4,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 ) )
- > - dt * ty1
- > * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j-1,k)
- b(5,4,i,j) = - dt * ty2
- > * ( - c2 * ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 )
- > - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j-1,k)
- b(5,5,i,j) = - dt * ty2
- > * ( c1 * ( u(3,i,j-1,k) * tmp1 ) )
- > - dt * ty1 * c1345 * tmp1
- > - dt * ty1 * dy5
-
-c---------------------------------------------------------------------
-c form the third block sub-diagonal
-c---------------------------------------------------------------------
- tmp1 = 1.0d+00 / u(1,i-1,j,k)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- c(1,1,i,j) = - dt * tx1 * dx1
- c(1,2,i,j) = - dt * tx2
- c(1,3,i,j) = 0.0d+00
- c(1,4,i,j) = 0.0d+00
- c(1,5,i,j) = 0.0d+00
-
- c(2,1,i,j) = - dt * tx2
- > * ( - ( u(2,i-1,j,k) * tmp1 ) ** 2
- > + c2 * 0.50d+00 * ( u(2,i-1,j,k) * u(2,i-1,j,k)
- > + u(3,i-1,j,k) * u(3,i-1,j,k)
- > + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2 )
- > - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i-1,j,k) )
- c(2,2,i,j) = - dt * tx2
- > * ( ( 2.0d+00 - c2 ) * ( u(2,i-1,j,k) * tmp1 ) )
- > - dt * tx1 * ( r43 * c34 * tmp1 )
- > - dt * tx1 * dx2
- c(2,3,i,j) = - dt * tx2
- > * ( - c2 * ( u(3,i-1,j,k) * tmp1 ) )
- c(2,4,i,j) = - dt * tx2
- > * ( - c2 * ( u(4,i-1,j,k) * tmp1 ) )
- c(2,5,i,j) = - dt * tx2 * c2
-
- c(3,1,i,j) = - dt * tx2
- > * ( - ( u(2,i-1,j,k) * u(3,i-1,j,k) ) * tmp2 )
- > - dt * tx1 * ( - c34 * tmp2 * u(3,i-1,j,k) )
- c(3,2,i,j) = - dt * tx2 * ( u(3,i-1,j,k) * tmp1 )
- c(3,3,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 )
- > - dt * tx1 * ( c34 * tmp1 )
- > - dt * tx1 * dx3
- c(3,4,i,j) = 0.0d+00
- c(3,5,i,j) = 0.0d+00
-
- c(4,1,i,j) = - dt * tx2
- > * ( - ( u(2,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 )
- > - dt * tx1 * ( - c34 * tmp2 * u(4,i-1,j,k) )
- c(4,2,i,j) = - dt * tx2 * ( u(4,i-1,j,k) * tmp1 )
- c(4,3,i,j) = 0.0d+00
- c(4,4,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 )
- > - dt * tx1 * ( c34 * tmp1 )
- > - dt * tx1 * dx4
- c(4,5,i,j) = 0.0d+00
-
- c(5,1,i,j) = - dt * tx2
- > * ( ( c2 * ( u(2,i-1,j,k) * u(2,i-1,j,k)
- > + u(3,i-1,j,k) * u(3,i-1,j,k)
- > + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2
- > - c1 * ( u(5,i-1,j,k) * tmp1 ) )
- > * ( u(2,i-1,j,k) * tmp1 ) )
- > - dt * tx1
- > * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i-1,j,k)**2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(3,i-1,j,k)**2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(4,i-1,j,k)**2 )
- > - c1345 * tmp2 * u(5,i-1,j,k) )
- c(5,2,i,j) = - dt * tx2
- > * ( c1 * ( u(5,i-1,j,k) * tmp1 )
- > - 0.50d+00 * c2
- > * ( ( 3.0d+00*u(2,i-1,j,k)*u(2,i-1,j,k)
- > + u(3,i-1,j,k)*u(3,i-1,j,k)
- > + u(4,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 ) )
- > - dt * tx1
- > * ( r43*c34 - c1345 ) * tmp2 * u(2,i-1,j,k)
- c(5,3,i,j) = - dt * tx2
- > * ( - c2 * ( u(3,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 )
- > - dt * tx1
- > * ( c34 - c1345 ) * tmp2 * u(3,i-1,j,k)
- c(5,4,i,j) = - dt * tx2
- > * ( - c2 * ( u(4,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 )
- > - dt * tx1
- > * ( c34 - c1345 ) * tmp2 * u(4,i-1,j,k)
- c(5,5,i,j) = - dt * tx2
- > * ( c1 * ( u(2,i-1,j,k) * tmp1 ) )
- > - dt * tx1 * c1345 * tmp1
- > - dt * tx1 * dx5
-
- end do
- end do
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine jacu(k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute the upper triangular part of the jacobian matrix
-c---------------------------------------------------------------------
-
-
- implicit none
-
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- integer k
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j
- double precision r43
- double precision c1345
- double precision c34
- double precision tmp1, tmp2, tmp3
-
-
-
- r43 = ( 4.0d+00 / 3.0d+00 )
- c1345 = c1 * c3 * c4 * c5
- c34 = c3 * c4
-
- do j = jst, jend
- do i = ist, iend
-
-c---------------------------------------------------------------------
-c form the block daigonal
-c---------------------------------------------------------------------
- tmp1 = 1.0d+00 / u(1,i,j,k)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- d(1,1,i,j) = 1.0d+00
- > + dt * 2.0d+00 * ( tx1 * dx1
- > + ty1 * dy1
- > + tz1 * dz1 )
- d(1,2,i,j) = 0.0d+00
- d(1,3,i,j) = 0.0d+00
- d(1,4,i,j) = 0.0d+00
- d(1,5,i,j) = 0.0d+00
-
- d(2,1,i,j) = dt * 2.0d+00
- > * ( tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) )
- > + ty1 * ( - c34 * tmp2 * u(2,i,j,k) )
- > + tz1 * ( - c34 * tmp2 * u(2,i,j,k) ) )
- d(2,2,i,j) = 1.0d+00
- > + dt * 2.0d+00
- > * ( tx1 * r43 * c34 * tmp1
- > + ty1 * c34 * tmp1
- > + tz1 * c34 * tmp1 )
- > + dt * 2.0d+00 * ( tx1 * dx2
- > + ty1 * dy2
- > + tz1 * dz2 )
- d(2,3,i,j) = 0.0d+00
- d(2,4,i,j) = 0.0d+00
- d(2,5,i,j) = 0.0d+00
-
- d(3,1,i,j) = dt * 2.0d+00
- > * ( tx1 * ( - c34 * tmp2 * u(3,i,j,k) )
- > + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) )
- > + tz1 * ( - c34 * tmp2 * u(3,i,j,k) ) )
- d(3,2,i,j) = 0.0d+00
- d(3,3,i,j) = 1.0d+00
- > + dt * 2.0d+00
- > * ( tx1 * c34 * tmp1
- > + ty1 * r43 * c34 * tmp1
- > + tz1 * c34 * tmp1 )
- > + dt * 2.0d+00 * ( tx1 * dx3
- > + ty1 * dy3
- > + tz1 * dz3 )
- d(3,4,i,j) = 0.0d+00
- d(3,5,i,j) = 0.0d+00
-
- d(4,1,i,j) = dt * 2.0d+00
- > * ( tx1 * ( - c34 * tmp2 * u(4,i,j,k) )
- > + ty1 * ( - c34 * tmp2 * u(4,i,j,k) )
- > + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) )
- d(4,2,i,j) = 0.0d+00
- d(4,3,i,j) = 0.0d+00
- d(4,4,i,j) = 1.0d+00
- > + dt * 2.0d+00
- > * ( tx1 * c34 * tmp1
- > + ty1 * c34 * tmp1
- > + tz1 * r43 * c34 * tmp1 )
- > + dt * 2.0d+00 * ( tx1 * dx4
- > + ty1 * dy4
- > + tz1 * dz4 )
- d(4,5,i,j) = 0.0d+00
-
- d(5,1,i,j) = dt * 2.0d+00
- > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
- > - ( c1345 ) * tmp2 * u(5,i,j,k) )
- > + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
- > - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
- > - ( c1345 ) * tmp2 * u(5,i,j,k) )
- > + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 )
- > - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 )
- > - ( c1345 ) * tmp2 * u(5,i,j,k) ) )
- d(5,2,i,j) = dt * 2.0d+00
- > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k)
- > + ty1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k)
- > + tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) )
- d(5,3,i,j) = dt * 2.0d+00
- > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k)
- > + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k)
- > + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) )
- d(5,4,i,j) = dt * 2.0d+00
- > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
- > + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k)
- > + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) )
- d(5,5,i,j) = 1.0d+00
- > + dt * 2.0d+00 * ( tx1 * c1345 * tmp1
- > + ty1 * c1345 * tmp1
- > + tz1 * c1345 * tmp1 )
- > + dt * 2.0d+00 * ( tx1 * dx5
- > + ty1 * dy5
- > + tz1 * dz5 )
-
-c---------------------------------------------------------------------
-c form the first block sub-diagonal
-c---------------------------------------------------------------------
- tmp1 = 1.0d+00 / u(1,i+1,j,k)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- a(1,1,i,j) = - dt * tx1 * dx1
- a(1,2,i,j) = dt * tx2
- a(1,3,i,j) = 0.0d+00
- a(1,4,i,j) = 0.0d+00
- a(1,5,i,j) = 0.0d+00
-
- a(2,1,i,j) = dt * tx2
- > * ( - ( u(2,i+1,j,k) * tmp1 ) ** 2
- > + c2 * 0.50d+00 * ( u(2,i+1,j,k) * u(2,i+1,j,k)
- > + u(3,i+1,j,k) * u(3,i+1,j,k)
- > + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2 )
- > - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i+1,j,k) )
- a(2,2,i,j) = dt * tx2
- > * ( ( 2.0d+00 - c2 ) * ( u(2,i+1,j,k) * tmp1 ) )
- > - dt * tx1 * ( r43 * c34 * tmp1 )
- > - dt * tx1 * dx2
- a(2,3,i,j) = dt * tx2
- > * ( - c2 * ( u(3,i+1,j,k) * tmp1 ) )
- a(2,4,i,j) = dt * tx2
- > * ( - c2 * ( u(4,i+1,j,k) * tmp1 ) )
- a(2,5,i,j) = dt * tx2 * c2
-
- a(3,1,i,j) = dt * tx2
- > * ( - ( u(2,i+1,j,k) * u(3,i+1,j,k) ) * tmp2 )
- > - dt * tx1 * ( - c34 * tmp2 * u(3,i+1,j,k) )
- a(3,2,i,j) = dt * tx2 * ( u(3,i+1,j,k) * tmp1 )
- a(3,3,i,j) = dt * tx2 * ( u(2,i+1,j,k) * tmp1 )
- > - dt * tx1 * ( c34 * tmp1 )
- > - dt * tx1 * dx3
- a(3,4,i,j) = 0.0d+00
- a(3,5,i,j) = 0.0d+00
-
- a(4,1,i,j) = dt * tx2
- > * ( - ( u(2,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 )
- > - dt * tx1 * ( - c34 * tmp2 * u(4,i+1,j,k) )
- a(4,2,i,j) = dt * tx2 * ( u(4,i+1,j,k) * tmp1 )
- a(4,3,i,j) = 0.0d+00
- a(4,4,i,j) = dt * tx2 * ( u(2,i+1,j,k) * tmp1 )
- > - dt * tx1 * ( c34 * tmp1 )
- > - dt * tx1 * dx4
- a(4,5,i,j) = 0.0d+00
-
- a(5,1,i,j) = dt * tx2
- > * ( ( c2 * ( u(2,i+1,j,k) * u(2,i+1,j,k)
- > + u(3,i+1,j,k) * u(3,i+1,j,k)
- > + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2
- > - c1 * ( u(5,i+1,j,k) * tmp1 ) )
- > * ( u(2,i+1,j,k) * tmp1 ) )
- > - dt * tx1
- > * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i+1,j,k)**2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(3,i+1,j,k)**2 )
- > - ( c34 - c1345 ) * tmp3 * ( u(4,i+1,j,k)**2 )
- > - c1345 * tmp2 * u(5,i+1,j,k) )
- a(5,2,i,j) = dt * tx2
- > * ( c1 * ( u(5,i+1,j,k) * tmp1 )
- > - 0.50d+00 * c2
- > * ( ( 3.0d+00*u(2,i+1,j,k)*u(2,i+1,j,k)
- > + u(3,i+1,j,k)*u(3,i+1,j,k)
- > + u(4,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 ) )
- > - dt * tx1
- > * ( r43*c34 - c1345 ) * tmp2 * u(2,i+1,j,k)
- a(5,3,i,j) = dt * tx2
- > * ( - c2 * ( u(3,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 )
- > - dt * tx1
- > * ( c34 - c1345 ) * tmp2 * u(3,i+1,j,k)
- a(5,4,i,j) = dt * tx2
- > * ( - c2 * ( u(4,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 )
- > - dt * tx1
- > * ( c34 - c1345 ) * tmp2 * u(4,i+1,j,k)
- a(5,5,i,j) = dt * tx2
- > * ( c1 * ( u(2,i+1,j,k) * tmp1 ) )
- > - dt * tx1 * c1345 * tmp1
- > - dt * tx1 * dx5
-
-c---------------------------------------------------------------------
-c form the second block sub-diagonal
-c---------------------------------------------------------------------
- tmp1 = 1.0d+00 / u(1,i,j+1,k)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- b(1,1,i,j) = - dt * ty1 * dy1
- b(1,2,i,j) = 0.0d+00
- b(1,3,i,j) = dt * ty2
- b(1,4,i,j) = 0.0d+00
- b(1,5,i,j) = 0.0d+00
-
- b(2,1,i,j) = dt * ty2
- > * ( - ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 )
- > - dt * ty1 * ( - c34 * tmp2 * u(2,i,j+1,k) )
- b(2,2,i,j) = dt * ty2 * ( u(3,i,j+1,k) * tmp1 )
- > - dt * ty1 * ( c34 * tmp1 )
- > - dt * ty1 * dy2
- b(2,3,i,j) = dt * ty2 * ( u(2,i,j+1,k) * tmp1 )
- b(2,4,i,j) = 0.0d+00
- b(2,5,i,j) = 0.0d+00
-
- b(3,1,i,j) = dt * ty2
- > * ( - ( u(3,i,j+1,k) * tmp1 ) ** 2
- > + 0.50d+00 * c2 * ( ( u(2,i,j+1,k) * u(2,i,j+1,k)
- > + u(3,i,j+1,k) * u(3,i,j+1,k)
- > + u(4,i,j+1,k) * u(4,i,j+1,k) )
- > * tmp2 ) )
- > - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j+1,k) )
- b(3,2,i,j) = dt * ty2
- > * ( - c2 * ( u(2,i,j+1,k) * tmp1 ) )
- b(3,3,i,j) = dt * ty2 * ( ( 2.0d+00 - c2 )
- > * ( u(3,i,j+1,k) * tmp1 ) )
- > - dt * ty1 * ( r43 * c34 * tmp1 )
- > - dt * ty1 * dy3
- b(3,4,i,j) = dt * ty2
- > * ( - c2 * ( u(4,i,j+1,k) * tmp1 ) )
- b(3,5,i,j) = dt * ty2 * c2
-
- b(4,1,i,j) = dt * ty2
- > * ( - ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 )
- > - dt * ty1 * ( - c34 * tmp2 * u(4,i,j+1,k) )
- b(4,2,i,j) = 0.0d+00
- b(4,3,i,j) = dt * ty2 * ( u(4,i,j+1,k) * tmp1 )
- b(4,4,i,j) = dt * ty2 * ( u(3,i,j+1,k) * tmp1 )
- > - dt * ty1 * ( c34 * tmp1 )
- > - dt * ty1 * dy4
- b(4,5,i,j) = 0.0d+00
-
- b(5,1,i,j) = dt * ty2
- > * ( ( c2 * ( u(2,i,j+1,k) * u(2,i,j+1,k)
- > + u(3,i,j+1,k) * u(3,i,j+1,k)
- > + u(4,i,j+1,k) * u(4,i,j+1,k) ) * tmp2
- > - c1 * ( u(5,i,j+1,k) * tmp1 ) )
- > * ( u(3,i,j+1,k) * tmp1 ) )
- > - dt * ty1
- > * ( - ( c34 - c1345 )*tmp3*(u(2,i,j+1,k)**2)
- > - ( r43*c34 - c1345 )*tmp3*(u(3,i,j+1,k)**2)
- > - ( c34 - c1345 )*tmp3*(u(4,i,j+1,k)**2)
- > - c1345*tmp2*u(5,i,j+1,k) )
- b(5,2,i,j) = dt * ty2
- > * ( - c2 * ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 )
- > - dt * ty1
- > * ( c34 - c1345 ) * tmp2 * u(2,i,j+1,k)
- b(5,3,i,j) = dt * ty2
- > * ( c1 * ( u(5,i,j+1,k) * tmp1 )
- > - 0.50d+00 * c2
- > * ( ( u(2,i,j+1,k)*u(2,i,j+1,k)
- > + 3.0d+00 * u(3,i,j+1,k)*u(3,i,j+1,k)
- > + u(4,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 ) )
- > - dt * ty1
- > * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j+1,k)
- b(5,4,i,j) = dt * ty2
- > * ( - c2 * ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 )
- > - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j+1,k)
- b(5,5,i,j) = dt * ty2
- > * ( c1 * ( u(3,i,j+1,k) * tmp1 ) )
- > - dt * ty1 * c1345 * tmp1
- > - dt * ty1 * dy5
-
-c---------------------------------------------------------------------
-c form the third block sub-diagonal
-c---------------------------------------------------------------------
- tmp1 = 1.0d+00 / u(1,i,j,k+1)
- tmp2 = tmp1 * tmp1
- tmp3 = tmp1 * tmp2
-
- c(1,1,i,j) = - dt * tz1 * dz1
- c(1,2,i,j) = 0.0d+00
- c(1,3,i,j) = 0.0d+00
- c(1,4,i,j) = dt * tz2
- c(1,5,i,j) = 0.0d+00
-
- c(2,1,i,j) = dt * tz2
- > * ( - ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
- > - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k+1) )
- c(2,2,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 )
- > - dt * tz1 * c34 * tmp1
- > - dt * tz1 * dz2
- c(2,3,i,j) = 0.0d+00
- c(2,4,i,j) = dt * tz2 * ( u(2,i,j,k+1) * tmp1 )
- c(2,5,i,j) = 0.0d+00
-
- c(3,1,i,j) = dt * tz2
- > * ( - ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
- > - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k+1) )
- c(3,2,i,j) = 0.0d+00
- c(3,3,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 )
- > - dt * tz1 * ( c34 * tmp1 )
- > - dt * tz1 * dz3
- c(3,4,i,j) = dt * tz2 * ( u(3,i,j,k+1) * tmp1 )
- c(3,5,i,j) = 0.0d+00
-
- c(4,1,i,j) = dt * tz2
- > * ( - ( u(4,i,j,k+1) * tmp1 ) ** 2
- > + 0.50d+00 * c2
- > * ( ( u(2,i,j,k+1) * u(2,i,j,k+1)
- > + u(3,i,j,k+1) * u(3,i,j,k+1)
- > + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2 ) )
- > - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k+1) )
- c(4,2,i,j) = dt * tz2
- > * ( - c2 * ( u(2,i,j,k+1) * tmp1 ) )
- c(4,3,i,j) = dt * tz2
- > * ( - c2 * ( u(3,i,j,k+1) * tmp1 ) )
- c(4,4,i,j) = dt * tz2 * ( 2.0d+00 - c2 )
- > * ( u(4,i,j,k+1) * tmp1 )
- > - dt * tz1 * ( r43 * c34 * tmp1 )
- > - dt * tz1 * dz4
- c(4,5,i,j) = dt * tz2 * c2
-
- c(5,1,i,j) = dt * tz2
- > * ( ( c2 * ( u(2,i,j,k+1) * u(2,i,j,k+1)
- > + u(3,i,j,k+1) * u(3,i,j,k+1)
- > + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2
- > - c1 * ( u(5,i,j,k+1) * tmp1 ) )
- > * ( u(4,i,j,k+1) * tmp1 ) )
- > - dt * tz1
- > * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k+1)**2)
- > - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k+1)**2)
- > - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k+1)**2)
- > - c1345 * tmp2 * u(5,i,j,k+1) )
- c(5,2,i,j) = dt * tz2
- > * ( - c2 * ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
- > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k+1)
- c(5,3,i,j) = dt * tz2
- > * ( - c2 * ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 )
- > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k+1)
- c(5,4,i,j) = dt * tz2
- > * ( c1 * ( u(5,i,j,k+1) * tmp1 )
- > - 0.50d+00 * c2
- > * ( ( u(2,i,j,k+1)*u(2,i,j,k+1)
- > + u(3,i,j,k+1)*u(3,i,j,k+1)
- > + 3.0d+00*u(4,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) )
- > - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k+1)
- c(5,5,i,j) = dt * tz2
- > * ( c1 * ( u(4,i,j,k+1) * tmp1 ) )
- > - dt * tz1 * c1345 * tmp1
- > - dt * tz1 * dz5
-
- end do
- end do
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- subroutine l2norm ( ldx, ldy, ldz,
- > nx0, ny0, nz0,
- > ist, iend,
- > jst, jend,
- > v, sum )
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c to compute the l2-norm of vector v.
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- integer ldx, ldy, ldz
- integer nx0, ny0, nz0
- integer ist, iend
- integer jst, jend
- double precision v(5,-1:ldx+2,-1:ldy+2,*), sum(5)
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, k, m
- double precision dummy(5)
-
- integer IERROR
-
-
- do m = 1, 5
- dummy(m) = 0.0d+00
- end do
-
- do k = 2, nz0-1
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- dummy(m) = dummy(m) + v(m,i,j,k) * v(m,i,j,k)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c compute the global sum of individual contributions to dot product.
-c---------------------------------------------------------------------
- call MPI_ALLREDUCE( dummy,
- > sum,
- > 5,
- > dp_type,
- > MPI_SUM,
- > MPI_COMM_WORLD,
- > IERROR )
-
- do m = 1, 5
- sum(m) = sqrt ( sum(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) )
- end do
-
- return
- end
+++ /dev/null
-!-------------------------------------------------------------------------!
-! !
-! N A S P A R A L L E L B E N C H M A R K S 3.3 !
-! !
-! L U !
-! !
-!-------------------------------------------------------------------------!
-! !
-! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
-! It is described in NAS Technical Reports 95-020 and 02-007 !
-! !
-! Permission to use, copy, distribute and modify this software !
-! for any purpose with or without fee is hereby granted. We !
-! request, however, that all derived work reference the NAS !
-! Parallel Benchmarks 3.3. This software is provided "as is" !
-! without express or implied warranty. !
-! !
-! Information on NPB 3.3, including the technical report, the !
-! original specifications, source code, results and information !
-! on how to submit new results, is available at: !
-! !
-! http://www.nas.nasa.gov/Software/NPB/ !
-! !
-! Send comments or suggestions to npb@nas.nasa.gov !
-! !
-! NAS Parallel Benchmarks Group !
-! NASA Ames Research Center !
-! Mail Stop: T27A-1 !
-! Moffett Field, CA 94035-1000 !
-! !
-! E-mail: npb@nas.nasa.gov !
-! Fax: (650) 604-3957 !
-! !
-!-------------------------------------------------------------------------!
-
-c---------------------------------------------------------------------
-c
-c Authors: S. Weeratunga
-c V. Venkatakrishnan
-c E. Barszcz
-c M. Yarrow
-c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
- program applu
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c driver for the performance evaluation of the solver for
-c five coupled parabolic/elliptic partial differential equations.
-c
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'applu.incl'
- character class
- logical verified
- double precision mflops
- integer ierr
-
-c---------------------------------------------------------------------
-c initialize communications
-c---------------------------------------------------------------------
- call init_comm()
-
-c---------------------------------------------------------------------
-c read input data
-c---------------------------------------------------------------------
- call read_input()
-
-c---------------------------------------------------------------------
-c set up processor grid
-c---------------------------------------------------------------------
- call proc_grid()
-
-c---------------------------------------------------------------------
-c determine the neighbors
-c---------------------------------------------------------------------
- call neighbors()
-
-c---------------------------------------------------------------------
-c set up sub-domain sizes
-c---------------------------------------------------------------------
- call subdomain()
-
-c---------------------------------------------------------------------
-c set up coefficients
-c---------------------------------------------------------------------
- call setcoeff()
-
-c---------------------------------------------------------------------
-c set the masks required for comm
-c---------------------------------------------------------------------
- call sethyper()
-
-c---------------------------------------------------------------------
-c set the boundary values for dependent variables
-c---------------------------------------------------------------------
- call setbv()
-
-c---------------------------------------------------------------------
-c set the initial values for dependent variables
-c---------------------------------------------------------------------
- call setiv()
-
-c---------------------------------------------------------------------
-c compute the forcing term based on prescribed exact solution
-c---------------------------------------------------------------------
- call erhs()
-
-c---------------------------------------------------------------------
-c perform one SSOR iteration to touch all data and program pages
-c---------------------------------------------------------------------
- call ssor(1)
-
-c---------------------------------------------------------------------
-c reset the boundary and initial values
-c---------------------------------------------------------------------
- call setbv()
- call setiv()
-
-c---------------------------------------------------------------------
-c perform the SSOR iterations
-c---------------------------------------------------------------------
- call ssor(itmax)
-
-c---------------------------------------------------------------------
-c compute the solution error
-c---------------------------------------------------------------------
- call error()
-
-c---------------------------------------------------------------------
-c compute the surface integral
-c---------------------------------------------------------------------
- call pintgr()
-
-c---------------------------------------------------------------------
-c verification test
-c---------------------------------------------------------------------
- IF (id.eq.0) THEN
- call verify ( rsdnm, errnm, frc, class, verified )
- mflops = float(itmax)*(1984.77*float( nx0 )
- > *float( ny0 )
- > *float( nz0 )
- > -10923.3*(float( nx0+ny0+nz0 )/3.)**2
- > +27770.9* float( nx0+ny0+nz0 )/3.
- > -144010.)
- > / (maxtime*1000000.)
-
- call print_results('LU', class, nx0,
- > ny0, nz0, itmax, nnodes_compiled,
- > num, maxtime, mflops, ' floating point', verified,
- > npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6,
- > '(none)')
-
- END IF
-
- call mpi_finalize(ierr)
- end
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'mpif.h'
-
- integer node, no_nodes, root, comm_setup,
- > comm_solve, comm_rhs, dp_type
- common /mpistuff/ node, no_nodes, root, comm_setup,
- > comm_solve, comm_rhs, dp_type
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine neighbors ()
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c figure out the neighbors and their wrap numbers for each processor
-c---------------------------------------------------------------------
-
- south = -1
- east = -1
- north = -1
- west = -1
-
- if (row.gt.1) then
- north = id -1
- else
- north = -1
- end if
-
- if (row.lt.xdim) then
- south = id + 1
- else
- south = -1
- end if
-
- if (col.gt.1) then
- west = id- xdim
- else
- west = -1
- end if
-
- if (col.lt.ydim) then
- east = id + xdim
- else
- east = -1
- end if
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- integer function nodedim(num)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c compute the exponent where num = 2**nodedim
-c NOTE: assumes a power-of-two number of nodes
-c
-c---------------------------------------------------------------------
-
- implicit none
-
-c---------------------------------------------------------------------
-c input parameters
-c---------------------------------------------------------------------
- integer num
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- double precision fnum
-
-
- fnum = dble(num)
- nodedim = log(fnum)/log(2.0d+0) + 0.00001
-
- return
- end
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine pintgr
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, k
- integer ibeg, ifin, ifin1
- integer jbeg, jfin, jfin1
- integer iglob, iglob1, iglob2
- integer jglob, jglob1, jglob2
- integer ind1, ind2
- double precision phi1(0:isiz2+1,0:isiz3+1),
- > phi2(0:isiz2+1,0:isiz3+1)
- double precision frc1, frc2, frc3
- double precision dummy
-
- integer IERROR
-
-
-c---------------------------------------------------------------------
-c set up the sub-domains for integeration in each processor
-c---------------------------------------------------------------------
- ibeg = nx + 1
- ifin = 0
- iglob1 = ipt + 1
- iglob2 = ipt + nx
- if (iglob1.ge.ii1.and.iglob2.lt.ii2+nx) ibeg = 1
- if (iglob1.gt.ii1-nx.and.iglob2.le.ii2) ifin = nx
- if (ii1.ge.iglob1.and.ii1.le.iglob2) ibeg = ii1 - ipt
- if (ii2.ge.iglob1.and.ii2.le.iglob2) ifin = ii2 - ipt
- jbeg = ny + 1
- jfin = 0
- jglob1 = jpt + 1
- jglob2 = jpt + ny
- if (jglob1.ge.ji1.and.jglob2.lt.ji2+ny) jbeg = 1
- if (jglob1.gt.ji1-ny.and.jglob2.le.ji2) jfin = ny
- if (ji1.ge.jglob1.and.ji1.le.jglob2) jbeg = ji1 - jpt
- if (ji2.ge.jglob1.and.ji2.le.jglob2) jfin = ji2 - jpt
- ifin1 = ifin
- jfin1 = jfin
- if (ipt + ifin1.eq.ii2) ifin1 = ifin -1
- if (jpt + jfin1.eq.ji2) jfin1 = jfin -1
-
-c---------------------------------------------------------------------
-c initialize
-c---------------------------------------------------------------------
- do i = 0,isiz2+1
- do k = 0,isiz3+1
- phi1(i,k) = 0.
- phi2(i,k) = 0.
- end do
- end do
-
- do j = jbeg,jfin
- jglob = jpt + j
- do i = ibeg,ifin
- iglob = ipt + i
-
- k = ki1
-
- phi1(i,j) = c2*( u(5,i,j,k)
- > - 0.50d+00 * ( u(2,i,j,k) ** 2
- > + u(3,i,j,k) ** 2
- > + u(4,i,j,k) ** 2 )
- > / u(1,i,j,k) )
-
- k = ki2
-
- phi2(i,j) = c2*( u(5,i,j,k)
- > - 0.50d+00 * ( u(2,i,j,k) ** 2
- > + u(3,i,j,k) ** 2
- > + u(4,i,j,k) ** 2 )
- > / u(1,i,j,k) )
- end do
- end do
-
-c---------------------------------------------------------------------
-c communicate in i and j directions
-c---------------------------------------------------------------------
- call exchange_4(phi1,phi2,ibeg,ifin1,jbeg,jfin1)
-
- frc1 = 0.0d+00
-
- do j = jbeg,jfin1
- do i = ibeg, ifin1
- frc1 = frc1 + ( phi1(i,j)
- > + phi1(i+1,j)
- > + phi1(i,j+1)
- > + phi1(i+1,j+1)
- > + phi2(i,j)
- > + phi2(i+1,j)
- > + phi2(i,j+1)
- > + phi2(i+1,j+1) )
- end do
- end do
-
-c---------------------------------------------------------------------
-c compute the global sum of individual contributions to frc1
-c---------------------------------------------------------------------
- dummy = frc1
- call MPI_ALLREDUCE( dummy,
- > frc1,
- > 1,
- > dp_type,
- > MPI_SUM,
- > MPI_COMM_WORLD,
- > IERROR )
-
- frc1 = dxi * deta * frc1
-
-c---------------------------------------------------------------------
-c initialize
-c---------------------------------------------------------------------
- do i = 0,isiz2+1
- do k = 0,isiz3+1
- phi1(i,k) = 0.
- phi2(i,k) = 0.
- end do
- end do
- jglob = jpt + jbeg
- ind1 = 0
- if (jglob.eq.ji1) then
- ind1 = 1
- do k = ki1, ki2
- do i = ibeg, ifin
- iglob = ipt + i
- phi1(i,k) = c2*( u(5,i,jbeg,k)
- > - 0.50d+00 * ( u(2,i,jbeg,k) ** 2
- > + u(3,i,jbeg,k) ** 2
- > + u(4,i,jbeg,k) ** 2 )
- > / u(1,i,jbeg,k) )
- end do
- end do
- end if
-
- jglob = jpt + jfin
- ind2 = 0
- if (jglob.eq.ji2) then
- ind2 = 1
- do k = ki1, ki2
- do i = ibeg, ifin
- iglob = ipt + i
- phi2(i,k) = c2*( u(5,i,jfin,k)
- > - 0.50d+00 * ( u(2,i,jfin,k) ** 2
- > + u(3,i,jfin,k) ** 2
- > + u(4,i,jfin,k) ** 2 )
- > / u(1,i,jfin,k) )
- end do
- end do
- end if
-
-c---------------------------------------------------------------------
-c communicate in i direction
-c---------------------------------------------------------------------
- if (ind1.eq.1) then
- call exchange_5(phi1,ibeg,ifin1)
- end if
- if (ind2.eq.1) then
- call exchange_5 (phi2,ibeg,ifin1)
- end if
-
- frc2 = 0.0d+00
- do k = ki1, ki2-1
- do i = ibeg, ifin1
- frc2 = frc2 + ( phi1(i,k)
- > + phi1(i+1,k)
- > + phi1(i,k+1)
- > + phi1(i+1,k+1)
- > + phi2(i,k)
- > + phi2(i+1,k)
- > + phi2(i,k+1)
- > + phi2(i+1,k+1) )
- end do
- end do
-
-c---------------------------------------------------------------------
-c compute the global sum of individual contributions to frc2
-c---------------------------------------------------------------------
- dummy = frc2
- call MPI_ALLREDUCE( dummy,
- > frc2,
- > 1,
- > dp_type,
- > MPI_SUM,
- > MPI_COMM_WORLD,
- > IERROR )
-
- frc2 = dxi * dzeta * frc2
-
-c---------------------------------------------------------------------
-c initialize
-c---------------------------------------------------------------------
- do i = 0,isiz2+1
- do k = 0,isiz3+1
- phi1(i,k) = 0.
- phi2(i,k) = 0.
- end do
- end do
- iglob = ipt + ibeg
- ind1 = 0
- if (iglob.eq.ii1) then
- ind1 = 1
- do k = ki1, ki2
- do j = jbeg, jfin
- jglob = jpt + j
- phi1(j,k) = c2*( u(5,ibeg,j,k)
- > - 0.50d+00 * ( u(2,ibeg,j,k) ** 2
- > + u(3,ibeg,j,k) ** 2
- > + u(4,ibeg,j,k) ** 2 )
- > / u(1,ibeg,j,k) )
- end do
- end do
- end if
-
- iglob = ipt + ifin
- ind2 = 0
- if (iglob.eq.ii2) then
- ind2 = 1
- do k = ki1, ki2
- do j = jbeg, jfin
- jglob = jpt + j
- phi2(j,k) = c2*( u(5,ifin,j,k)
- > - 0.50d+00 * ( u(2,ifin,j,k) ** 2
- > + u(3,ifin,j,k) ** 2
- > + u(4,ifin,j,k) ** 2 )
- > / u(1,ifin,j,k) )
- end do
- end do
- end if
-
-c---------------------------------------------------------------------
-c communicate in j direction
-c---------------------------------------------------------------------
- if (ind1.eq.1) then
- call exchange_6(phi1,jbeg,jfin1)
- end if
- if (ind2.eq.1) then
- call exchange_6(phi2,jbeg,jfin1)
- end if
-
- frc3 = 0.0d+00
-
- do k = ki1, ki2-1
- do j = jbeg, jfin1
- frc3 = frc3 + ( phi1(j,k)
- > + phi1(j+1,k)
- > + phi1(j,k+1)
- > + phi1(j+1,k+1)
- > + phi2(j,k)
- > + phi2(j+1,k)
- > + phi2(j,k+1)
- > + phi2(j+1,k+1) )
- end do
- end do
-
-c---------------------------------------------------------------------
-c compute the global sum of individual contributions to frc3
-c---------------------------------------------------------------------
- dummy = frc3
- call MPI_ALLREDUCE( dummy,
- > frc3,
- > 1,
- > dp_type,
- > MPI_SUM,
- > MPI_COMM_WORLD,
- > IERROR )
-
- frc3 = deta * dzeta * frc3
- frc = 0.25d+00 * ( frc1 + frc2 + frc3 )
-c if (id.eq.0) write (*,1001) frc
-
- return
-
- 1001 format (//5x,'surface integral = ',1pe12.5//)
-
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine proc_grid
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c set up a two-d grid for processors: column-major ordering of unknowns
-c NOTE: assumes a power-of-two number of processors
-c
-c---------------------------------------------------------------------
-
- xdim = 2**(ndim/2)
- if (mod(ndim,2).eq.1) xdim = xdim + xdim
- ydim = num/xdim
-
- row = mod(id,xdim) + 1
- col = id/xdim + 1
-
-
- return
- end
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine read_input
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'applu.incl'
-
- integer IERROR, fstatus, nnodes
-
-
-c---------------------------------------------------------------------
-c only root reads the input file
-c if input file does not exist, it uses defaults
-c ipr = 1 for detailed progress output
-c inorm = how often the norm is printed (once every inorm iterations)
-c itmax = number of pseudo time steps
-c dt = time step
-c omega 1 over-relaxation factor for SSOR
-c tolrsd = steady state residual tolerance levels
-c nx, ny, nz = number of grid points in x, y, z directions
-c---------------------------------------------------------------------
- ROOT = 0
- if (id .eq. ROOT) then
-
- write(*, 1000)
-
- open (unit=3,file='inputlu.data',status='old',
- > access='sequential',form='formatted', iostat=fstatus)
- if (fstatus .eq. 0) then
-
- write(*, *) 'Reading from input file inputlu.data'
-
- read (3,*)
- read (3,*)
- read (3,*) ipr, inorm
- read (3,*)
- read (3,*)
- read (3,*) itmax
- read (3,*)
- read (3,*)
- read (3,*) dt
- read (3,*)
- read (3,*)
- read (3,*) omega
- read (3,*)
- read (3,*)
- read (3,*) tolrsd(1),tolrsd(2),tolrsd(3),tolrsd(4),tolrsd(5)
- read (3,*)
- read (3,*)
- read (3,*) nx0, ny0, nz0
- close(3)
- else
- ipr = ipr_default
- inorm = inorm_default
- itmax = itmax_default
- dt = dt_default
- omega = omega_default
- tolrsd(1) = tolrsd1_def
- tolrsd(2) = tolrsd2_def
- tolrsd(3) = tolrsd3_def
- tolrsd(4) = tolrsd4_def
- tolrsd(5) = tolrsd5_def
- nx0 = isiz01
- ny0 = isiz02
- nz0 = isiz03
- endif
-
-c---------------------------------------------------------------------
-c check problem size
-c---------------------------------------------------------------------
- call MPI_COMM_SIZE(MPI_COMM_WORLD, nnodes, ierror)
- if (nnodes .ne. nnodes_compiled) then
- write (*, 2000) nnodes, nnodes_compiled
- 2000 format (5x,'Warning: program is running on',i3,' processors'
- > /5x,'but was compiled for ', i3)
- endif
-
- if ( ( nx0 .lt. 4 ) .or.
- > ( ny0 .lt. 4 ) .or.
- > ( nz0 .lt. 4 ) ) then
-
- write (*,2001)
- 2001 format (5x,'PROBLEM SIZE IS TOO SMALL - ',
- > /5x,'SET EACH OF NX, NY AND NZ AT LEAST EQUAL TO 5')
- CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR )
-
- end if
-
- if ( ( nx0 .gt. isiz01 ) .or.
- > ( ny0 .gt. isiz02 ) .or.
- > ( nz0 .gt. isiz03 ) ) then
-
- write (*,2002)
- 2002 format (5x,'PROBLEM SIZE IS TOO LARGE - ',
- > /5x,'NX, NY AND NZ SHOULD BE LESS THAN OR EQUAL TO ',
- > /5x,'ISIZ01, ISIZ02 AND ISIZ03 RESPECTIVELY')
- CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR )
-
- end if
-
-
- write(*, 1001) nx0, ny0, nz0
- write(*, 1002) itmax
- write(*, 1003) nnodes
-
- 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- LU Benchmark',/)
- 1001 format(' Size: ', i4, 'x', i4, 'x', i4)
- 1002 format(' Iterations: ', i4)
- 1003 format(' Number of processes: ', i5, /)
-
-
-
- end if
-
- call bcast_inputs
-
- return
- end
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine rhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute the right hand sides
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, k, m
- integer iex
- integer L1, L2
- integer ist1, iend1
- integer jst1, jend1
- double precision q
- double precision u21, u31, u41
- double precision tmp
- double precision u21i, u31i, u41i, u51i
- double precision u21j, u31j, u41j, u51j
- double precision u21k, u31k, u41k, u51k
- double precision u21im1, u31im1, u41im1, u51im1
- double precision u21jm1, u31jm1, u41jm1, u51jm1
- double precision u21km1, u31km1, u41km1, u51km1
-
-
-
- do k = 1, nz
- do j = 1, ny
- do i = 1, nx
- do m = 1, 5
- rsd(m,i,j,k) = - frct(m,i,j,k)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c xi-direction flux differences
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c iex = flag : iex = 0 north/south communication
-c : iex = 1 east/west communication
-c---------------------------------------------------------------------
- iex = 0
-
-c---------------------------------------------------------------------
-c communicate and receive/send two rows of data
-c---------------------------------------------------------------------
- call exchange_3(u,iex)
-
- L1 = 0
- if (north.eq.-1) L1 = 1
- L2 = nx + 1
- if (south.eq.-1) L2 = nx
-
- ist1 = 1
- iend1 = nx
- if (north.eq.-1) ist1 = 4
- if (south.eq.-1) iend1 = nx - 3
-
- do k = 2, nz - 1
- do j = jst, jend
- do i = L1, L2
- flux(1,i,j,k) = u(2,i,j,k)
- u21 = u(2,i,j,k) / u(1,i,j,k)
-
- q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k)
- > + u(3,i,j,k) * u(3,i,j,k)
- > + u(4,i,j,k) * u(4,i,j,k) )
- > / u(1,i,j,k)
-
- flux(2,i,j,k) = u(2,i,j,k) * u21 + c2 *
- > ( u(5,i,j,k) - q )
- flux(3,i,j,k) = u(3,i,j,k) * u21
- flux(4,i,j,k) = u(4,i,j,k) * u21
- flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u21
- end do
-
- do i = ist, iend
- do m = 1, 5
- rsd(m,i,j,k) = rsd(m,i,j,k)
- > - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) )
- end do
- end do
-
- do i = ist, L2
- tmp = 1.0d+00 / u(1,i,j,k)
-
- u21i = tmp * u(2,i,j,k)
- u31i = tmp * u(3,i,j,k)
- u41i = tmp * u(4,i,j,k)
- u51i = tmp * u(5,i,j,k)
-
- tmp = 1.0d+00 / u(1,i-1,j,k)
-
- u21im1 = tmp * u(2,i-1,j,k)
- u31im1 = tmp * u(3,i-1,j,k)
- u41im1 = tmp * u(4,i-1,j,k)
- u51im1 = tmp * u(5,i-1,j,k)
-
- flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 * (u21i-u21im1)
- flux(3,i,j,k) = tx3 * ( u31i - u31im1 )
- flux(4,i,j,k) = tx3 * ( u41i - u41im1 )
- flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
- > * tx3 * ( ( u21i **2 + u31i **2 + u41i **2 )
- > - ( u21im1**2 + u31im1**2 + u41im1**2 ) )
- > + (1.0d+00/6.0d+00)
- > * tx3 * ( u21i**2 - u21im1**2 )
- > + c1 * c5 * tx3 * ( u51i - u51im1 )
- end do
-
- do i = ist, iend
- rsd(1,i,j,k) = rsd(1,i,j,k)
- > + dx1 * tx1 * ( u(1,i-1,j,k)
- > - 2.0d+00 * u(1,i,j,k)
- > + u(1,i+1,j,k) )
- rsd(2,i,j,k) = rsd(2,i,j,k)
- > + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) )
- > + dx2 * tx1 * ( u(2,i-1,j,k)
- > - 2.0d+00 * u(2,i,j,k)
- > + u(2,i+1,j,k) )
- rsd(3,i,j,k) = rsd(3,i,j,k)
- > + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) )
- > + dx3 * tx1 * ( u(3,i-1,j,k)
- > - 2.0d+00 * u(3,i,j,k)
- > + u(3,i+1,j,k) )
- rsd(4,i,j,k) = rsd(4,i,j,k)
- > + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) )
- > + dx4 * tx1 * ( u(4,i-1,j,k)
- > - 2.0d+00 * u(4,i,j,k)
- > + u(4,i+1,j,k) )
- rsd(5,i,j,k) = rsd(5,i,j,k)
- > + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) )
- > + dx5 * tx1 * ( u(5,i-1,j,k)
- > - 2.0d+00 * u(5,i,j,k)
- > + u(5,i+1,j,k) )
- end do
-
-c---------------------------------------------------------------------
-c Fourth-order dissipation
-c---------------------------------------------------------------------
- IF (north.eq.-1) then
- do m = 1, 5
- rsd(m,2,j,k) = rsd(m,2,j,k)
- > - dssp * ( + 5.0d+00 * u(m,2,j,k)
- > - 4.0d+00 * u(m,3,j,k)
- > + u(m,4,j,k) )
- rsd(m,3,j,k) = rsd(m,3,j,k)
- > - dssp * ( - 4.0d+00 * u(m,2,j,k)
- > + 6.0d+00 * u(m,3,j,k)
- > - 4.0d+00 * u(m,4,j,k)
- > + u(m,5,j,k) )
- end do
- END IF
-
- do i = ist1,iend1
- do m = 1, 5
- rsd(m,i,j,k) = rsd(m,i,j,k)
- > - dssp * ( u(m,i-2,j,k)
- > - 4.0d+00 * u(m,i-1,j,k)
- > + 6.0d+00 * u(m,i,j,k)
- > - 4.0d+00 * u(m,i+1,j,k)
- > + u(m,i+2,j,k) )
- end do
- end do
-
- IF (south.eq.-1) then
- do m = 1, 5
- rsd(m,nx-2,j,k) = rsd(m,nx-2,j,k)
- > - dssp * ( u(m,nx-4,j,k)
- > - 4.0d+00 * u(m,nx-3,j,k)
- > + 6.0d+00 * u(m,nx-2,j,k)
- > - 4.0d+00 * u(m,nx-1,j,k) )
- rsd(m,nx-1,j,k) = rsd(m,nx-1,j,k)
- > - dssp * ( u(m,nx-3,j,k)
- > - 4.0d+00 * u(m,nx-2,j,k)
- > + 5.0d+00 * u(m,nx-1,j,k) )
- end do
- END IF
-
- end do
- end do
-
-c---------------------------------------------------------------------
-c eta-direction flux differences
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c iex = flag : iex = 0 north/south communication
-c---------------------------------------------------------------------
- iex = 1
-
-c---------------------------------------------------------------------
-c communicate and receive/send two rows of data
-c---------------------------------------------------------------------
- call exchange_3(u,iex)
-
- L1 = 0
- if (west.eq.-1) L1 = 1
- L2 = ny + 1
- if (east.eq.-1) L2 = ny
-
- jst1 = 1
- jend1 = ny
- if (west.eq.-1) jst1 = 4
- if (east.eq.-1) jend1 = ny - 3
-
- do k = 2, nz - 1
- do j = L1, L2
- do i = ist, iend
- flux(1,i,j,k) = u(3,i,j,k)
- u31 = u(3,i,j,k) / u(1,i,j,k)
-
- q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k)
- > + u(3,i,j,k) * u(3,i,j,k)
- > + u(4,i,j,k) * u(4,i,j,k) )
- > / u(1,i,j,k)
-
- flux(2,i,j,k) = u(2,i,j,k) * u31
- flux(3,i,j,k) = u(3,i,j,k) * u31 + c2 * (u(5,i,j,k)-q)
- flux(4,i,j,k) = u(4,i,j,k) * u31
- flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u31
- end do
- end do
-
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- rsd(m,i,j,k) = rsd(m,i,j,k)
- > - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) )
- end do
- end do
- end do
-
- do j = jst, L2
- do i = ist, iend
- tmp = 1.0d+00 / u(1,i,j,k)
-
- u21j = tmp * u(2,i,j,k)
- u31j = tmp * u(3,i,j,k)
- u41j = tmp * u(4,i,j,k)
- u51j = tmp * u(5,i,j,k)
-
- tmp = 1.0d+00 / u(1,i,j-1,k)
- u21jm1 = tmp * u(2,i,j-1,k)
- u31jm1 = tmp * u(3,i,j-1,k)
- u41jm1 = tmp * u(4,i,j-1,k)
- u51jm1 = tmp * u(5,i,j-1,k)
-
- flux(2,i,j,k) = ty3 * ( u21j - u21jm1 )
- flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 * (u31j-u31jm1)
- flux(4,i,j,k) = ty3 * ( u41j - u41jm1 )
- flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
- > * ty3 * ( ( u21j **2 + u31j **2 + u41j **2 )
- > - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) )
- > + (1.0d+00/6.0d+00)
- > * ty3 * ( u31j**2 - u31jm1**2 )
- > + c1 * c5 * ty3 * ( u51j - u51jm1 )
- end do
- end do
-
- do j = jst, jend
- do i = ist, iend
-
- rsd(1,i,j,k) = rsd(1,i,j,k)
- > + dy1 * ty1 * ( u(1,i,j-1,k)
- > - 2.0d+00 * u(1,i,j,k)
- > + u(1,i,j+1,k) )
-
- rsd(2,i,j,k) = rsd(2,i,j,k)
- > + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) )
- > + dy2 * ty1 * ( u(2,i,j-1,k)
- > - 2.0d+00 * u(2,i,j,k)
- > + u(2,i,j+1,k) )
-
- rsd(3,i,j,k) = rsd(3,i,j,k)
- > + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) )
- > + dy3 * ty1 * ( u(3,i,j-1,k)
- > - 2.0d+00 * u(3,i,j,k)
- > + u(3,i,j+1,k) )
-
- rsd(4,i,j,k) = rsd(4,i,j,k)
- > + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) )
- > + dy4 * ty1 * ( u(4,i,j-1,k)
- > - 2.0d+00 * u(4,i,j,k)
- > + u(4,i,j+1,k) )
-
- rsd(5,i,j,k) = rsd(5,i,j,k)
- > + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) )
- > + dy5 * ty1 * ( u(5,i,j-1,k)
- > - 2.0d+00 * u(5,i,j,k)
- > + u(5,i,j+1,k) )
-
- end do
- end do
-
-c---------------------------------------------------------------------
-c fourth-order dissipation
-c---------------------------------------------------------------------
- IF (west.eq.-1) then
- do i = ist, iend
- do m = 1, 5
- rsd(m,i,2,k) = rsd(m,i,2,k)
- > - dssp * ( + 5.0d+00 * u(m,i,2,k)
- > - 4.0d+00 * u(m,i,3,k)
- > + u(m,i,4,k) )
- rsd(m,i,3,k) = rsd(m,i,3,k)
- > - dssp * ( - 4.0d+00 * u(m,i,2,k)
- > + 6.0d+00 * u(m,i,3,k)
- > - 4.0d+00 * u(m,i,4,k)
- > + u(m,i,5,k) )
- end do
- end do
- END IF
-
- do j = jst1, jend1
- do i = ist, iend
- do m = 1, 5
- rsd(m,i,j,k) = rsd(m,i,j,k)
- > - dssp * ( u(m,i,j-2,k)
- > - 4.0d+00 * u(m,i,j-1,k)
- > + 6.0d+00 * u(m,i,j,k)
- > - 4.0d+00 * u(m,i,j+1,k)
- > + u(m,i,j+2,k) )
- end do
- end do
- end do
-
- IF (east.eq.-1) then
- do i = ist, iend
- do m = 1, 5
- rsd(m,i,ny-2,k) = rsd(m,i,ny-2,k)
- > - dssp * ( u(m,i,ny-4,k)
- > - 4.0d+00 * u(m,i,ny-3,k)
- > + 6.0d+00 * u(m,i,ny-2,k)
- > - 4.0d+00 * u(m,i,ny-1,k) )
- rsd(m,i,ny-1,k) = rsd(m,i,ny-1,k)
- > - dssp * ( u(m,i,ny-3,k)
- > - 4.0d+00 * u(m,i,ny-2,k)
- > + 5.0d+00 * u(m,i,ny-1,k) )
- end do
- end do
- END IF
-
- end do
-
-c---------------------------------------------------------------------
-c zeta-direction flux differences
-c---------------------------------------------------------------------
- do k = 1, nz
- do j = jst, jend
- do i = ist, iend
- flux(1,i,j,k) = u(4,i,j,k)
- u41 = u(4,i,j,k) / u(1,i,j,k)
-
- q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k)
- > + u(3,i,j,k) * u(3,i,j,k)
- > + u(4,i,j,k) * u(4,i,j,k) )
- > / u(1,i,j,k)
-
- flux(2,i,j,k) = u(2,i,j,k) * u41
- flux(3,i,j,k) = u(3,i,j,k) * u41
- flux(4,i,j,k) = u(4,i,j,k) * u41 + c2 * (u(5,i,j,k)-q)
- flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u41
- end do
- end do
- end do
-
- do k = 2, nz - 1
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- rsd(m,i,j,k) = rsd(m,i,j,k)
- > - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) )
- end do
- end do
- end do
- end do
-
- do k = 2, nz
- do j = jst, jend
- do i = ist, iend
- tmp = 1.0d+00 / u(1,i,j,k)
-
- u21k = tmp * u(2,i,j,k)
- u31k = tmp * u(3,i,j,k)
- u41k = tmp * u(4,i,j,k)
- u51k = tmp * u(5,i,j,k)
-
- tmp = 1.0d+00 / u(1,i,j,k-1)
-
- u21km1 = tmp * u(2,i,j,k-1)
- u31km1 = tmp * u(3,i,j,k-1)
- u41km1 = tmp * u(4,i,j,k-1)
- u51km1 = tmp * u(5,i,j,k-1)
-
- flux(2,i,j,k) = tz3 * ( u21k - u21km1 )
- flux(3,i,j,k) = tz3 * ( u31k - u31km1 )
- flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * (u41k-u41km1)
- flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 )
- > * tz3 * ( ( u21k **2 + u31k **2 + u41k **2 )
- > - ( u21km1**2 + u31km1**2 + u41km1**2 ) )
- > + (1.0d+00/6.0d+00)
- > * tz3 * ( u41k**2 - u41km1**2 )
- > + c1 * c5 * tz3 * ( u51k - u51km1 )
- end do
- end do
- end do
-
- do k = 2, nz - 1
- do j = jst, jend
- do i = ist, iend
- rsd(1,i,j,k) = rsd(1,i,j,k)
- > + dz1 * tz1 * ( u(1,i,j,k-1)
- > - 2.0d+00 * u(1,i,j,k)
- > + u(1,i,j,k+1) )
- rsd(2,i,j,k) = rsd(2,i,j,k)
- > + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) )
- > + dz2 * tz1 * ( u(2,i,j,k-1)
- > - 2.0d+00 * u(2,i,j,k)
- > + u(2,i,j,k+1) )
- rsd(3,i,j,k) = rsd(3,i,j,k)
- > + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) )
- > + dz3 * tz1 * ( u(3,i,j,k-1)
- > - 2.0d+00 * u(3,i,j,k)
- > + u(3,i,j,k+1) )
- rsd(4,i,j,k) = rsd(4,i,j,k)
- > + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) )
- > + dz4 * tz1 * ( u(4,i,j,k-1)
- > - 2.0d+00 * u(4,i,j,k)
- > + u(4,i,j,k+1) )
- rsd(5,i,j,k) = rsd(5,i,j,k)
- > + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) )
- > + dz5 * tz1 * ( u(5,i,j,k-1)
- > - 2.0d+00 * u(5,i,j,k)
- > + u(5,i,j,k+1) )
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c fourth-order dissipation
-c---------------------------------------------------------------------
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- rsd(m,i,j,2) = rsd(m,i,j,2)
- > - dssp * ( + 5.0d+00 * u(m,i,j,2)
- > - 4.0d+00 * u(m,i,j,3)
- > + u(m,i,j,4) )
- rsd(m,i,j,3) = rsd(m,i,j,3)
- > - dssp * ( - 4.0d+00 * u(m,i,j,2)
- > + 6.0d+00 * u(m,i,j,3)
- > - 4.0d+00 * u(m,i,j,4)
- > + u(m,i,j,5) )
- end do
- end do
- end do
-
- do k = 4, nz - 3
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- rsd(m,i,j,k) = rsd(m,i,j,k)
- > - dssp * ( u(m,i,j,k-2)
- > - 4.0d+00 * u(m,i,j,k-1)
- > + 6.0d+00 * u(m,i,j,k)
- > - 4.0d+00 * u(m,i,j,k+1)
- > + u(m,i,j,k+2) )
- end do
- end do
- end do
- end do
-
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- rsd(m,i,j,nz-2) = rsd(m,i,j,nz-2)
- > - dssp * ( u(m,i,j,nz-4)
- > - 4.0d+00 * u(m,i,j,nz-3)
- > + 6.0d+00 * u(m,i,j,nz-2)
- > - 4.0d+00 * u(m,i,j,nz-1) )
- rsd(m,i,j,nz-1) = rsd(m,i,j,nz-1)
- > - dssp * ( u(m,i,j,nz-3)
- > - 4.0d+00 * u(m,i,j,nz-2)
- > + 5.0d+00 * u(m,i,j,nz-1) )
- end do
- end do
- end do
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine setbv
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c set the boundary values of dependent variables
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, k
- integer iglob, jglob
-
-c---------------------------------------------------------------------
-c set the dependent variable values along the top and bottom faces
-c---------------------------------------------------------------------
- do j = 1, ny
- jglob = jpt + j
- do i = 1, nx
- iglob = ipt + i
- call exact( iglob, jglob, 1, u( 1, i, j, 1 ) )
- call exact( iglob, jglob, nz, u( 1, i, j, nz ) )
- end do
- end do
-
-c---------------------------------------------------------------------
-c set the dependent variable values along north and south faces
-c---------------------------------------------------------------------
- IF (west.eq.-1) then
- do k = 1, nz
- do i = 1, nx
- iglob = ipt + i
- call exact( iglob, 1, k, u( 1, i, 1, k ) )
- end do
- end do
- END IF
-
- IF (east.eq.-1) then
- do k = 1, nz
- do i = 1, nx
- iglob = ipt + i
- call exact( iglob, ny0, k, u( 1, i, ny, k ) )
- end do
- end do
- END IF
-
-c---------------------------------------------------------------------
-c set the dependent variable values along east and west faces
-c---------------------------------------------------------------------
- IF (north.eq.-1) then
- do k = 1, nz
- do j = 1, ny
- jglob = jpt + j
- call exact( 1, jglob, k, u( 1, 1, j, k ) )
- end do
- end do
- END IF
-
- IF (south.eq.-1) then
- do k = 1, nz
- do j = 1, ny
- jglob = jpt + j
- call exact( nx0, jglob, k, u( 1, nx, j, k ) )
- end do
- end do
- END IF
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine setcoeff
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
-c set up coefficients
-c---------------------------------------------------------------------
- dxi = 1.0d+00 / ( nx0 - 1 )
- deta = 1.0d+00 / ( ny0 - 1 )
- dzeta = 1.0d+00 / ( nz0 - 1 )
-
- tx1 = 1.0d+00 / ( dxi * dxi )
- tx2 = 1.0d+00 / ( 2.0d+00 * dxi )
- tx3 = 1.0d+00 / dxi
-
- ty1 = 1.0d+00 / ( deta * deta )
- ty2 = 1.0d+00 / ( 2.0d+00 * deta )
- ty3 = 1.0d+00 / deta
-
- tz1 = 1.0d+00 / ( dzeta * dzeta )
- tz2 = 1.0d+00 / ( 2.0d+00 * dzeta )
- tz3 = 1.0d+00 / dzeta
-
- ii1 = 2
- ii2 = nx0 - 1
- ji1 = 2
- ji2 = ny0 - 2
- ki1 = 3
- ki2 = nz0 - 1
-
-c---------------------------------------------------------------------
-c diffusion coefficients
-c---------------------------------------------------------------------
- dx1 = 0.75d+00
- dx2 = dx1
- dx3 = dx1
- dx4 = dx1
- dx5 = dx1
-
- dy1 = 0.75d+00
- dy2 = dy1
- dy3 = dy1
- dy4 = dy1
- dy5 = dy1
-
- dz1 = 1.00d+00
- dz2 = dz1
- dz3 = dz1
- dz4 = dz1
- dz5 = dz1
-
-c---------------------------------------------------------------------
-c fourth difference dissipation
-c---------------------------------------------------------------------
- dssp = ( max (dx1, dy1, dz1 ) ) / 4.0d+00
-
-c---------------------------------------------------------------------
-c coefficients of the exact solution to the first pde
-c---------------------------------------------------------------------
- ce(1,1) = 2.0d+00
- ce(1,2) = 0.0d+00
- ce(1,3) = 0.0d+00
- ce(1,4) = 4.0d+00
- ce(1,5) = 5.0d+00
- ce(1,6) = 3.0d+00
- ce(1,7) = 5.0d-01
- ce(1,8) = 2.0d-02
- ce(1,9) = 1.0d-02
- ce(1,10) = 3.0d-02
- ce(1,11) = 5.0d-01
- ce(1,12) = 4.0d-01
- ce(1,13) = 3.0d-01
-
-c---------------------------------------------------------------------
-c coefficients of the exact solution to the second pde
-c---------------------------------------------------------------------
- ce(2,1) = 1.0d+00
- ce(2,2) = 0.0d+00
- ce(2,3) = 0.0d+00
- ce(2,4) = 0.0d+00
- ce(2,5) = 1.0d+00
- ce(2,6) = 2.0d+00
- ce(2,7) = 3.0d+00
- ce(2,8) = 1.0d-02
- ce(2,9) = 3.0d-02
- ce(2,10) = 2.0d-02
- ce(2,11) = 4.0d-01
- ce(2,12) = 3.0d-01
- ce(2,13) = 5.0d-01
-
-c---------------------------------------------------------------------
-c coefficients of the exact solution to the third pde
-c---------------------------------------------------------------------
- ce(3,1) = 2.0d+00
- ce(3,2) = 2.0d+00
- ce(3,3) = 0.0d+00
- ce(3,4) = 0.0d+00
- ce(3,5) = 0.0d+00
- ce(3,6) = 2.0d+00
- ce(3,7) = 3.0d+00
- ce(3,8) = 4.0d-02
- ce(3,9) = 3.0d-02
- ce(3,10) = 5.0d-02
- ce(3,11) = 3.0d-01
- ce(3,12) = 5.0d-01
- ce(3,13) = 4.0d-01
-
-c---------------------------------------------------------------------
-c coefficients of the exact solution to the fourth pde
-c---------------------------------------------------------------------
- ce(4,1) = 2.0d+00
- ce(4,2) = 2.0d+00
- ce(4,3) = 0.0d+00
- ce(4,4) = 0.0d+00
- ce(4,5) = 0.0d+00
- ce(4,6) = 2.0d+00
- ce(4,7) = 3.0d+00
- ce(4,8) = 3.0d-02
- ce(4,9) = 5.0d-02
- ce(4,10) = 4.0d-02
- ce(4,11) = 2.0d-01
- ce(4,12) = 1.0d-01
- ce(4,13) = 3.0d-01
-
-c---------------------------------------------------------------------
-c coefficients of the exact solution to the fifth pde
-c---------------------------------------------------------------------
- ce(5,1) = 5.0d+00
- ce(5,2) = 4.0d+00
- ce(5,3) = 3.0d+00
- ce(5,4) = 2.0d+00
- ce(5,5) = 1.0d-01
- ce(5,6) = 4.0d-01
- ce(5,7) = 3.0d-01
- ce(5,8) = 5.0d-02
- ce(5,9) = 4.0d-02
- ce(5,10) = 3.0d-02
- ce(5,11) = 1.0d-01
- ce(5,12) = 3.0d-01
- ce(5,13) = 2.0d-01
-
- return
- end
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine sethyper
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c for each column in a hyperplane, istart = first row,
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j
- integer iglob, jglob
- integer kp
-
-c---------------------------------------------------------------------
-c compute the pointers for hyperplanes
-c---------------------------------------------------------------------
- do kp = 2,nx0+ny0
- icomms(kp) = .false.
- icommn(kp) = .false.
- icomme(kp) = .false.
- icommw(kp) = .false.
-
-c---------------------------------------------------------------------
-c check to see if comm. to south is required
-c---------------------------------------------------------------------
- if (south.ne.-1) then
- i = iend
- iglob = ipt + i
- jglob = kp - iglob
- j = jglob - jpt
- if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and.
- > j.le.jend) icomms(kp) = .true.
- end if
-
-c---------------------------------------------------------------------
-c check to see if comm. to north is required
-c---------------------------------------------------------------------
- if (north.ne.-1) then
- i = ist
- iglob = ipt + i
- jglob = kp - iglob
- j = jglob - jpt
- if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and.
- > j.le.jend) icommn(kp) = .true.
- end if
-
-c---------------------------------------------------------------------
-c check to see if comm. to east is required
-c---------------------------------------------------------------------
- if (east.ne.-1) then
- j = jend
- jglob = jpt + j
- iglob = kp - jglob
- i = iglob - ipt
- if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and.
- > i.le.iend) icomme(kp) = .true.
- end if
-
-c---------------------------------------------------------------------
-c check to see if comm. to west is required
-c---------------------------------------------------------------------
- if (west.ne.-1) then
- j = jst
- jglob = jpt + j
- iglob = kp - jglob
- i = iglob - ipt
- if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and.
- > i.le.iend) icommw(kp) = .true.
- end if
-
- end do
-
- icomms(1) = .false.
- icommn(1) = .false.
- icomme(1) = .false.
- icommw(1) = .false.
- icomms(nx0+ny0+1) = .false.
- icommn(nx0+ny0+1) = .false.
- icomme(nx0+ny0+1) = .false.
- icommw(nx0+ny0+1) = .false.
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
- subroutine setiv
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c
-c set the initial values of independent variables based on tri-linear
-c interpolation of boundary values in the computational space.
-c
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, k, m
- integer iglob, jglob
- double precision xi, eta, zeta
- double precision pxi, peta, pzeta
- double precision ue_1jk(5),ue_nx0jk(5),ue_i1k(5),
- > ue_iny0k(5),ue_ij1(5),ue_ijnz(5)
-
-
- do k = 2, nz - 1
- zeta = ( dble (k-1) ) / (nz-1)
- do j = 1, ny
- jglob = jpt + j
- IF (jglob.ne.1.and.jglob.ne.ny0) then
- eta = ( dble (jglob-1) ) / (ny0-1)
- do i = 1, nx
- iglob = ipt + i
- IF (iglob.ne.1.and.iglob.ne.nx0) then
- xi = ( dble (iglob-1) ) / (nx0-1)
- call exact (1,jglob,k,ue_1jk)
- call exact (nx0,jglob,k,ue_nx0jk)
- call exact (iglob,1,k,ue_i1k)
- call exact (iglob,ny0,k,ue_iny0k)
- call exact (iglob,jglob,1,ue_ij1)
- call exact (iglob,jglob,nz,ue_ijnz)
- do m = 1, 5
- pxi = ( 1.0d+00 - xi ) * ue_1jk(m)
- > + xi * ue_nx0jk(m)
- peta = ( 1.0d+00 - eta ) * ue_i1k(m)
- > + eta * ue_iny0k(m)
- pzeta = ( 1.0d+00 - zeta ) * ue_ij1(m)
- > + zeta * ue_ijnz(m)
-
- u( m, i, j, k ) = pxi + peta + pzeta
- > - pxi * peta - peta * pzeta - pzeta * pxi
- > + pxi * peta * pzeta
-
- end do
- END IF
- end do
- END IF
- end do
- end do
-
- return
- end
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine ssor(niter)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c to perform pseudo-time stepping SSOR iterations
-c for five nonlinear pde's.
-c---------------------------------------------------------------------
-
- implicit none
- integer niter
-
- include 'mpinpb.h'
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer i, j, k, m
- integer istep
- double precision tmp
- double precision delunm(5), tv(5,isiz1,isiz2)
-
- external timer_read
- double precision wtime, timer_read
-
- integer IERROR
-
-
- ROOT = 0
-
-c---------------------------------------------------------------------
-c begin pseudo-time stepping iterations
-c---------------------------------------------------------------------
- tmp = 1.0d+00 / ( omega * ( 2.0d+00 - omega ) )
-
-c---------------------------------------------------------------------
-c initialize a,b,c,d to zero (guarantees that page tables have been
-c formed, if applicable on given architecture, before timestepping).
-c---------------------------------------------------------------------
- do m=1,isiz2
- do k=1,isiz1
- do j=1,5
- do i=1,5
- a(i,j,k,m) = 0.d0
- b(i,j,k,m) = 0.d0
- c(i,j,k,m) = 0.d0
- d(i,j,k,m) = 0.d0
- enddo
- enddo
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c compute the steady-state residuals
-c---------------------------------------------------------------------
- call rhs
-
-c---------------------------------------------------------------------
-c compute the L2 norms of newton iteration residuals
-c---------------------------------------------------------------------
- call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
- > ist, iend, jst, jend,
- > rsd, rsdnm )
-
- call MPI_BARRIER( MPI_COMM_WORLD, IERROR )
-
- call timer_clear(1)
- call timer_start(1)
-
-c---------------------------------------------------------------------
-c the timestep loop
-c---------------------------------------------------------------------
- do istep = 1, niter
-
- if (id .eq. 0) then
- if (mod ( istep, 20) .eq. 0 .or.
- > istep .eq. itmax .or.
- > istep .eq. 1) then
- if (niter .gt. 1) write( *, 200) istep
- 200 format(' Time step ', i4)
- endif
- endif
-
-c---------------------------------------------------------------------
-c perform SSOR iteration
-c---------------------------------------------------------------------
- do k = 2, nz - 1
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- rsd(m,i,j,k) = dt * rsd(m,i,j,k)
- end do
- end do
- end do
- end do
-
- DO k = 2, nz -1
-c---------------------------------------------------------------------
-c form the lower triangular part of the jacobian matrix
-c---------------------------------------------------------------------
- call jacld(k)
-
-c---------------------------------------------------------------------
-c perform the lower triangular solution
-c---------------------------------------------------------------------
- call blts( isiz1, isiz2, isiz3,
- > nx, ny, nz, k,
- > omega,
- > rsd,
- > a, b, c, d,
- > ist, iend, jst, jend,
- > nx0, ny0, ipt, jpt)
- END DO
-
- DO k = nz - 1, 2, -1
-c---------------------------------------------------------------------
-c form the strictly upper triangular part of the jacobian matrix
-c---------------------------------------------------------------------
- call jacu(k)
-
-c---------------------------------------------------------------------
-c perform the upper triangular solution
-c---------------------------------------------------------------------
- call buts( isiz1, isiz2, isiz3,
- > nx, ny, nz, k,
- > omega,
- > rsd, tv,
- > d, a, b, c,
- > ist, iend, jst, jend,
- > nx0, ny0, ipt, jpt)
- END DO
-
-c---------------------------------------------------------------------
-c update the variables
-c---------------------------------------------------------------------
-
- do k = 2, nz-1
- do j = jst, jend
- do i = ist, iend
- do m = 1, 5
- u( m, i, j, k ) = u( m, i, j, k )
- > + tmp * rsd( m, i, j, k )
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c compute the max-norms of newton iteration corrections
-c---------------------------------------------------------------------
- if ( mod ( istep, inorm ) .eq. 0 ) then
- call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
- > ist, iend, jst, jend,
- > rsd, delunm )
-c if ( ipr .eq. 1 .and. id .eq. 0 ) then
-c write (*,1006) ( delunm(m), m = 1, 5 )
-c else if ( ipr .eq. 2 .and. id .eq. 0 ) then
-c write (*,'(i5,f15.6)') istep,delunm(5)
-c end if
- end if
-
-c---------------------------------------------------------------------
-c compute the steady-state residuals
-c---------------------------------------------------------------------
- call rhs
-
-c---------------------------------------------------------------------
-c compute the max-norms of newton iteration residuals
-c---------------------------------------------------------------------
- if ( ( mod ( istep, inorm ) .eq. 0 ) .or.
- > ( istep .eq. itmax ) ) then
- call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0,
- > ist, iend, jst, jend,
- > rsd, rsdnm )
-c if ( ipr .eq. 1.and.id.eq.0 ) then
-c write (*,1007) ( rsdnm(m), m = 1, 5 )
-c end if
- end if
-
-c---------------------------------------------------------------------
-c check the newton-iteration residuals against the tolerance levels
-c---------------------------------------------------------------------
- if ( ( rsdnm(1) .lt. tolrsd(1) ) .and.
- > ( rsdnm(2) .lt. tolrsd(2) ) .and.
- > ( rsdnm(3) .lt. tolrsd(3) ) .and.
- > ( rsdnm(4) .lt. tolrsd(4) ) .and.
- > ( rsdnm(5) .lt. tolrsd(5) ) ) then
-c if (ipr .eq. 1 .and. id.eq.0) then
-c write (*,1004) istep
-c end if
- return
- end if
-
- end do
-
- call timer_stop(1)
- wtime = timer_read(1)
-
-
- call MPI_ALLREDUCE( wtime,
- > maxtime,
- > 1,
- > MPI_DOUBLE_PRECISION,
- > MPI_MAX,
- > MPI_COMM_WORLD,
- > IERROR )
-
-
-
- return
-
- 1001 format (1x/5x,'pseudo-time SSOR iteration no.=',i4/)
- 1004 format (1x/1x,'convergence was achieved after ',i4,
- > ' pseudo-time steps' )
- 1006 format (1x/1x,'RMS-norm of SSOR-iteration correction ',
- > 'for first pde = ',1pe12.5/,
- > 1x,'RMS-norm of SSOR-iteration correction ',
- > 'for second pde = ',1pe12.5/,
- > 1x,'RMS-norm of SSOR-iteration correction ',
- > 'for third pde = ',1pe12.5/,
- > 1x,'RMS-norm of SSOR-iteration correction ',
- > 'for fourth pde = ',1pe12.5/,
- > 1x,'RMS-norm of SSOR-iteration correction ',
- > 'for fifth pde = ',1pe12.5)
- 1007 format (1x/1x,'RMS-norm of steady-state residual for ',
- > 'first pde = ',1pe12.5/,
- > 1x,'RMS-norm of steady-state residual for ',
- > 'second pde = ',1pe12.5/,
- > 1x,'RMS-norm of steady-state residual for ',
- > 'third pde = ',1pe12.5/,
- > 1x,'RMS-norm of steady-state residual for ',
- > 'fourth pde = ',1pe12.5/,
- > 1x,'RMS-norm of steady-state residual for ',
- > 'fifth pde = ',1pe12.5)
-
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine subdomain
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'applu.incl'
-
-c---------------------------------------------------------------------
-c local variables
-c---------------------------------------------------------------------
- integer mm, ierror, errorcode
-
-
-c---------------------------------------------------------------------
-c
-c set up the sub-domain sizes
-c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c x dimension
-c---------------------------------------------------------------------
- mm = mod(nx0,xdim)
- if (row.le.mm) then
- nx = nx0/xdim + 1
- ipt = (row-1)*nx
- else
- nx = nx0/xdim
- ipt = (row-1)*nx + mm
- end if
-
-c---------------------------------------------------------------------
-c y dimension
-c---------------------------------------------------------------------
- mm = mod(ny0,ydim)
- if (col.le.mm) then
- ny = ny0/ydim + 1
- jpt = (col-1)*ny
- else
- ny = ny0/ydim
- jpt = (col-1)*ny + mm
- end if
-
-c---------------------------------------------------------------------
-c z dimension
-c---------------------------------------------------------------------
- nz = nz0
-
-c---------------------------------------------------------------------
-c check the sub-domain size
-c---------------------------------------------------------------------
- if ( ( nx .lt. 4 ) .or.
- > ( ny .lt. 4 ) .or.
- > ( nz .lt. 4 ) ) then
- write (*,2001) nx, ny, nz
- 2001 format (5x,'SUBDOMAIN SIZE IS TOO SMALL - ',
- > /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS',
- > /5x,'SO THAT NX, NY AND NZ ARE GREATER THAN OR EQUAL',
- > /5x,'TO 4 THEY ARE CURRENTLY', 3I3)
- CALL MPI_ABORT( MPI_COMM_WORLD,
- > ERRORCODE,
- > IERROR )
- end if
-
- if ( ( nx .gt. isiz1 ) .or.
- > ( ny .gt. isiz2 ) .or.
- > ( nz .gt. isiz3 ) ) then
- write (*,2002) nx, ny, nz
- 2002 format (5x,'SUBDOMAIN SIZE IS TOO LARGE - ',
- > /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS',
- > /5x,'SO THAT NX, NY AND NZ ARE LESS THAN OR EQUAL TO ',
- > /5x,'ISIZ1, ISIZ2 AND ISIZ3 RESPECTIVELY. THEY ARE',
- > /5x,'CURRENTLY', 3I4)
- CALL MPI_ABORT( MPI_COMM_WORLD,
- > ERRORCODE,
- > IERROR )
- end if
-
-
-c---------------------------------------------------------------------
-c set up the start and end in i and j extents for all processors
-c---------------------------------------------------------------------
- ist = 1
- iend = nx
- if (north.eq.-1) ist = 2
- if (south.eq.-1) iend = nx - 1
-
- jst = 1
- jend = ny
- if (west.eq.-1) jst = 2
- if (east.eq.-1) jend = ny - 1
-
- return
- end
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine verify(xcr, xce, xci, class, verified)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c verification routine
-c---------------------------------------------------------------------
-
- implicit none
- include 'mpinpb.h'
- include 'applu.incl'
-
- double precision xcr(5), xce(5), xci
- double precision xcrref(5),xceref(5),xciref,
- > xcrdif(5),xcedif(5),xcidif,
- > epsilon, dtref
- integer m
- character class
- logical verified
-
-c---------------------------------------------------------------------
-c tolerance level
-c---------------------------------------------------------------------
- epsilon = 1.0d-08
-
- class = 'U'
- verified = .true.
-
- do m = 1,5
- xcrref(m) = 1.0
- xceref(m) = 1.0
- end do
- xciref = 1.0
-
- if ( (nx0 .eq. 12 ) .and.
- > (ny0 .eq. 12 ) .and.
- > (nz0 .eq. 12 ) .and.
- > (itmax .eq. 50 )) then
-
- class = 'S'
- dtref = 5.0d-1
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual, for the (12X12X12) grid,
-c after 50 time steps, with DT = 5.0d-01
-c---------------------------------------------------------------------
- xcrref(1) = 1.6196343210976702d-02
- xcrref(2) = 2.1976745164821318d-03
- xcrref(3) = 1.5179927653399185d-03
- xcrref(4) = 1.5029584435994323d-03
- xcrref(5) = 3.4264073155896461d-02
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error, for the (12X12X12) grid,
-c after 50 time steps, with DT = 5.0d-01
-c---------------------------------------------------------------------
- xceref(1) = 6.4223319957960924d-04
- xceref(2) = 8.4144342047347926d-05
- xceref(3) = 5.8588269616485186d-05
- xceref(4) = 5.8474222595157350d-05
- xceref(5) = 1.3103347914111294d-03
-
-c---------------------------------------------------------------------
-c Reference value of surface integral, for the (12X12X12) grid,
-c after 50 time steps, with DT = 5.0d-01
-c---------------------------------------------------------------------
- xciref = 7.8418928865937083d+00
-
-
- elseif ( (nx0 .eq. 33) .and.
- > (ny0 .eq. 33) .and.
- > (nz0 .eq. 33) .and.
- > (itmax . eq. 300) ) then
-
- class = 'W' !SPEC95fp size
- dtref = 1.5d-3
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual, for the (33x33x33) grid,
-c after 300 time steps, with DT = 1.5d-3
-c---------------------------------------------------------------------
- xcrref(1) = 0.1236511638192d+02
- xcrref(2) = 0.1317228477799d+01
- xcrref(3) = 0.2550120713095d+01
- xcrref(4) = 0.2326187750252d+01
- xcrref(5) = 0.2826799444189d+02
-
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error, for the (33X33X33) grid,
-c---------------------------------------------------------------------
- xceref(1) = 0.4867877144216d+00
- xceref(2) = 0.5064652880982d-01
- xceref(3) = 0.9281818101960d-01
- xceref(4) = 0.8570126542733d-01
- xceref(5) = 0.1084277417792d+01
-
-
-c---------------------------------------------------------------------
-c Reference value of surface integral, for the (33X33X33) grid,
-c after 300 time steps, with DT = 1.5d-3
-c---------------------------------------------------------------------
- xciref = 0.1161399311023d+02
-
- elseif ( (nx0 .eq. 64) .and.
- > (ny0 .eq. 64) .and.
- > (nz0 .eq. 64) .and.
- > (itmax . eq. 250) ) then
-
- class = 'A'
- dtref = 2.0d+0
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual, for the (64X64X64) grid,
-c after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
- xcrref(1) = 7.7902107606689367d+02
- xcrref(2) = 6.3402765259692870d+01
- xcrref(3) = 1.9499249727292479d+02
- xcrref(4) = 1.7845301160418537d+02
- xcrref(5) = 1.8384760349464247d+03
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error, for the (64X64X64) grid,
-c after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
- xceref(1) = 2.9964085685471943d+01
- xceref(2) = 2.8194576365003349d+00
- xceref(3) = 7.3473412698774742d+00
- xceref(4) = 6.7139225687777051d+00
- xceref(5) = 7.0715315688392578d+01
-
-c---------------------------------------------------------------------
-c Reference value of surface integral, for the (64X64X64) grid,
-c after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
- xciref = 2.6030925604886277d+01
-
-
- elseif ( (nx0 .eq. 102) .and.
- > (ny0 .eq. 102) .and.
- > (nz0 .eq. 102) .and.
- > (itmax . eq. 250) ) then
-
- class = 'B'
- dtref = 2.0d+0
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual, for the (102X102X102) grid,
-c after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
- xcrref(1) = 3.5532672969982736d+03
- xcrref(2) = 2.6214750795310692d+02
- xcrref(3) = 8.8333721850952190d+02
- xcrref(4) = 7.7812774739425265d+02
- xcrref(5) = 7.3087969592545314d+03
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error, for the (102X102X102)
-c grid, after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
- xceref(1) = 1.1401176380212709d+02
- xceref(2) = 8.1098963655421574d+00
- xceref(3) = 2.8480597317698308d+01
- xceref(4) = 2.5905394567832939d+01
- xceref(5) = 2.6054907504857413d+02
-
-c---------------------------------------------------------------------
-c Reference value of surface integral, for the (102X102X102) grid,
-c after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
- xciref = 4.7887162703308227d+01
-
- elseif ( (nx0 .eq. 162) .and.
- > (ny0 .eq. 162) .and.
- > (nz0 .eq. 162) .and.
- > (itmax . eq. 250) ) then
-
- class = 'C'
- dtref = 2.0d+0
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual, for the (162X162X162) grid,
-c after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
- xcrref(1) = 1.03766980323537846d+04
- xcrref(2) = 8.92212458801008552d+02
- xcrref(3) = 2.56238814582660871d+03
- xcrref(4) = 2.19194343857831427d+03
- xcrref(5) = 1.78078057261061185d+04
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error, for the (162X162X162)
-c grid, after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
- xceref(1) = 2.15986399716949279d+02
- xceref(2) = 1.55789559239863600d+01
- xceref(3) = 5.41318863077207766d+01
- xceref(4) = 4.82262643154045421d+01
- xceref(5) = 4.55902910043250358d+02
-
-c---------------------------------------------------------------------
-c Reference value of surface integral, for the (162X162X162) grid,
-c after 250 time steps, with DT = 2.0d+00
-c---------------------------------------------------------------------
- xciref = 6.66404553572181300d+01
-
- elseif ( (nx0 .eq. 408) .and.
- > (ny0 .eq. 408) .and.
- > (nz0 .eq. 408) .and.
- > (itmax . eq. 300) ) then
-
- class = 'D'
- dtref = 1.0d+0
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual, for the (408X408X408) grid,
-c after 300 time steps, with DT = 1.0d+00
-c---------------------------------------------------------------------
- xcrref(1) = 0.4868417937025d+05
- xcrref(2) = 0.4696371050071d+04
- xcrref(3) = 0.1218114549776d+05
- xcrref(4) = 0.1033801493461d+05
- xcrref(5) = 0.7142398413817d+05
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error, for the (408X408X408)
-c grid, after 300 time steps, with DT = 1.0d+00
-c---------------------------------------------------------------------
- xceref(1) = 0.3752393004482d+03
- xceref(2) = 0.3084128893659d+02
- xceref(3) = 0.9434276905469d+02
- xceref(4) = 0.8230686681928d+02
- xceref(5) = 0.7002620636210d+03
-
-c---------------------------------------------------------------------
-c Reference value of surface integral, for the (408X408X408) grid,
-c after 300 time steps, with DT = 1.0d+00
-c---------------------------------------------------------------------
- xciref = 0.8334101392503d+02
-
- elseif ( (nx0 .eq. 1020) .and.
- > (ny0 .eq. 1020) .and.
- > (nz0 .eq. 1020) .and.
- > (itmax . eq. 300) ) then
-
- class = 'E'
- dtref = 0.5d+0
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual, for the (1020X1020X1020) grid,
-c after 300 time steps, with DT = 0.5d+00
-c---------------------------------------------------------------------
- xcrref(1) = 0.2099641687874d+06
- xcrref(2) = 0.2130403143165d+05
- xcrref(3) = 0.5319228789371d+05
- xcrref(4) = 0.4509761639833d+05
- xcrref(5) = 0.2932360006590d+06
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error, for the (1020X1020X1020)
-c grid, after 300 time steps, with DT = 0.5d+00
-c---------------------------------------------------------------------
- xceref(1) = 0.4800572578333d+03
- xceref(2) = 0.4221993400184d+02
- xceref(3) = 0.1210851906824d+03
- xceref(4) = 0.1047888986770d+03
- xceref(5) = 0.8363028257389d+03
-
-c---------------------------------------------------------------------
-c Reference value of surface integral, for the (1020X1020X1020) grid,
-c after 300 time steps, with DT = 0.5d+00
-c---------------------------------------------------------------------
- xciref = 0.9512163272273d+02
-
- else
- verified = .FALSE.
- endif
-
-c---------------------------------------------------------------------
-c verification test for residuals if gridsize is one of
-c the defined grid sizes above (class .ne. 'U')
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Compute the difference of solution values and the known reference values.
-c---------------------------------------------------------------------
- do m = 1, 5
-
- xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m))
- xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
-
- enddo
- xcidif = dabs((xci - xciref)/xciref)
-
-
-c---------------------------------------------------------------------
-c Output the comparison of computed results to known cases.
-c---------------------------------------------------------------------
-
- if (class .ne. 'U') then
- write(*, 1990) class
- 1990 format(/, ' Verification being performed for class ', a)
- write (*,2000) epsilon
- 2000 format(' Accuracy setting for epsilon = ', E20.13)
- verified = (dabs(dt-dtref) .le. epsilon)
- if (.not.verified) then
- class = 'U'
- write (*,1000) dtref
- 1000 format(' DT does not match the reference value of ',
- > E15.8)
- endif
- else
- write(*, 1995)
- 1995 format(' Unknown class')
- endif
-
-
- if (class .ne. 'U') then
- write (*,2001)
- else
- write (*, 2005)
- endif
-
- 2001 format(' Comparison of RMS-norms of residual')
- 2005 format(' RMS-norms of residual')
- do m = 1, 5
- if (class .eq. 'U') then
- write(*, 2015) m, xcr(m)
- else if (xcrdif(m) .le. epsilon) then
- write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
- else
- verified = .false.
- write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
- endif
- enddo
-
- if (class .ne. 'U') then
- write (*,2002)
- else
- write (*,2006)
- endif
- 2002 format(' Comparison of RMS-norms of solution error')
- 2006 format(' RMS-norms of solution error')
-
- do m = 1, 5
- if (class .eq. 'U') then
- write(*, 2015) m, xce(m)
- else if (xcedif(m) .le. epsilon) then
- write (*,2011) m,xce(m),xceref(m),xcedif(m)
- else
- verified = .false.
- write (*,2010) m,xce(m),xceref(m),xcedif(m)
- endif
- enddo
-
- 2010 format(' FAILURE: ', i2, 2x, E20.13, E20.13, E20.13)
- 2011 format(' ', i2, 2x, E20.13, E20.13, E20.13)
- 2015 format(' ', i2, 2x, E20.13)
-
- if (class .ne. 'U') then
- write (*,2025)
- else
- write (*,2026)
- endif
- 2025 format(' Comparison of surface integral')
- 2026 format(' Surface integral')
-
-
- if (class .eq. 'U') then
- write(*, 2030) xci
- else if (xcidif .le. epsilon) then
- write(*, 2032) xci, xciref, xcidif
- else
- verified = .false.
- write(*, 2031) xci, xciref, xcidif
- endif
-
- 2030 format(' ', 4x, E20.13)
- 2031 format(' FAILURE: ', 4x, E20.13, E20.13, E20.13)
- 2032 format(' ', 4x, E20.13, E20.13, E20.13)
-
-
-
- if (class .eq. 'U') then
- write(*, 2022)
- write(*, 2023)
- 2022 format(' No reference values provided')
- 2023 format(' No verification performed')
- else if (verified) then
- write(*, 2020)
- 2020 format(' Verification Successful')
- else
- write(*, 2021)
- 2021 format(' Verification failed')
- endif
-
- return
-
-
- end
+++ /dev/null
-SHELL=/bin/sh
-BENCHMARK=mg
-BENCHMARKU=MG
-
-include ../config/make.def
-
-OBJS = mg.o ${COMMON}/print_results.o \
- ${COMMON}/${RAND}.o ${COMMON}/timers.o
-
-include ../sys/make.common
-
-${PROGRAM}: config ${OBJS}
- ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
-
-mg.o: mg.f globals.h mpinpb.h npbparams.h
- ${FCOMPILE} mg.f
-
-clean:
- - rm -f *.o *~
- - rm -f npbparams.h core
-
-
-
+++ /dev/null
-Some info about the MG benchmark
-================================
-
-'mg_demo' demonstrates the capabilities of a very simple multigrid
-solver in computing a three dimensional potential field. This is
-a simplified multigrid solver in two important respects:
-
- (1) it solves only a constant coefficient equation,
- and that only on a uniform cubical grid,
-
- (2) it solves only a single equation, representing
- a scalar field rather than a vector field.
-
-We chose it for its portability and simplicity, and expect that a
-supercomputer which can run it effectively will also be able to
-run more complex multigrid programs at least as well.
-
- Eric Barszcz Paul Frederickson
- RIACS
- NASA Ames Research Center NASA Ames Research Center
-
-========================================================================
-Running the program: (Note: also see parameter lm information in the
- two sections immediately below this section)
-
-The program may be run with or without an input deck (called "mg.input").
-The following describes a few things about the input deck if you want to
-use one.
-
-The four lines below are the "mg.input" file required to run a
-problem of total size 256x256x256, for 4 iterations (Class "A"),
-and presumes the use of 8 processors:
-
- 8 = top level
- 256 256 256 = nx ny nz
- 4 = nit
- 0 0 0 0 0 0 0 0 = debug_vec
-
-The first line of input indicates how many levels of multi-grid
-cycle will be applied to a particular subpartition. Presuming that
-8 processors are solving this problem (recall that the number of
-processors is specified to MPI as a run parameter, and MPI subsequently
-determines this for the code via an MPI subroutine call), a 2x2x2
-processor grid is formed, and thus each partition on a processor is
-of size 128x128x128. Therefore, a maximum of 8 multi-grid levels may
-be used. These are of size 128,64,32,16,8,4,2,1, with the coarsest
-level being a single point on a given processor.
-
-
-Next, consider the same size problem but running on 1 processor. The
-following "mg.input" file is appropriate:
-
- 9 = top level
- 256 256 256 = nx ny nz
- 4 = nit
- 0 0 0 0 0 0 0 0 = debug_vec
-
-Since this processor must solve the full 256x256x256 problem, this
-permits 9 multi-grid levels (256,128,64,32,16,8,4,2,1), resulting in
-a coarsest multi-grid level of a single point on the processor
-
-
-Next, consider the same size problem but running on 2 processors. The
-following "mg.input" file is required:
-
- 8 = top level
- 256 256 256 = nx ny nz
- 4 = nit
- 0 0 0 0 0 0 0 0 = debug_vec
-
-The algorithm for partitioning the full grid onto some power of 2 number
-of processors is to start by splitting the last dimension of the grid
-(z dimension) in 2: the problem is now partitioned onto 2 processors.
-Next the middle dimension (y dimension) is split in 2: the problem is now
-partitioned onto 4 processors. Next, first dimension (x dimension) is
-split in 2: the problem is now partitioned onto 8 processors. Next, the
-last dimension (z dimension) is split again in 2: the problem is now
-partitioned onto 16 processors. This partitioning is repeated until all
-of the power of 2 processors have been allocated.
-
-Thus to run the above problem on 2 processors, the grid partitioning
-algorithm will allocate the two processors across the last dimension,
-creating two partitions each of size 256x256x128. The coarsest level of
-multi-grid must be a single point surrounded by a cubic number of grid
-points. Therefore, each of the two processor partitions will contain 4
-coarsest multi-grid level points, each surrounded by a cube of grid points
-of size 128x128x128, indicated by a top level of 8.
-
-
-Next, consider the same size problem but running on 4 processors. The
-following "mg.input" file is required:
-
- 8 = top level
- 256 256 256 = nx ny nz
- 4 = nit
- 0 0 0 0 0 0 0 0 = debug_vec
-
-The partitioning algorithm will create 4 partitions, each of size
-256x128x128. Each partition will contain 2 coarsest multi-grid level
-points each surrounded by a cube of grid points of size 128x128x128,
-indicated by a top level of 8.
-
-
-Next, consider the same size problem but running on 16 processors. The
-following "mg.input" file is required:
-
- 7 = top level
- 256 256 256 = nx ny nz
- 4 = nit
- 0 0 0 0 0 0 0 0 = debug_vec
-
-On each node a partition of size 128x128x64 will be created. A maximum
-of 7 multi-grid levels (64,32,16,8,4,2,1) may be used, resulting in each
-partions containing 4 coarsest multi-grid level points, each surrounded
-by a cube of grid points of size 64x64x64, indicated by a top level of 7.
-
-
-
-
-Note that non-cubic problem sizes may also be considered:
-
-The four lines below are the "mg.input" file appropriate for running a
-problem of total size 256x512x512, for 20 iterations and presumes the
-use of 32 processors (note: this is NOT a class C problem):
-
- 8 = top level
- 256 512 512 = nx ny nz
- 20 = nit
- 0 0 0 0 0 0 0 0 = debug_vec
-
-The first line of input indicates how many levels of multi-grid
-cycle will be applied to a particular subpartition. Presuming that
-32 processors are solving this problem, a 2x4x4 processor grid is
-formed, and thus each partition on a processor is of size 128x128x128.
-Therefore, a maximum of 8 multi-grid levels may be used. These are of
-size 128,64,32,16,8,4,2,1, with the coarsest level being a single
-point on a given processor.
-
+++ /dev/null
-c---------------------------------------------------------------------
-c Parameter lm (declared and set in "npbparams.h") is the log-base2 of
-c the edge size max for the partition on a given node, so must be changed
-c either to save space (if running a small case) or made bigger for larger
-c cases, for example, 512^3. Thus lm=7 means that the largest dimension
-c of a partition that can be solved on a node is 2^7 = 128. lm is set
-c automatically in npbparams.h
-c Parameters ndim1, ndim2, ndim3 are the local problem dimensions.
-c---------------------------------------------------------------------
-
- include 'npbparams.h'
-
- integer nm ! actual dimension including ghost cells for communications
- > , nv ! size of rhs array
- > , nr ! size of residual array
- > , nm2 ! size of communication buffer
- > , maxlevel! maximum number of levels
-
- parameter( nm=2+2**lm, nv=(2+2**ndim1)*(2+2**ndim2)*(2+2**ndim3) )
- parameter( nm2=2*nm*nm, maxlevel=(lt_default+1) )
- parameter( nr = (8*(nv+nm**2+5*nm+14*lt_default-7*lm))/7 )
- integer maxprocs
- parameter( maxprocs = 131072 ) ! this is the upper proc limit that
- ! the current "nr" parameter can handle
-c---------------------------------------------------------------------
- integer nbr(3,-1:1,maxlevel), msg_type(3,-1:1)
- integer msg_id(3,-1:1,2),nx(maxlevel),ny(maxlevel),nz(maxlevel)
- common /mg3/ nbr,msg_type,msg_id,nx,ny,nz
-
- character class
- common /ClassType/class
-
- integer debug_vec(0:7)
- common /my_debug/ debug_vec
-
- integer ir(maxlevel), m1(maxlevel), m2(maxlevel), m3(maxlevel)
- integer lt, lb
- common /fap/ ir,m1,m2,m3,lt,lb
-
- logical dead(maxlevel), give_ex(3,maxlevel), take_ex(3,maxlevel)
- common /comm_ex/ dead, give_ex, take_ex
-
-c---------------------------------------------------------------------
-c Set at m=1024, can handle cases up to 1024^3 case
-c---------------------------------------------------------------------
- integer m
-c parameter( m=1037 )
- parameter( m=nm+1 )
-
- double precision buff(nm2,4)
- common /buffer/ buff
-
-
-
-
+++ /dev/null
-!-------------------------------------------------------------------------!
-! !
-! N A S P A R A L L E L B E N C H M A R K S 3.3 !
-! !
-! M G !
-! !
-!-------------------------------------------------------------------------!
-! !
-! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
-! It is described in NAS Technical Reports 95-020 and 02-007 !
-! !
-! Permission to use, copy, distribute and modify this software !
-! for any purpose with or without fee is hereby granted. We !
-! request, however, that all derived work reference the NAS !
-! Parallel Benchmarks 3.3. This software is provided "as is" !
-! without express or implied warranty. !
-! !
-! Information on NPB 3.3, including the technical report, the !
-! original specifications, source code, results and information !
-! on how to submit new results, is available at: !
-! !
-! http://www.nas.nasa.gov/Software/NPB/ !
-! !
-! Send comments or suggestions to npb@nas.nasa.gov !
-! !
-! NAS Parallel Benchmarks Group !
-! NASA Ames Research Center !
-! Mail Stop: T27A-1 !
-! Moffett Field, CA 94035-1000 !
-! !
-! E-mail: npb@nas.nasa.gov !
-! Fax: (650) 604-3957 !
-! !
-!-------------------------------------------------------------------------!
-
-
-c---------------------------------------------------------------------
-c
-c Authors: E. Barszcz
-c P. Frederickson
-c A. Woo
-c M. Yarrow
-c R. F. Van der Wijngaart
-c
-c---------------------------------------------------------------------
-
-
-c---------------------------------------------------------------------
- program mg_mpi
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
-c---------------------------------------------------------------------------c
-c k is the current level. It is passed down through subroutine args
-c and is NOT global. it is the current iteration
-c---------------------------------------------------------------------------c
-
- integer k, it
-
- external timer_read
- double precision t, t0, tinit, mflops, timer_read
-
-c---------------------------------------------------------------------------c
-c These arrays are in common because they are quite large
-c and probably shouldn't be allocated on the stack. They
-c are always passed as subroutine args.
-c---------------------------------------------------------------------------c
-
- double precision u(nr),v(nv),r(nr),a(0:3),c(0:3)
- common /noautom/ u,v,r
-
- double precision rnm2, rnmu, old2, oldu, epsilon
- integer n1, n2, n3, nit
- double precision nn, verify_value, err
- logical verified
-
- integer ierr,i, fstatus
- integer T_bench, T_init
- parameter (T_bench=1, T_init=2)
-
- call mpi_init(ierr)
- call mpi_comm_rank(mpi_comm_world, me, ierr)
- call mpi_comm_size(mpi_comm_world, nprocs, ierr)
-
- root = 0
- if (nprocs_compiled .gt. maxprocs) then
- if (me .eq. root) write(*,20) nprocs_compiled, maxprocs
- 20 format(' ERROR: compiled for ',i8,' processes'//
- & ' The maximum size allowed for this benchmark is ',i6)
- call mpi_abort(MPI_COMM_WORLD, ierr)
- stop
- endif
-
- if (.not. convertdouble) then
- dp_type = MPI_DOUBLE_PRECISION
- else
- dp_type = MPI_REAL
- endif
-
-
- call timer_clear(T_bench)
- call timer_clear(T_init)
-
- call mpi_barrier(MPI_COMM_WORLD, ierr)
-
- call timer_start(T_init)
-
-
-c---------------------------------------------------------------------
-c Read in and broadcast input data
-c---------------------------------------------------------------------
-
- if( me .eq. root )then
- write (*, 1000)
-
- open(unit=7,file="mg.input", status="old", iostat=fstatus)
- if (fstatus .eq. 0) then
- write(*,50)
- 50 format(' Reading from input file mg.input')
- read(7,*) lt
- read(7,*) nx(lt), ny(lt), nz(lt)
- read(7,*) nit
- read(7,*) (debug_vec(i),i=0,7)
- else
- write(*,51)
- 51 format(' No input file. Using compiled defaults ')
- lt = lt_default
- nit = nit_default
- nx(lt) = nx_default
- ny(lt) = ny_default
- nz(lt) = nz_default
- do i = 0,7
- debug_vec(i) = debug_default
- end do
- endif
- endif
-
- call mpi_bcast(lt, 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
- call mpi_bcast(nit, 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
- call mpi_bcast(nx(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
- call mpi_bcast(ny(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
- call mpi_bcast(nz(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr)
- call mpi_bcast(debug_vec(0), 8, MPI_INTEGER, 0,
- > mpi_comm_world, ierr)
-
- if ( (nx(lt) .ne. ny(lt)) .or. (nx(lt) .ne. nz(lt)) ) then
- Class = 'U'
- else if( nx(lt) .eq. 32 .and. nit .eq. 4 ) then
- Class = 'S'
- else if( nx(lt) .eq. 128 .and. nit .eq. 4 ) then
- Class = 'W'
- else if( nx(lt) .eq. 256 .and. nit .eq. 4 ) then
- Class = 'A'
- else if( nx(lt) .eq. 256 .and. nit .eq. 20 ) then
- Class = 'B'
- else if( nx(lt) .eq. 512 .and. nit .eq. 20 ) then
- Class = 'C'
- else if( nx(lt) .eq. 1024 .and. nit .eq. 50 ) then
- Class = 'D'
- else if( nx(lt) .eq. 2048 .and. nit .eq. 50 ) then
- Class = 'E'
- else
- Class = 'U'
- endif
-
-c---------------------------------------------------------------------
-c Use these for debug info:
-c---------------------------------------------------------------------
-c debug_vec(0) = 1 !=> report all norms
-c debug_vec(1) = 1 !=> some setup information
-c debug_vec(1) = 2 !=> more setup information
-c debug_vec(2) = k => at level k or below, show result of resid
-c debug_vec(3) = k => at level k or below, show result of psinv
-c debug_vec(4) = k => at level k or below, show result of rprj
-c debug_vec(5) = k => at level k or below, show result of interp
-c debug_vec(6) = 1 => (unused)
-c debug_vec(7) = 1 => (unused)
-c---------------------------------------------------------------------
- a(0) = -8.0D0/3.0D0
- a(1) = 0.0D0
- a(2) = 1.0D0/6.0D0
- a(3) = 1.0D0/12.0D0
-
- if(Class .eq. 'A' .or. Class .eq. 'S'.or. Class .eq.'W') then
-c---------------------------------------------------------------------
-c Coefficients for the S(a) smoother
-c---------------------------------------------------------------------
- c(0) = -3.0D0/8.0D0
- c(1) = +1.0D0/32.0D0
- c(2) = -1.0D0/64.0D0
- c(3) = 0.0D0
- else
-c---------------------------------------------------------------------
-c Coefficients for the S(b) smoother
-c---------------------------------------------------------------------
- c(0) = -3.0D0/17.0D0
- c(1) = +1.0D0/33.0D0
- c(2) = -1.0D0/61.0D0
- c(3) = 0.0D0
- endif
- lb = 1
- k = lt
-
- call setup(n1,n2,n3,k)
- call zero3(u,n1,n2,n3)
- call zran3(v,n1,n2,n3,nx(lt),ny(lt),k)
-
- call norm2u3(v,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
-
- if( me .eq. root )then
- write (*, 1001) nx(lt),ny(lt),nz(lt), Class
- write (*, 1002) nit
-
- 1000 format(//,' NAS Parallel Benchmarks 3.3 -- MG Benchmark', /)
- 1001 format(' Size: ', i4, 'x', i4, 'x', i4, ' (class ', A, ')' )
- 1002 format(' Iterations: ', i4)
- 1003 format(' Number of processes: ', i6)
- if (nprocs .ne. nprocs_compiled) then
- write (*, 1004) nprocs_compiled
- write (*, 1005) nprocs
- 1004 format(' WARNING: compiled for ', i6, ' processes ')
- 1005 format(' Number of active processes: ', i6, /)
- else
- write (*, 1003) nprocs
- endif
- endif
-
- call resid(u,v,r,n1,n2,n3,a,k)
- call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
- old2 = rnm2
- oldu = rnmu
-
-c---------------------------------------------------------------------
-c One iteration for startup
-c---------------------------------------------------------------------
- call mg3P(u,v,r,a,c,n1,n2,n3,k)
- call resid(u,v,r,n1,n2,n3,a,k)
- call setup(n1,n2,n3,k)
- call zero3(u,n1,n2,n3)
- call zran3(v,n1,n2,n3,nx(lt),ny(lt),k)
-
- call timer_stop(T_init)
- if( me .eq. root )then
- tinit = timer_read(T_init)
- write( *,'(/A,F15.3,A/)' )
- > ' Initialization time: ',tinit, ' seconds'
- endif
-
- call mpi_barrier(mpi_comm_world,ierr)
-
- call timer_start(T_bench)
-
- call resid(u,v,r,n1,n2,n3,a,k)
- call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
- old2 = rnm2
- oldu = rnmu
-
- do it=1,nit
- if (it.eq.1 .or. it.eq.nit .or. mod(it,5).eq.0) then
- if (me .eq. root) write(*,80) it
- 80 format(' iter ',i4)
- endif
- call mg3P(u,v,r,a,c,n1,n2,n3,k)
- call resid(u,v,r,n1,n2,n3,a,k)
- enddo
-
-
- call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt))
-
- call timer_stop(T_bench)
-
- t0 = timer_read(T_bench)
-
- call mpi_reduce(t0,t,1,dp_type,
- > mpi_max,root,mpi_comm_world,ierr)
- verified = .FALSE.
- verify_value = 0.0
- if( me .eq. root )then
- write(*,100)
- 100 format(/' Benchmark completed ')
-
- epsilon = 1.d-8
- if (Class .ne. 'U') then
- if(Class.eq.'S') then
- verify_value = 0.5307707005734d-04
- elseif(Class.eq.'W') then
- verify_value = 0.6467329375339d-05
- elseif(Class.eq.'A') then
- verify_value = 0.2433365309069d-05
- elseif(Class.eq.'B') then
- verify_value = 0.1800564401355d-05
- elseif(Class.eq.'C') then
- verify_value = 0.5706732285740d-06
- elseif(Class.eq.'D') then
- verify_value = 0.1583275060440d-09
- elseif(Class.eq.'E') then
- verify_value = 0.5630442584711d-10
- endif
-
- err = abs( rnm2 - verify_value ) / verify_value
- if( err .le. epsilon ) then
- verified = .TRUE.
- write(*, 200)
- write(*, 201) rnm2
- write(*, 202) err
- 200 format(' VERIFICATION SUCCESSFUL ')
- 201 format(' L2 Norm is ', E20.13)
- 202 format(' Error is ', E20.13)
- else
- verified = .FALSE.
- write(*, 300)
- write(*, 301) rnm2
- write(*, 302) verify_value
- 300 format(' VERIFICATION FAILED')
- 301 format(' L2 Norm is ', E20.13)
- 302 format(' The correct L2 Norm is ', E20.13)
- endif
- else
- verified = .FALSE.
- write (*, 400)
- write (*, 401)
- write (*, 201) rnm2
- 400 format(' Problem size unknown')
- 401 format(' NO VERIFICATION PERFORMED')
- endif
-
- nn = 1.0d0*nx(lt)*ny(lt)*nz(lt)
-
- if( t .ne. 0. ) then
- mflops = 58.*1.0D-6*nit*nn / t
- else
- mflops = 0.0
- endif
-
- call print_results('MG', class, nx(lt), ny(lt), nz(lt),
- > nit, nprocs_compiled, nprocs, t,
- > mflops, ' floating point',
- > verified, npbversion, compiletime,
- > cs1, cs2, cs3, cs4, cs5, cs6, cs7)
-
-
- endif
-
-
- call mpi_finalize(ierr)
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine setup(n1,n2,n3,k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer is1, is2, is3, ie1, ie2, ie3
- common /grid/ is1,is2,is3,ie1,ie2,ie3
-
- integer n1,n2,n3,k
- integer dx, dy, log_p, d, i, j
-
- integer ax, next(3),mi(3,maxlevel),mip(3,maxlevel)
- integer ng(3,maxlevel)
- integer idi(3), pi(3), idin(3,-1:1)
- integer s, dir,ierr
-
- do j=-1,1,1
- do d=1,3
- msg_type(d,j) = 100*(j+2+10*d)
- enddo
- enddo
-
- ng(1,lt) = nx(lt)
- ng(2,lt) = ny(lt)
- ng(3,lt) = nz(lt)
- do ax=1,3
- next(ax) = 1
- do k=lt-1,1,-1
- ng(ax,k) = ng(ax,k+1)/2
- enddo
- enddo
- 61 format(10i4)
- do k=lt,1,-1
- nx(k) = ng(1,k)
- ny(k) = ng(2,k)
- nz(k) = ng(3,k)
- enddo
-
- log_p = log(float(nprocs)+0.0001)/log(2.0)
- dx = log_p/3
- pi(1) = 2**dx
- idi(1) = mod(me,pi(1))
-
- dy = (log_p-dx)/2
- pi(2) = 2**dy
- idi(2) = mod((me/pi(1)),pi(2))
-
- pi(3) = nprocs/(pi(1)*pi(2))
- idi(3) = me/(pi(1)*pi(2))
-
- do k = lt,1,-1
- dead(k) = .false.
- do ax = 1,3
- take_ex(ax,k) = .false.
- give_ex(ax,k) = .false.
-
- mi(ax,k) = 2 +
- > ((idi(ax)+1)*ng(ax,k))/pi(ax) -
- > ((idi(ax)+0)*ng(ax,k))/pi(ax)
- mip(ax,k) = 2 +
- > ((next(ax)+idi(ax)+1)*ng(ax,k))/pi(ax) -
- > ((next(ax)+idi(ax)+0)*ng(ax,k))/pi(ax)
-
- if(mip(ax,k).eq.2.or.mi(ax,k).eq.2)then
- next(ax) = 2*next(ax)
- endif
-
- if( k+1 .le. lt )then
- if((mip(ax,k).eq.2).and.(mi(ax,k).eq.3))then
- give_ex(ax,k+1) = .true.
- endif
- if((mip(ax,k).eq.3).and.(mi(ax,k).eq.2))then
- take_ex(ax,k+1) = .true.
- endif
- endif
- enddo
-
- if( mi(1,k).eq.2 .or.
- > mi(2,k).eq.2 .or.
- > mi(3,k).eq.2 )then
- dead(k) = .true.
- endif
- m1(k) = mi(1,k)
- m2(k) = mi(2,k)
- m3(k) = mi(3,k)
-
- do ax=1,3
- idin(ax,+1) = mod( idi(ax) + next(ax) + pi(ax) , pi(ax) )
- idin(ax,-1) = mod( idi(ax) - next(ax) + pi(ax) , pi(ax) )
- enddo
- do dir = 1,-1,-2
- nbr(1,dir,k) = idin(1,dir) + pi(1)
- > *(idi(2) + pi(2)
- > * idi(3))
- nbr(2,dir,k) = idi(1) + pi(1)
- > *(idin(2,dir) + pi(2)
- > * idi(3))
- nbr(3,dir,k) = idi(1) + pi(1)
- > *(idi(2) + pi(2)
- > * idin(3,dir))
- enddo
- enddo
-
- k = lt
- is1 = 2 + ng(1,k) - ((pi(1) -idi(1))*ng(1,lt))/pi(1)
- ie1 = 1 + ng(1,k) - ((pi(1)-1-idi(1))*ng(1,lt))/pi(1)
- n1 = 3 + ie1 - is1
- is2 = 2 + ng(2,k) - ((pi(2) -idi(2))*ng(2,lt))/pi(2)
- ie2 = 1 + ng(2,k) - ((pi(2)-1-idi(2))*ng(2,lt))/pi(2)
- n2 = 3 + ie2 - is2
- is3 = 2 + ng(3,k) - ((pi(3) -idi(3))*ng(3,lt))/pi(3)
- ie3 = 1 + ng(3,k) - ((pi(3)-1-idi(3))*ng(3,lt))/pi(3)
- n3 = 3 + ie3 - is3
-
-
- ir(lt)=1
- do j = lt-1, 1, -1
- ir(j)=ir(j+1)+m1(j+1)*m2(j+1)*m3(j+1)
- enddo
-
-
- if( debug_vec(1) .ge. 1 )then
- if( me .eq. root )write(*,*)' in setup, '
- if( me .eq. root )write(*,*)' me k lt nx ny nz ',
- > ' n1 n2 n3 is1 is2 is3 ie1 ie2 ie3'
- do i=0,nprocs-1
- if( me .eq. i )then
- write(*,9) me,k,lt,ng(1,k),ng(2,k),ng(3,k),
- > n1,n2,n3,is1,is2,is3,ie1,ie2,ie3
- 9 format(15i4)
- endif
- call mpi_barrier(mpi_comm_world,ierr)
- enddo
- endif
- if( debug_vec(1) .ge. 2 )then
- do i=0,nprocs-1
- if( me .eq. i )then
- write(*,*)' '
- write(*,*)' processor =',me
- do k=lt,1,-1
- write(*,7)k,idi(1),idi(2),idi(3),
- > ((nbr(d,j,k),j=-1,1,2),d=1,3),
- > (mi(d,k),d=1,3)
- enddo
- 7 format(i4,'idi=',3i4,'nbr=',3(2i4,' '),'mi=',3i4,' ')
- write(*,*)'idi(s) = ',(idi(s),s=1,3)
- write(*,*)'dead(2), dead(1) = ',dead(2),dead(1)
- do ax=1,3
- write(*,*)'give_ex(ax,2)= ',give_ex(ax,2)
- write(*,*)'take_ex(ax,2)= ',take_ex(ax,2)
- enddo
- endif
- call mpi_barrier(mpi_comm_world,ierr)
- enddo
- endif
-
- k = lt
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine mg3P(u,v,r,a,c,n1,n2,n3,k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c multigrid V-cycle routine
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer n1, n2, n3, k
- double precision u(nr),v(nv),r(nr)
- double precision a(0:3),c(0:3)
-
- integer j
-
-c---------------------------------------------------------------------
-c down cycle.
-c restrict the residual from the find grid to the coarse
-c---------------------------------------------------------------------
-
- do k= lt, lb+1 , -1
- j = k-1
- call rprj3(r(ir(k)),m1(k),m2(k),m3(k),
- > r(ir(j)),m1(j),m2(j),m3(j),k)
- enddo
-
- k = lb
-c---------------------------------------------------------------------
-c compute an approximate solution on the coarsest grid
-c---------------------------------------------------------------------
- call zero3(u(ir(k)),m1(k),m2(k),m3(k))
- call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k)
-
- do k = lb+1, lt-1
- j = k-1
-c---------------------------------------------------------------------
-c prolongate from level k-1 to k
-c---------------------------------------------------------------------
- call zero3(u(ir(k)),m1(k),m2(k),m3(k))
- call interp(u(ir(j)),m1(j),m2(j),m3(j),
- > u(ir(k)),m1(k),m2(k),m3(k),k)
-c---------------------------------------------------------------------
-c compute residual for level k
-c---------------------------------------------------------------------
- call resid(u(ir(k)),r(ir(k)),r(ir(k)),m1(k),m2(k),m3(k),a,k)
-c---------------------------------------------------------------------
-c apply smoother
-c---------------------------------------------------------------------
- call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k)
- enddo
- 200 continue
- j = lt - 1
- k = lt
- call interp(u(ir(j)),m1(j),m2(j),m3(j),u,n1,n2,n3,k)
- call resid(u,v,r,n1,n2,n3,a,k)
- call psinv(r,u,n1,n2,n3,c,k)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine psinv( r,u,n1,n2,n3,c,k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c psinv applies an approximate inverse as smoother: u = u + Cr
-c
-c This implementation costs 15A + 4M per result, where
-c A and M denote the costs of Addition and Multiplication.
-c Presuming coefficient c(3) is zero (the NPB assumes this,
-c but it is thus not a general case), 2A + 1M may be eliminated,
-c resulting in 13A + 3M.
-c Note that this vectorizes, and is also fine for cache
-c based machines.
-c---------------------------------------------------------------------
- implicit none
-
- include 'globals.h'
-
- integer n1,n2,n3,k
- double precision u(n1,n2,n3),r(n1,n2,n3),c(0:3)
- integer i3, i2, i1
-
- double precision r1(m), r2(m)
-
- do i3=2,n3-1
- do i2=2,n2-1
- do i1=1,n1
- r1(i1) = r(i1,i2-1,i3) + r(i1,i2+1,i3)
- > + r(i1,i2,i3-1) + r(i1,i2,i3+1)
- r2(i1) = r(i1,i2-1,i3-1) + r(i1,i2+1,i3-1)
- > + r(i1,i2-1,i3+1) + r(i1,i2+1,i3+1)
- enddo
- do i1=2,n1-1
- u(i1,i2,i3) = u(i1,i2,i3)
- > + c(0) * r(i1,i2,i3)
- > + c(1) * ( r(i1-1,i2,i3) + r(i1+1,i2,i3)
- > + r1(i1) )
- > + c(2) * ( r2(i1) + r1(i1-1) + r1(i1+1) )
-c---------------------------------------------------------------------
-c Assume c(3) = 0 (Enable line below if c(3) not= 0)
-c---------------------------------------------------------------------
-c > + c(3) * ( r2(i1-1) + r2(i1+1) )
-c---------------------------------------------------------------------
- enddo
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c exchange boundary points
-c---------------------------------------------------------------------
- call comm3(u,n1,n2,n3,k)
-
- if( debug_vec(0) .ge. 1 )then
- call rep_nrm(u,n1,n2,n3,' psinv',k)
- endif
-
- if( debug_vec(3) .ge. k )then
- call showall(u,n1,n2,n3)
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine resid( u,v,r,n1,n2,n3,a,k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c resid computes the residual: r = v - Au
-c
-c This implementation costs 15A + 4M per result, where
-c A and M denote the costs of Addition (or Subtraction) and
-c Multiplication, respectively.
-c Presuming coefficient a(1) is zero (the NPB assumes this,
-c but it is thus not a general case), 3A + 1M may be eliminated,
-c resulting in 12A + 3M.
-c Note that this vectorizes, and is also fine for cache
-c based machines.
-c---------------------------------------------------------------------
- implicit none
-
- include 'globals.h'
-
- integer n1,n2,n3,k
- double precision u(n1,n2,n3),v(n1,n2,n3),r(n1,n2,n3),a(0:3)
- integer i3, i2, i1
- double precision u1(m), u2(m)
-
- do i3=2,n3-1
- do i2=2,n2-1
- do i1=1,n1
- u1(i1) = u(i1,i2-1,i3) + u(i1,i2+1,i3)
- > + u(i1,i2,i3-1) + u(i1,i2,i3+1)
- u2(i1) = u(i1,i2-1,i3-1) + u(i1,i2+1,i3-1)
- > + u(i1,i2-1,i3+1) + u(i1,i2+1,i3+1)
- enddo
- do i1=2,n1-1
- r(i1,i2,i3) = v(i1,i2,i3)
- > - a(0) * u(i1,i2,i3)
-c---------------------------------------------------------------------
-c Assume a(1) = 0 (Enable 2 lines below if a(1) not= 0)
-c---------------------------------------------------------------------
-c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3)
-c > + u1(i1) )
-c---------------------------------------------------------------------
- > - a(2) * ( u2(i1) + u1(i1-1) + u1(i1+1) )
- > - a(3) * ( u2(i1-1) + u2(i1+1) )
- enddo
- enddo
- enddo
-
-c---------------------------------------------------------------------
-c exchange boundary data
-c---------------------------------------------------------------------
- call comm3(r,n1,n2,n3,k)
-
- if( debug_vec(0) .ge. 1 )then
- call rep_nrm(r,n1,n2,n3,' resid',k)
- endif
-
- if( debug_vec(2) .ge. k )then
- call showall(r,n1,n2,n3)
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine rprj3( r,m1k,m2k,m3k,s,m1j,m2j,m3j,k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c rprj3 projects onto the next coarser grid,
-c using a trilinear Finite Element projection: s = r' = P r
-c
-c This implementation costs 20A + 4M per result, where
-c A and M denote the costs of Addition and Multiplication.
-c Note that this vectorizes, and is also fine for cache
-c based machines.
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer m1k, m2k, m3k, m1j, m2j, m3j,k
- double precision r(m1k,m2k,m3k), s(m1j,m2j,m3j)
- integer j3, j2, j1, i3, i2, i1, d1, d2, d3, j
-
- double precision x1(m), y1(m), x2,y2
-
-
- if(m1k.eq.3)then
- d1 = 2
- else
- d1 = 1
- endif
-
- if(m2k.eq.3)then
- d2 = 2
- else
- d2 = 1
- endif
-
- if(m3k.eq.3)then
- d3 = 2
- else
- d3 = 1
- endif
-
- do j3=2,m3j-1
- i3 = 2*j3-d3
-C i3 = 2*j3-1
- do j2=2,m2j-1
- i2 = 2*j2-d2
-C i2 = 2*j2-1
-
- do j1=2,m1j
- i1 = 2*j1-d1
-C i1 = 2*j1-1
- x1(i1-1) = r(i1-1,i2-1,i3 ) + r(i1-1,i2+1,i3 )
- > + r(i1-1,i2, i3-1) + r(i1-1,i2, i3+1)
- y1(i1-1) = r(i1-1,i2-1,i3-1) + r(i1-1,i2-1,i3+1)
- > + r(i1-1,i2+1,i3-1) + r(i1-1,i2+1,i3+1)
- enddo
-
- do j1=2,m1j-1
- i1 = 2*j1-d1
-C i1 = 2*j1-1
- y2 = r(i1, i2-1,i3-1) + r(i1, i2-1,i3+1)
- > + r(i1, i2+1,i3-1) + r(i1, i2+1,i3+1)
- x2 = r(i1, i2-1,i3 ) + r(i1, i2+1,i3 )
- > + r(i1, i2, i3-1) + r(i1, i2, i3+1)
- s(j1,j2,j3) =
- > 0.5D0 * r(i1,i2,i3)
- > + 0.25D0 * ( r(i1-1,i2,i3) + r(i1+1,i2,i3) + x2)
- > + 0.125D0 * ( x1(i1-1) + x1(i1+1) + y2)
- > + 0.0625D0 * ( y1(i1-1) + y1(i1+1) )
- enddo
-
- enddo
- enddo
-
-
- j = k-1
- call comm3(s,m1j,m2j,m3j,j)
-
- if( debug_vec(0) .ge. 1 )then
- call rep_nrm(s,m1j,m2j,m3j,' rprj3',k-1)
- endif
-
- if( debug_vec(4) .ge. k )then
- call showall(s,m1j,m2j,m3j)
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine interp( z,mm1,mm2,mm3,u,n1,n2,n3,k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c interp adds the trilinear interpolation of the correction
-c from the coarser grid to the current approximation: u = u + Qu'
-c
-c Observe that this implementation costs 16A + 4M, where
-c A and M denote the costs of Addition and Multiplication.
-c Note that this vectorizes, and is also fine for cache
-c based machines. Vector machines may get slightly better
-c performance however, with 8 separate "do i1" loops, rather than 4.
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer mm1, mm2, mm3, n1, n2, n3,k
- double precision z(mm1,mm2,mm3),u(n1,n2,n3)
- integer i3, i2, i1, d1, d2, d3, t1, t2, t3
-
-c note that m = 1037 in globals.h but for this only need to be
-c 535 to handle up to 1024^3
-c integer m
-c parameter( m=535 )
- double precision z1(m),z2(m),z3(m)
-
-
- if( n1 .ne. 3 .and. n2 .ne. 3 .and. n3 .ne. 3 ) then
-
- do i3=1,mm3-1
- do i2=1,mm2-1
-
- do i1=1,mm1
- z1(i1) = z(i1,i2+1,i3) + z(i1,i2,i3)
- z2(i1) = z(i1,i2,i3+1) + z(i1,i2,i3)
- z3(i1) = z(i1,i2+1,i3+1) + z(i1,i2,i3+1) + z1(i1)
- enddo
-
- do i1=1,mm1-1
- u(2*i1-1,2*i2-1,2*i3-1)=u(2*i1-1,2*i2-1,2*i3-1)
- > +z(i1,i2,i3)
- u(2*i1,2*i2-1,2*i3-1)=u(2*i1,2*i2-1,2*i3-1)
- > +0.5d0*(z(i1+1,i2,i3)+z(i1,i2,i3))
- enddo
- do i1=1,mm1-1
- u(2*i1-1,2*i2,2*i3-1)=u(2*i1-1,2*i2,2*i3-1)
- > +0.5d0 * z1(i1)
- u(2*i1,2*i2,2*i3-1)=u(2*i1,2*i2,2*i3-1)
- > +0.25d0*( z1(i1) + z1(i1+1) )
- enddo
- do i1=1,mm1-1
- u(2*i1-1,2*i2-1,2*i3)=u(2*i1-1,2*i2-1,2*i3)
- > +0.5d0 * z2(i1)
- u(2*i1,2*i2-1,2*i3)=u(2*i1,2*i2-1,2*i3)
- > +0.25d0*( z2(i1) + z2(i1+1) )
- enddo
- do i1=1,mm1-1
- u(2*i1-1,2*i2,2*i3)=u(2*i1-1,2*i2,2*i3)
- > +0.25d0* z3(i1)
- u(2*i1,2*i2,2*i3)=u(2*i1,2*i2,2*i3)
- > +0.125d0*( z3(i1) + z3(i1+1) )
- enddo
- enddo
- enddo
-
- else
-
- if(n1.eq.3)then
- d1 = 2
- t1 = 1
- else
- d1 = 1
- t1 = 0
- endif
-
- if(n2.eq.3)then
- d2 = 2
- t2 = 1
- else
- d2 = 1
- t2 = 0
- endif
-
- if(n3.eq.3)then
- d3 = 2
- t3 = 1
- else
- d3 = 1
- t3 = 0
- endif
-
- do i3=d3,mm3-1
- do i2=d2,mm2-1
- do i1=d1,mm1-1
- u(2*i1-d1,2*i2-d2,2*i3-d3)=u(2*i1-d1,2*i2-d2,2*i3-d3)
- > +z(i1,i2,i3)
- enddo
- do i1=1,mm1-1
- u(2*i1-t1,2*i2-d2,2*i3-d3)=u(2*i1-t1,2*i2-d2,2*i3-d3)
- > +0.5D0*(z(i1+1,i2,i3)+z(i1,i2,i3))
- enddo
- enddo
- do i2=1,mm2-1
- do i1=d1,mm1-1
- u(2*i1-d1,2*i2-t2,2*i3-d3)=u(2*i1-d1,2*i2-t2,2*i3-d3)
- > +0.5D0*(z(i1,i2+1,i3)+z(i1,i2,i3))
- enddo
- do i1=1,mm1-1
- u(2*i1-t1,2*i2-t2,2*i3-d3)=u(2*i1-t1,2*i2-t2,2*i3-d3)
- > +0.25D0*(z(i1+1,i2+1,i3)+z(i1+1,i2,i3)
- > +z(i1, i2+1,i3)+z(i1, i2,i3))
- enddo
- enddo
- enddo
-
- do i3=1,mm3-1
- do i2=d2,mm2-1
- do i1=d1,mm1-1
- u(2*i1-d1,2*i2-d2,2*i3-t3)=u(2*i1-d1,2*i2-d2,2*i3-t3)
- > +0.5D0*(z(i1,i2,i3+1)+z(i1,i2,i3))
- enddo
- do i1=1,mm1-1
- u(2*i1-t1,2*i2-d2,2*i3-t3)=u(2*i1-t1,2*i2-d2,2*i3-t3)
- > +0.25D0*(z(i1+1,i2,i3+1)+z(i1,i2,i3+1)
- > +z(i1+1,i2,i3 )+z(i1,i2,i3 ))
- enddo
- enddo
- do i2=1,mm2-1
- do i1=d1,mm1-1
- u(2*i1-d1,2*i2-t2,2*i3-t3)=u(2*i1-d1,2*i2-t2,2*i3-t3)
- > +0.25D0*(z(i1,i2+1,i3+1)+z(i1,i2,i3+1)
- > +z(i1,i2+1,i3 )+z(i1,i2,i3 ))
- enddo
- do i1=1,mm1-1
- u(2*i1-t1,2*i2-t2,2*i3-t3)=u(2*i1-t1,2*i2-t2,2*i3-t3)
- > +0.125D0*(z(i1+1,i2+1,i3+1)+z(i1+1,i2,i3+1)
- > +z(i1 ,i2+1,i3+1)+z(i1 ,i2,i3+1)
- > +z(i1+1,i2+1,i3 )+z(i1+1,i2,i3 )
- > +z(i1 ,i2+1,i3 )+z(i1 ,i2,i3 ))
- enddo
- enddo
- enddo
-
- endif
-
- call comm3_ex(u,n1,n2,n3,k)
-
- if( debug_vec(0) .ge. 1 )then
- call rep_nrm(z,mm1,mm2,mm3,'z: inter',k-1)
- call rep_nrm(u,n1,n2,n3,'u: inter',k)
- endif
-
- if( debug_vec(5) .ge. k )then
- call showall(z,mm1,mm2,mm3)
- call showall(u,n1,n2,n3)
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine norm2u3(r,n1,n2,n3,rnm2,rnmu,nx,ny,nz)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c norm2u3 evaluates approximations to the L2 norm and the
-c uniform (or L-infinity or Chebyshev) norm, under the
-c assumption that the boundaries are periodic or zero. Add the
-c boundaries in with half weight (quarter weight on the edges
-c and eighth weight at the corners) for inhomogeneous boundaries.
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
-
- integer n1, n2, n3, nx, ny, nz
- double precision rnm2, rnmu, r(n1,n2,n3)
- double precision s, a, ss
- integer i3, i2, i1, ierr
-
- double precision dn
-
- dn = 1.0d0*nx*ny*nz
-
- s=0.0D0
- rnmu = 0.0D0
- do i3=2,n3-1
- do i2=2,n2-1
- do i1=2,n1-1
- s=s+r(i1,i2,i3)**2
- a=abs(r(i1,i2,i3))
- if(a.gt.rnmu)rnmu=a
- enddo
- enddo
- enddo
-
- call mpi_allreduce(rnmu,ss,1,dp_type,
- > mpi_max,mpi_comm_world,ierr)
- rnmu = ss
- call mpi_allreduce(s, ss, 1, dp_type,
- > mpi_sum,mpi_comm_world,ierr)
- s = ss
- rnm2=sqrt( s / dn )
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine rep_nrm(u,n1,n2,n3,title,kk)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c report on norm
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer n1, n2, n3, kk
- double precision u(n1,n2,n3)
- character*8 title
-
- double precision rnm2, rnmu
-
-
- call norm2u3(u,n1,n2,n3,rnm2,rnmu,nx(kk),ny(kk),nz(kk))
- if( me .eq. root )then
- write(*,7)kk,title,rnm2,rnmu
- 7 format(' Level',i2,' in ',a8,': norms =',D21.14,D21.14)
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine comm3(u,n1,n2,n3,kk)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c comm3 organizes the communication on all borders
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer n1, n2, n3, kk
- double precision u(n1,n2,n3)
- integer axis
-
- if( .not. dead(kk) )then
- do axis = 1, 3
- if( nprocs .ne. 1) then
-
- call ready( axis, -1, kk )
- call ready( axis, +1, kk )
-
- call give3( axis, +1, u, n1, n2, n3, kk )
- call give3( axis, -1, u, n1, n2, n3, kk )
-
- call take3( axis, -1, u, n1, n2, n3 )
- call take3( axis, +1, u, n1, n2, n3 )
-
- else
- call comm1p( axis, u, n1, n2, n3, kk )
- endif
- enddo
- else
- call zero3(u,n1,n2,n3)
- endif
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine comm3_ex(u,n1,n2,n3,kk)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c comm3_ex communicates to expand the number of processors
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer n1, n2, n3, kk
- double precision u(n1,n2,n3)
- integer axis
-
- do axis = 1, 3
- if( nprocs .ne. 1 ) then
- if( take_ex( axis, kk ) )then
- call ready( axis, -1, kk )
- call ready( axis, +1, kk )
- call take3_ex( axis, -1, u, n1, n2, n3 )
- call take3_ex( axis, +1, u, n1, n2, n3 )
- endif
-
- if( give_ex( axis, kk ) )then
- call give3_ex( axis, +1, u, n1, n2, n3, kk )
- call give3_ex( axis, -1, u, n1, n2, n3, kk )
- endif
- else
- call comm1p_ex( axis, u, n1, n2, n3, kk )
- endif
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine ready( axis, dir, k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c ready allocates a buffer to take in a message
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer axis, dir, k
- integer buff_id,buff_len,i,ierr
-
- buff_id = 3 + dir
- buff_len = nm2
-
- do i=1,nm2
- buff(i,buff_id) = 0.0D0
- enddo
-
-
-c---------------------------------------------------------------------
-c fake message request type
-c---------------------------------------------------------------------
- msg_id(axis,dir,1) = msg_type(axis,dir) +1000*me
-
- call mpi_irecv( buff(1,buff_id), buff_len,
- > dp_type, nbr(axis,-dir,k), msg_type(axis,dir),
- > mpi_comm_world, msg_id(axis,dir,1), ierr)
- return
- end
-
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine give3( axis, dir, u, n1, n2, n3, k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c give3 sends border data out in the requested direction
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer axis, dir, n1, n2, n3, k, ierr
- double precision u( n1, n2, n3 )
-
- integer i3, i2, i1, buff_len,buff_id
-
- buff_id = 2 + dir
- buff_len = 0
-
- if( axis .eq. 1 )then
- if( dir .eq. -1 )then
-
- do i3=2,n3-1
- do i2=2,n2-1
- buff_len = buff_len + 1
- buff(buff_len,buff_id ) = u( 2, i2,i3)
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- else if( dir .eq. +1 ) then
-
- do i3=2,n3-1
- do i2=2,n2-1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( n1-1, i2,i3)
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- endif
- endif
-
- if( axis .eq. 2 )then
- if( dir .eq. -1 )then
-
- do i3=2,n3-1
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1, 2,i3)
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- else if( dir .eq. +1 ) then
-
- do i3=2,n3-1
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id )= u( i1,n2-1,i3)
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- endif
- endif
-
- if( axis .eq. 3 )then
- if( dir .eq. -1 )then
-
- do i2=1,n2
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1,i2,2)
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- else if( dir .eq. +1 ) then
-
- do i2=1,n2
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1,i2,n3-1)
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- endif
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine take3( axis, dir, u, n1, n2, n3 )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c take3 copies in border data from the requested direction
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer axis, dir, n1, n2, n3
- double precision u( n1, n2, n3 )
-
- integer buff_id, indx
-
- integer status(mpi_status_size), ierr
-
- integer i3, i2, i1
-
- call mpi_wait( msg_id( axis, dir, 1 ),status,ierr)
- buff_id = 3 + dir
- indx = 0
-
- if( axis .eq. 1 )then
- if( dir .eq. -1 )then
-
- do i3=2,n3-1
- do i2=2,n2-1
- indx = indx + 1
- u(n1,i2,i3) = buff(indx, buff_id )
- enddo
- enddo
-
- else if( dir .eq. +1 ) then
-
- do i3=2,n3-1
- do i2=2,n2-1
- indx = indx + 1
- u(1,i2,i3) = buff(indx, buff_id )
- enddo
- enddo
-
- endif
- endif
-
- if( axis .eq. 2 )then
- if( dir .eq. -1 )then
-
- do i3=2,n3-1
- do i1=1,n1
- indx = indx + 1
- u(i1,n2,i3) = buff(indx, buff_id )
- enddo
- enddo
-
- else if( dir .eq. +1 ) then
-
- do i3=2,n3-1
- do i1=1,n1
- indx = indx + 1
- u(i1,1,i3) = buff(indx, buff_id )
- enddo
- enddo
-
- endif
- endif
-
- if( axis .eq. 3 )then
- if( dir .eq. -1 )then
-
- do i2=1,n2
- do i1=1,n1
- indx = indx + 1
- u(i1,i2,n3) = buff(indx, buff_id )
- enddo
- enddo
-
- else if( dir .eq. +1 ) then
-
- do i2=1,n2
- do i1=1,n1
- indx = indx + 1
- u(i1,i2,1) = buff(indx, buff_id )
- enddo
- enddo
-
- endif
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine give3_ex( axis, dir, u, n1, n2, n3, k )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c give3_ex sends border data out to expand number of processors
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer axis, dir, n1, n2, n3, k, ierr
- double precision u( n1, n2, n3 )
-
- integer i3, i2, i1, buff_len, buff_id
-
- buff_id = 2 + dir
- buff_len = 0
-
- if( axis .eq. 1 )then
- if( dir .eq. -1 )then
-
- do i3=1,n3
- do i2=1,n2
- buff_len = buff_len + 1
- buff(buff_len,buff_id ) = u( 2, i2,i3)
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- else if( dir .eq. +1 ) then
-
- do i3=1,n3
- do i2=1,n2
- do i1=n1-1,n1
- buff_len = buff_len + 1
- buff(buff_len,buff_id)= u(i1,i2,i3)
- enddo
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- endif
- endif
-
- if( axis .eq. 2 )then
- if( dir .eq. -1 )then
-
- do i3=1,n3
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1, 2,i3)
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- else if( dir .eq. +1 ) then
-
- do i3=1,n3
- do i2=n2-1,n2
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len,buff_id )= u(i1,i2,i3)
- enddo
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- endif
- endif
-
- if( axis .eq. 3 )then
- if( dir .eq. -1 )then
-
- do i2=1,n2
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1,i2,2)
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- else if( dir .eq. +1 ) then
-
- do i3=n3-1,n3
- do i2=1,n2
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1,i2,i3)
- enddo
- enddo
- enddo
-
- call mpi_send(
- > buff(1, buff_id ), buff_len,dp_type,
- > nbr( axis, dir, k ), msg_type(axis,dir),
- > mpi_comm_world, ierr)
-
- endif
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine take3_ex( axis, dir, u, n1, n2, n3 )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c take3_ex copies in border data to expand number of processors
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer axis, dir, n1, n2, n3
- double precision u( n1, n2, n3 )
-
- integer buff_id, indx
-
- integer status(mpi_status_size) , ierr
-
- integer i3, i2, i1
-
- call mpi_wait( msg_id( axis, dir, 1 ),status,ierr)
- buff_id = 3 + dir
- indx = 0
-
- if( axis .eq. 1 )then
- if( dir .eq. -1 )then
-
- do i3=1,n3
- do i2=1,n2
- indx = indx + 1
- u(n1,i2,i3) = buff(indx, buff_id )
- enddo
- enddo
-
- else if( dir .eq. +1 ) then
-
- do i3=1,n3
- do i2=1,n2
- do i1=1,2
- indx = indx + 1
- u(i1,i2,i3) = buff(indx,buff_id)
- enddo
- enddo
- enddo
-
- endif
- endif
-
- if( axis .eq. 2 )then
- if( dir .eq. -1 )then
-
- do i3=1,n3
- do i1=1,n1
- indx = indx + 1
- u(i1,n2,i3) = buff(indx, buff_id )
- enddo
- enddo
-
- else if( dir .eq. +1 ) then
-
- do i3=1,n3
- do i2=1,2
- do i1=1,n1
- indx = indx + 1
- u(i1,i2,i3) = buff(indx,buff_id)
- enddo
- enddo
- enddo
-
- endif
- endif
-
- if( axis .eq. 3 )then
- if( dir .eq. -1 )then
-
- do i2=1,n2
- do i1=1,n1
- indx = indx + 1
- u(i1,i2,n3) = buff(indx, buff_id )
- enddo
- enddo
-
- else if( dir .eq. +1 ) then
-
- do i3=1,2
- do i2=1,n2
- do i1=1,n1
- indx = indx + 1
- u(i1,i2,i3) = buff(indx,buff_id)
- enddo
- enddo
- enddo
-
- endif
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine comm1p( axis, u, n1, n2, n3, kk )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer axis, dir, n1, n2, n3
- double precision u( n1, n2, n3 )
-
- integer i3, i2, i1, buff_len,buff_id
- integer i, kk, indx
-
- dir = -1
-
- buff_id = 3 + dir
- buff_len = nm2
-
- do i=1,nm2
- buff(i,buff_id) = 0.0D0
- enddo
-
-
- dir = +1
-
- buff_id = 3 + dir
- buff_len = nm2
-
- do i=1,nm2
- buff(i,buff_id) = 0.0D0
- enddo
-
- dir = +1
-
- buff_id = 2 + dir
- buff_len = 0
-
- if( axis .eq. 1 )then
- do i3=2,n3-1
- do i2=2,n2-1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( n1-1, i2,i3)
- enddo
- enddo
- endif
-
- if( axis .eq. 2 )then
- do i3=2,n3-1
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id )= u( i1,n2-1,i3)
- enddo
- enddo
- endif
-
- if( axis .eq. 3 )then
- do i2=1,n2
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1,i2,n3-1)
- enddo
- enddo
- endif
-
- dir = -1
-
- buff_id = 2 + dir
- buff_len = 0
-
- if( axis .eq. 1 )then
- do i3=2,n3-1
- do i2=2,n2-1
- buff_len = buff_len + 1
- buff(buff_len,buff_id ) = u( 2, i2,i3)
- enddo
- enddo
- endif
-
- if( axis .eq. 2 )then
- do i3=2,n3-1
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1, 2,i3)
- enddo
- enddo
- endif
-
- if( axis .eq. 3 )then
- do i2=1,n2
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1,i2,2)
- enddo
- enddo
- endif
-
- do i=1,nm2
- buff(i,4) = buff(i,3)
- buff(i,2) = buff(i,1)
- enddo
-
- dir = -1
-
- buff_id = 3 + dir
- indx = 0
-
- if( axis .eq. 1 )then
- do i3=2,n3-1
- do i2=2,n2-1
- indx = indx + 1
- u(n1,i2,i3) = buff(indx, buff_id )
- enddo
- enddo
- endif
-
- if( axis .eq. 2 )then
- do i3=2,n3-1
- do i1=1,n1
- indx = indx + 1
- u(i1,n2,i3) = buff(indx, buff_id )
- enddo
- enddo
- endif
-
- if( axis .eq. 3 )then
- do i2=1,n2
- do i1=1,n1
- indx = indx + 1
- u(i1,i2,n3) = buff(indx, buff_id )
- enddo
- enddo
- endif
-
-
- dir = +1
-
- buff_id = 3 + dir
- indx = 0
-
- if( axis .eq. 1 )then
- do i3=2,n3-1
- do i2=2,n2-1
- indx = indx + 1
- u(1,i2,i3) = buff(indx, buff_id )
- enddo
- enddo
- endif
-
- if( axis .eq. 2 )then
- do i3=2,n3-1
- do i1=1,n1
- indx = indx + 1
- u(i1,1,i3) = buff(indx, buff_id )
- enddo
- enddo
- endif
-
- if( axis .eq. 3 )then
- do i2=1,n2
- do i1=1,n1
- indx = indx + 1
- u(i1,i2,1) = buff(indx, buff_id )
- enddo
- enddo
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine comm1p_ex( axis, u, n1, n2, n3, kk )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- include 'globals.h'
-
- integer axis, dir, n1, n2, n3
- double precision u( n1, n2, n3 )
-
- integer i3, i2, i1, buff_len,buff_id
- integer i, kk, indx
-
- if( take_ex( axis, kk ) ) then
-
- dir = -1
-
- buff_id = 3 + dir
- buff_len = nm2
-
- do i=1,nm2
- buff(i,buff_id) = 0.0D0
- enddo
-
-
- dir = +1
-
- buff_id = 3 + dir
- buff_len = nm2
-
- do i=1,nm2
- buff(i,buff_id) = 0.0D0
- enddo
-
-
- dir = -1
-
- buff_id = 3 + dir
- indx = 0
-
- if( axis .eq. 1 )then
- do i3=1,n3
- do i2=1,n2
- indx = indx + 1
- u(n1,i2,i3) = buff(indx, buff_id )
- enddo
- enddo
- endif
-
- if( axis .eq. 2 )then
- do i3=1,n3
- do i1=1,n1
- indx = indx + 1
- u(i1,n2,i3) = buff(indx, buff_id )
- enddo
- enddo
- endif
-
- if( axis .eq. 3 )then
- do i2=1,n2
- do i1=1,n1
- indx = indx + 1
- u(i1,i2,n3) = buff(indx, buff_id )
- enddo
- enddo
- endif
-
- dir = +1
-
- buff_id = 3 + dir
- indx = 0
-
- if( axis .eq. 1 )then
- do i3=1,n3
- do i2=1,n2
- do i1=1,2
- indx = indx + 1
- u(i1,i2,i3) = buff(indx,buff_id)
- enddo
- enddo
- enddo
- endif
-
- if( axis .eq. 2 )then
- do i3=1,n3
- do i2=1,2
- do i1=1,n1
- indx = indx + 1
- u(i1,i2,i3) = buff(indx,buff_id)
- enddo
- enddo
- enddo
- endif
-
- if( axis .eq. 3 )then
- do i3=1,2
- do i2=1,n2
- do i1=1,n1
- indx = indx + 1
- u(i1,i2,i3) = buff(indx,buff_id)
- enddo
- enddo
- enddo
- endif
-
- endif
-
- if( give_ex( axis, kk ) )then
-
- dir = +1
-
- buff_id = 2 + dir
- buff_len = 0
-
- if( axis .eq. 1 )then
- do i3=1,n3
- do i2=1,n2
- do i1=n1-1,n1
- buff_len = buff_len + 1
- buff(buff_len,buff_id)= u(i1,i2,i3)
- enddo
- enddo
- enddo
- endif
-
- if( axis .eq. 2 )then
- do i3=1,n3
- do i2=n2-1,n2
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len,buff_id )= u(i1,i2,i3)
- enddo
- enddo
- enddo
- endif
-
- if( axis .eq. 3 )then
- do i3=n3-1,n3
- do i2=1,n2
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1,i2,i3)
- enddo
- enddo
- enddo
- endif
-
- dir = -1
-
- buff_id = 2 + dir
- buff_len = 0
-
- if( axis .eq. 1 )then
- do i3=1,n3
- do i2=1,n2
- buff_len = buff_len + 1
- buff(buff_len,buff_id ) = u( 2, i2,i3)
- enddo
- enddo
- endif
-
- if( axis .eq. 2 )then
- do i3=1,n3
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1, 2,i3)
- enddo
- enddo
- endif
-
- if( axis .eq. 3 )then
- do i2=1,n2
- do i1=1,n1
- buff_len = buff_len + 1
- buff(buff_len, buff_id ) = u( i1,i2,2)
- enddo
- enddo
- endif
-
- endif
-
- do i=1,nm2
- buff(i,4) = buff(i,3)
- buff(i,2) = buff(i,1)
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine zran3(z,n1,n2,n3,nx,ny,k)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c zran3 loads +1 at ten randomly chosen points,
-c loads -1 at a different ten random points,
-c and zero elsewhere.
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
-
- integer is1, is2, is3, ie1, ie2, ie3
- common /grid/ is1,is2,is3,ie1,ie2,ie3
-
- integer n1, n2, n3, k, nx, ny, ierr, i0, m0, m1
- double precision z(n1,n2,n3)
-
- integer mm, i1, i2, i3, d1, e1, e2, e3
- double precision x, a
- double precision xx, x0, x1, a1, a2, ai, power
- parameter( mm = 10, a = 5.D0 ** 13, x = 314159265.D0)
- double precision ten( mm, 0:1 ), temp, best
- integer i, j1( mm, 0:1 ), j2( mm, 0:1 ), j3( mm, 0:1 )
- integer jg( 0:3, mm, 0:1 ), jg_temp(4)
-
- external randlc
- double precision randlc, rdummy
-
- a1 = power( a, nx, 1, 0 )
- a2 = power( a, nx, ny, 0 )
-
- call zero3(z,n1,n2,n3)
-
-c i = is1-2+nx*(is2-2+ny*(is3-2))
-
- ai = power( a, nx, is2-2+ny*(is3-2), is1-2 )
- d1 = ie1 - is1 + 1
- e1 = ie1 - is1 + 2
- e2 = ie2 - is2 + 2
- e3 = ie3 - is3 + 2
- x0 = x
- rdummy = randlc( x0, ai )
- do i3 = 2, e3
- x1 = x0
- do i2 = 2, e2
- xx = x1
- call vranlc( d1, xx, a, z( 2, i2, i3 ))
- rdummy = randlc( x1, a1 )
- enddo
- rdummy = randlc( x0, a2 )
- enddo
-
-c---------------------------------------------------------------------
-c call comm3(z,n1,n2,n3)
-c call showall(z,n1,n2,n3)
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c each processor looks for twenty candidates
-c---------------------------------------------------------------------
- do i=1,mm
- ten( i, 1 ) = 0.0D0
- j1( i, 1 ) = 0
- j2( i, 1 ) = 0
- j3( i, 1 ) = 0
- ten( i, 0 ) = 1.0D0
- j1( i, 0 ) = 0
- j2( i, 0 ) = 0
- j3( i, 0 ) = 0
- enddo
-
- do i3=2,n3-1
- do i2=2,n2-1
- do i1=2,n1-1
- if( z(i1,i2,i3) .gt. ten( 1, 1 ) )then
- ten(1,1) = z(i1,i2,i3)
- j1(1,1) = i1
- j2(1,1) = i2
- j3(1,1) = i3
- call bubble( ten, j1, j2, j3, mm, 1 )
- endif
- if( z(i1,i2,i3) .lt. ten( 1, 0 ) )then
- ten(1,0) = z(i1,i2,i3)
- j1(1,0) = i1
- j2(1,0) = i2
- j3(1,0) = i3
- call bubble( ten, j1, j2, j3, mm, 0 )
- endif
- enddo
- enddo
- enddo
-
- call mpi_barrier(mpi_comm_world,ierr)
-
-c---------------------------------------------------------------------
-c Now which of these are globally best?
-c---------------------------------------------------------------------
- i1 = mm
- i0 = mm
- do i=mm,1,-1
-
- best = z( j1(i1,1), j2(i1,1), j3(i1,1) )
- call mpi_allreduce(best,temp,1,dp_type,
- > mpi_max,mpi_comm_world,ierr)
- best = temp
- if(best.eq.z(j1(i1,1),j2(i1,1),j3(i1,1)))then
- jg( 0, i, 1) = me
- jg( 1, i, 1) = is1 - 2 + j1( i1, 1 )
- jg( 2, i, 1) = is2 - 2 + j2( i1, 1 )
- jg( 3, i, 1) = is3 - 2 + j3( i1, 1 )
- i1 = i1-1
- else
- jg( 0, i, 1) = 0
- jg( 1, i, 1) = 0
- jg( 2, i, 1) = 0
- jg( 3, i, 1) = 0
- endif
- ten( i, 1 ) = best
- call mpi_allreduce(jg(0,i,1), jg_temp,4,MPI_INTEGER,
- > mpi_max,mpi_comm_world,ierr)
- jg( 0, i, 1) = jg_temp(1)
- jg( 1, i, 1) = jg_temp(2)
- jg( 2, i, 1) = jg_temp(3)
- jg( 3, i, 1) = jg_temp(4)
-
- best = z( j1(i0,0), j2(i0,0), j3(i0,0) )
- call mpi_allreduce(best,temp,1,dp_type,
- > mpi_min,mpi_comm_world,ierr)
- best = temp
- if(best.eq.z(j1(i0,0),j2(i0,0),j3(i0,0)))then
- jg( 0, i, 0) = me
- jg( 1, i, 0) = is1 - 2 + j1( i0, 0 )
- jg( 2, i, 0) = is2 - 2 + j2( i0, 0 )
- jg( 3, i, 0) = is3 - 2 + j3( i0, 0 )
- i0 = i0-1
- else
- jg( 0, i, 0) = 0
- jg( 1, i, 0) = 0
- jg( 2, i, 0) = 0
- jg( 3, i, 0) = 0
- endif
- ten( i, 0 ) = best
- call mpi_allreduce(jg(0,i,0), jg_temp,4,MPI_INTEGER,
- > mpi_max,mpi_comm_world,ierr)
- jg( 0, i, 0) = jg_temp(1)
- jg( 1, i, 0) = jg_temp(2)
- jg( 2, i, 0) = jg_temp(3)
- jg( 3, i, 0) = jg_temp(4)
-
- enddo
- m1 = i1+1
- m0 = i0+1
-
-c if( me .eq. root) then
-c write(*,*)' '
-c write(*,*)' negative charges at'
-c write(*,9)(jg(1,i,0),jg(2,i,0),jg(3,i,0),i=1,mm)
-c write(*,*)' positive charges at'
-c write(*,9)(jg(1,i,1),jg(2,i,1),jg(3,i,1),i=1,mm)
-c write(*,*)' small random numbers were'
-c write(*,8)(ten( i,0),i=mm,1,-1)
-c write(*,*)' and they were found on processor number'
-c write(*,7)(jg(0,i,0),i=mm,1,-1)
-c write(*,*)' large random numbers were'
-c write(*,8)(ten( i,1),i=mm,1,-1)
-c write(*,*)' and they were found on processor number'
-c write(*,7)(jg(0,i,1),i=mm,1,-1)
-c endif
-c 9 format(5(' (',i3,2(',',i3),')'))
-c 8 format(5D15.8)
-c 7 format(10i4)
- call mpi_barrier(mpi_comm_world,ierr)
- do i3=1,n3
- do i2=1,n2
- do i1=1,n1
- z(i1,i2,i3) = 0.0D0
- enddo
- enddo
- enddo
- do i=mm,m0,-1
- z( j1(i,0), j2(i,0), j3(i,0) ) = -1.0D0
- enddo
- do i=mm,m1,-1
- z( j1(i,1), j2(i,1), j3(i,1) ) = +1.0D0
- enddo
- call comm3(z,n1,n2,n3,k)
-
-c---------------------------------------------------------------------
-c call showall(z,n1,n2,n3)
-c---------------------------------------------------------------------
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine show_l(z,n1,n2,n3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
-
- integer n1,n2,n3,i1,i2,i3,ierr
- double precision z(n1,n2,n3)
- integer m1, m2, m3,i
-
- m1 = min(n1,18)
- m2 = min(n2,14)
- m3 = min(n3,18)
-
- write(*,*)' '
- do i=0,nprocs-1
- if( me .eq. i )then
- write(*,*)' id = ', me
- do i3=1,m3
- do i1=1,m1
- write(*,6)(z(i1,i2,i3),i2=1,m2)
- enddo
- write(*,*)' - - - - - - - '
- enddo
- write(*,*)' '
- 6 format(6f15.11)
- endif
- call mpi_barrier(mpi_comm_world,ierr)
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine showall(z,n1,n2,n3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
-
- integer n1,n2,n3,i1,i2,i3,i,ierr
- double precision z(n1,n2,n3)
- integer m1, m2, m3
-
- m1 = min(n1,18)
- m2 = min(n2,14)
- m3 = min(n3,18)
-
- write(*,*)' '
- do i=0,nprocs-1
- if( me .eq. i )then
- write(*,*)' id = ', me
- do i3=1,m3
- do i1=1,m1
- write(*,6)(z(i1,i2,i3),i2=1,m2)
- enddo
- write(*,*)' - - - - - - - '
- enddo
- write(*,*)' '
- 6 format(15f6.3)
- endif
- call mpi_barrier(mpi_comm_world,ierr)
- enddo
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine show(z,n1,n2,n3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
- integer n1,n2,n3,i1,i2,i3,ierr,i
- double precision z(n1,n2,n3)
-
- write(*,*)' '
- do i=0,nprocs-1
- if( me .eq. i )then
- write(*,*)' id = ', me
- do i3=2,n3-1
- do i1=2,n1-1
- write(*,6)(z(i1,i2,i3),i2=2,n1-1)
- enddo
- write(*,*)' - - - - - - - '
- enddo
- write(*,*)' '
- 6 format(8D10.3)
- endif
- call mpi_barrier(mpi_comm_world,ierr)
- enddo
-
-c call comm3(z,n1,n2,n3)
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- double precision function power( a, n1, n2, n3 )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c power raises an integer, disguised as a double
-c precision real, to an integer power.
-c This version tries to avoid integer overflow by treating
-c it as expressed in a form of "n1*n2+n3".
-c---------------------------------------------------------------------
- implicit none
-
- double precision a, aj
- integer n1, n2, n3
-
- integer n1j, n2j, nj
- external randlc
- double precision randlc, rdummy
-
- power = 1.0d0
- aj = a
- nj = n3
- n1j = n1
- n2j = n2
- 100 continue
-
- if( n2j .gt. 0 ) then
- if( mod(n2j,2) .eq. 1 ) nj = nj + n1j
- n2j = n2j/2
- else if( nj .eq. 0 ) then
- go to 200
- endif
- if( mod(nj,2) .eq. 1 ) rdummy = randlc( power, aj )
- rdummy = randlc( aj, aj )
- nj = nj/2
- go to 100
-
- 200 continue
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine bubble( ten, j1, j2, j3, m, ind )
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c bubble does a bubble sort in direction dir
-c---------------------------------------------------------------------
- implicit none
-
- include 'mpinpb.h'
-
- integer m, ind, j1( m, 0:1 ), j2( m, 0:1 ), j3( m, 0:1 )
- double precision ten( m, 0:1 )
- double precision temp
- integer i, j_temp
-
- if( ind .eq. 1 )then
-
- do i=1,m-1
- if( ten(i,ind) .gt. ten(i+1,ind) )then
-
- temp = ten( i+1, ind )
- ten( i+1, ind ) = ten( i, ind )
- ten( i, ind ) = temp
-
- j_temp = j1( i+1, ind )
- j1( i+1, ind ) = j1( i, ind )
- j1( i, ind ) = j_temp
-
- j_temp = j2( i+1, ind )
- j2( i+1, ind ) = j2( i, ind )
- j2( i, ind ) = j_temp
-
- j_temp = j3( i+1, ind )
- j3( i+1, ind ) = j3( i, ind )
- j3( i, ind ) = j_temp
-
- else
- return
- endif
- enddo
-
- else
-
- do i=1,m-1
- if( ten(i,ind) .lt. ten(i+1,ind) )then
-
- temp = ten( i+1, ind )
- ten( i+1, ind ) = ten( i, ind )
- ten( i, ind ) = temp
-
- j_temp = j1( i+1, ind )
- j1( i+1, ind ) = j1( i, ind )
- j1( i, ind ) = j_temp
-
- j_temp = j2( i+1, ind )
- j2( i+1, ind ) = j2( i, ind )
- j2( i, ind ) = j_temp
-
- j_temp = j3( i+1, ind )
- j3( i+1, ind ) = j3( i, ind )
- j3( i, ind ) = j_temp
-
- else
- return
- endif
- enddo
-
- endif
-
- return
- end
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine zero3(z,n1,n2,n3)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
- include 'mpinpb.h'
-
- integer n1, n2, n3
- double precision z(n1,n2,n3)
- integer i1, i2, i3
-
- do i3=1,n3
- do i2=1,n2
- do i1=1,n1
- z(i1,i2,i3)=0.0D0
- enddo
- enddo
- enddo
-
- return
- end
-
-
-c----- end of program ------------------------------------------------
+++ /dev/null
- 8 = top level
- 256 256 256 = nx ny nz
- 20 = nit
- 0 0 0 0 0 0 0 0 = debug_vec
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'mpif.h'
-
- integer me, nprocs, root, dp_type
- common /mpistuff/ me, nprocs, root, dp_type
-
default: header
@ sys/print_instructions
-BT: bt
-bt: header
- cd BT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) SUBTYPE=$(SUBTYPE) VERSION=$(VERSION)
-
-SP: sp
-sp: header
- cd SP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
-LU: lu
-lu: header
- cd LU; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) VERSION=$(VERSION)
-
-MG: mg
-mg: header
- cd MG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
-FT: ft
-ft: header
- cd FT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
IS: is
is: header
cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
is-trace: header
cd IS-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-CG: cg
-cg: header
- cd CG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
-
EP: ep
ep: header
cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS)
+++ /dev/null
-SHELL=/bin/sh
-BENCHMARK=sp
-BENCHMARKU=SP
-
-include ../config/make.def
-
-
-OBJS = sp.o make_set.o initialize.o exact_solution.o exact_rhs.o \
- set_constants.o adi.o define.o copy_faces.o rhs.o \
- lhsx.o lhsy.o lhsz.o x_solve.o ninvr.o y_solve.o pinvr.o \
- z_solve.o tzetar.o add.o txinvr.o error.o verify.o setup_mpi.o \
- ${COMMON}/print_results.o ${COMMON}/timers.o
-
-include ../sys/make.common
-
-# npbparams.h is included by header.h
-# The following rule should do the trick but many make programs (not gmake)
-# will do the wrong thing and rebuild the world every time (because the
-# mod time on header.h is not changed. One solution would be to
-# touch header.h but this might cause confusion if someone has
-# accidentally deleted it. Instead, make the dependency on npbparams.h
-# explicit in all the lines below (even though dependence is indirect).
-
-# header.h: npbparams.h
-
-${PROGRAM}: config ${OBJS}
- ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB}
-
-.f.o:
- ${FCOMPILE} $<
-
-sp.o: sp.f header.h npbparams.h mpinpb.h
-make_set.o: make_set.f header.h npbparams.h mpinpb.h
-initialize.o: initialize.f header.h npbparams.h
-exact_solution.o: exact_solution.f header.h npbparams.h
-exact_rhs.o: exact_rhs.f header.h npbparams.h
-set_constants.o: set_constants.f header.h npbparams.h
-adi.o: adi.f header.h npbparams.h
-define.o: define.f header.h npbparams.h
-copy_faces.o: copy_faces.f header.h npbparams.h mpinpb.h
-rhs.o: rhs.f header.h npbparams.h
-lhsx.o: lhsx.f header.h npbparams.h
-lhsy.o: lhsy.f header.h npbparams.h
-lhsz.o: lhsz.f header.h npbparams.h
-x_solve.o: x_solve.f header.h npbparams.h mpinpb.h
-ninvr.o: ninvr.f header.h npbparams.h
-y_solve.o: y_solve.f header.h npbparams.h mpinpb.h
-pinvr.o: pinvr.f header.h npbparams.h
-z_solve.o: z_solve.f header.h npbparams.h mpinpb.h
-tzetar.o: tzetar.f header.h npbparams.h
-add.o: add.f header.h npbparams.h
-txinvr.o: txinvr.f header.h npbparams.h
-error.o: error.f header.h npbparams.h mpinpb.h
-verify.o: verify.f header.h npbparams.h mpinpb.h
-setup_mpi.o: setup_mpi.f mpinpb.h npbparams.h
-
-
-clean:
- - rm -f *.o *~ mputil*
- - rm -f npbparams.h core
+++ /dev/null
-
-This code implements a 3D Multi-partition algorithm for the solution
-of the uncoupled systems of linear equations resulting from
-Beam-Warming approximate factorization. Consequently, the program
-must be run on a square number of processors. The included file
-"npbparams.h" contains a parameter statement which sets "maxcells"
-and "problem_size". The parameter maxcells must be set to the
-square root of the number of processors. For example, if running
-on 25 processors, then set max_cells=5. The standard problem sizes
-are problem_size=64 for class A, 102 for class B, and 162 for class C.
-
-The number of time steps and the time step size dt are set in the
-npbparams.h but may be overridden in the input deck "inputsp.data".
-The number of time steps is 400 for all three
-standard problems, and the appropriate time step sizes "dt" are
-0.0015d0 for class A, 0.001d0 for class B, and 0.00067 for class C.
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine add
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c addition of update to the vector u
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer c, i, j, k, m
-
- do c = 1, ncells
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- u(i,j,k,m,c) = u(i,j,k,m,c) + rhs(i,j,k,m,c)
- end do
- end do
- end do
- end do
- end do
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine adi
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- call copy_faces
-
- call txinvr
-
- call x_solve
-
- call y_solve
-
- call z_solve
-
- call add
-
- return
- end
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine copy_faces
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function copies the face values of a variable defined on a set
-c of cells to the overlap locations of the adjacent sets of cells.
-c Because a set of cells interfaces in each direction with exactly one
-c other set, we only need to fill six different buffers. We could try to
-c overlap communication with computation, by computing
-c some internal values while communicating boundary values, but this
-c adds so much overhead that it's not clearly useful.
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i, j, k, c, m, requests(0:11), p0, p1,
- > p2, p3, p4, p5, b_size(0:5), ss(0:5),
- > sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11)
-
-c---------------------------------------------------------------------
-c exit immediately if there are no faces to be copied
-c---------------------------------------------------------------------
- if (no_nodes .eq. 1) then
- call compute_rhs
- return
- endif
-
-
- ss(0) = start_send_east
- ss(1) = start_send_west
- ss(2) = start_send_north
- ss(3) = start_send_south
- ss(4) = start_send_top
- ss(5) = start_send_bottom
-
- sr(0) = start_recv_east
- sr(1) = start_recv_west
- sr(2) = start_recv_north
- sr(3) = start_recv_south
- sr(4) = start_recv_top
- sr(5) = start_recv_bottom
-
- b_size(0) = east_size
- b_size(1) = west_size
- b_size(2) = north_size
- b_size(3) = south_size
- b_size(4) = top_size
- b_size(5) = bottom_size
-
-c---------------------------------------------------------------------
-c because the difference stencil for the diagonalized scheme is
-c orthogonal, we do not have to perform the staged copying of faces,
-c but can send all face information simultaneously to the neighboring
-c cells in all directions
-c---------------------------------------------------------------------
- p0 = 0
- p1 = 0
- p2 = 0
- p3 = 0
- p4 = 0
- p5 = 0
-
- do c = 1, ncells
- do m = 1, 5
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to eastern neighbors (i-dir)
-c---------------------------------------------------------------------
- if (cell_coord(1,c) .ne. ncells) then
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = cell_size(1,c)-2, cell_size(1,c)-1
- out_buffer(ss(0)+p0) = u(i,j,k,m,c)
- p0 = p0 + 1
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to western neighbors
-c---------------------------------------------------------------------
- if (cell_coord(1,c) .ne. 1) then
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = 0, 1
- out_buffer(ss(1)+p1) = u(i,j,k,m,c)
- p1 = p1 + 1
- end do
- end do
- end do
-
-
- endif
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to northern neighbors (j_dir)
-c---------------------------------------------------------------------
- if (cell_coord(2,c) .ne. ncells) then
- do k = 0, cell_size(3,c)-1
- do j = cell_size(2,c)-2, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- out_buffer(ss(2)+p2) = u(i,j,k,m,c)
- p2 = p2 + 1
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to southern neighbors
-c---------------------------------------------------------------------
- if (cell_coord(2,c).ne. 1) then
- do k = 0, cell_size(3,c)-1
- do j = 0, 1
- do i = 0, cell_size(1,c)-1
- out_buffer(ss(3)+p3) = u(i,j,k,m,c)
- p3 = p3 + 1
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to top neighbors (k-dir)
-c---------------------------------------------------------------------
- if (cell_coord(3,c) .ne. ncells) then
- do k = cell_size(3,c)-2, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- out_buffer(ss(4)+p4) = u(i,j,k,m,c)
- p4 = p4 + 1
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c fill the buffer to be sent to bottom neighbors
-c---------------------------------------------------------------------
- if (cell_coord(3,c).ne. 1) then
- do k=0, 1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- out_buffer(ss(5)+p5) = u(i,j,k,m,c)
- p5 = p5 + 1
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c m loop
-c---------------------------------------------------------------------
- end do
-
-c---------------------------------------------------------------------
-c cell loop
-c---------------------------------------------------------------------
- end do
-
- call mpi_irecv(in_buffer(sr(0)), b_size(0),
- > dp_type, successor(1), WEST,
- > comm_rhs, requests(0), error)
- call mpi_irecv(in_buffer(sr(1)), b_size(1),
- > dp_type, predecessor(1), EAST,
- > comm_rhs, requests(1), error)
- call mpi_irecv(in_buffer(sr(2)), b_size(2),
- > dp_type, successor(2), SOUTH,
- > comm_rhs, requests(2), error)
- call mpi_irecv(in_buffer(sr(3)), b_size(3),
- > dp_type, predecessor(2), NORTH,
- > comm_rhs, requests(3), error)
- call mpi_irecv(in_buffer(sr(4)), b_size(4),
- > dp_type, successor(3), BOTTOM,
- > comm_rhs, requests(4), error)
- call mpi_irecv(in_buffer(sr(5)), b_size(5),
- > dp_type, predecessor(3), TOP,
- > comm_rhs, requests(5), error)
-
- call mpi_isend(out_buffer(ss(0)), b_size(0),
- > dp_type, successor(1), EAST,
- > comm_rhs, requests(6), error)
- call mpi_isend(out_buffer(ss(1)), b_size(1),
- > dp_type, predecessor(1), WEST,
- > comm_rhs, requests(7), error)
- call mpi_isend(out_buffer(ss(2)), b_size(2),
- > dp_type,successor(2), NORTH,
- > comm_rhs, requests(8), error)
- call mpi_isend(out_buffer(ss(3)), b_size(3),
- > dp_type,predecessor(2), SOUTH,
- > comm_rhs, requests(9), error)
- call mpi_isend(out_buffer(ss(4)), b_size(4),
- > dp_type,successor(3), TOP,
- > comm_rhs, requests(10), error)
- call mpi_isend(out_buffer(ss(5)), b_size(5),
- > dp_type,predecessor(3), BOTTOM,
- > comm_rhs,requests(11), error)
-
-
- call mpi_waitall(12, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c unpack the data that has just been received;
-c---------------------------------------------------------------------
- p0 = 0
- p1 = 0
- p2 = 0
- p3 = 0
- p4 = 0
- p5 = 0
-
- do c = 1, ncells
- do m = 1, 5
-
- if (cell_coord(1,c) .ne. 1) then
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = -2, -1
- u(i,j,k,m,c) = in_buffer(sr(1)+p0)
- p0 = p0 + 1
- end do
- end do
- end do
- endif
-
- if (cell_coord(1,c) .ne. ncells) then
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = cell_size(1,c), cell_size(1,c)+1
- u(i,j,k,m,c) = in_buffer(sr(0)+p1)
- p1 = p1 + 1
- end do
- end do
- end do
- end if
-
- if (cell_coord(2,c) .ne. 1) then
- do k = 0, cell_size(3,c)-1
- do j = -2, -1
- do i = 0, cell_size(1,c)-1
- u(i,j,k,m,c) = in_buffer(sr(3)+p2)
- p2 = p2 + 1
- end do
- end do
- end do
-
- endif
-
- if (cell_coord(2,c) .ne. ncells) then
- do k = 0, cell_size(3,c)-1
- do j = cell_size(2,c), cell_size(2,c)+1
- do i = 0, cell_size(1,c)-1
- u(i,j,k,m,c) = in_buffer(sr(2)+p3)
- p3 = p3 + 1
- end do
- end do
- end do
- endif
-
- if (cell_coord(3,c) .ne. 1) then
- do k = -2, -1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- u(i,j,k,m,c) = in_buffer(sr(5)+p4)
- p4 = p4 + 1
- end do
- end do
- end do
- endif
-
- if (cell_coord(3,c) .ne. ncells) then
- do k = cell_size(3,c), cell_size(3,c)+1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- u(i,j,k,m,c) = in_buffer(sr(4)+p5)
- p5 = p5 + 1
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c m loop
-c---------------------------------------------------------------------
- end do
-
-c---------------------------------------------------------------------
-c cells loop
-c---------------------------------------------------------------------
- end do
-
-c---------------------------------------------------------------------
-c now that we have all the data, compute the rhs
-c---------------------------------------------------------------------
- call compute_rhs
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine compute_buffer_size(dim)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer c, dim, face_size
-
- if (ncells .eq. 1) return
-
-c---------------------------------------------------------------------
-c compute the actual sizes of the buffers; note that there is
-c always one cell face that doesn't need buffer space, because it
-c is at the boundary of the grid
-c---------------------------------------------------------------------
-
- west_size = 0
- east_size = 0
-
- do c = 1, ncells
- face_size = cell_size(2,c) * cell_size(3,c) * dim * 2
- if (cell_coord(1,c).ne.1) west_size = west_size + face_size
- if (cell_coord(1,c).ne.ncells) east_size = east_size +
- > face_size
- end do
-
- north_size = 0
- south_size = 0
- do c = 1, ncells
- face_size = cell_size(1,c)*cell_size(3,c) * dim * 2
- if (cell_coord(2,c).ne.1) south_size = south_size + face_size
- if (cell_coord(2,c).ne.ncells) north_size = north_size +
- > face_size
- end do
-
- top_size = 0
- bottom_size = 0
- do c = 1, ncells
- face_size = cell_size(1,c) * cell_size(2,c) * dim * 2
- if (cell_coord(3,c).ne.1) bottom_size = bottom_size +
- > face_size
- if (cell_coord(3,c).ne.ncells) top_size = top_size +
- > face_size
- end do
-
- start_send_west = 1
- start_send_east = start_send_west + west_size
- start_send_south = start_send_east + east_size
- start_send_north = start_send_south + south_size
- start_send_bottom = start_send_north + north_size
- start_send_top = start_send_bottom + bottom_size
- start_recv_west = 1
- start_recv_east = start_recv_west + west_size
- start_recv_south = start_recv_east + east_size
- start_recv_north = start_recv_south + south_size
- start_recv_bottom = start_recv_north + north_size
- start_recv_top = start_recv_bottom + bottom_size
-
- return
- end
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine error_norm(rms)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function computes the norm of the difference between the
-c computed solution and the exact solution
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer c, i, j, k, m, ii, jj, kk, d, error
- double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5),
- > add
-
- do m = 1, 5
- rms_work(m) = 0.0d0
- end do
-
- do c = 1, ncells
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- ii = 0
- do i = cell_low(1,c), cell_high(1,c)
- xi = dble(i) * dnxm1
- call exact_solution(xi, eta, zeta, u_exact)
-
- do m = 1, 5
- add = u(ii,jj,kk,m,c)-u_exact(m)
- rms_work(m) = rms_work(m) + add*add
- end do
- ii = ii + 1
- end do
- jj = jj + 1
- end do
- kk = kk + 1
- end do
- end do
-
- call mpi_allreduce(rms_work, rms, 5, dp_type,
- > MPI_SUM, comm_setup, error)
-
- do m = 1, 5
- do d = 1, 3
- rms(m) = rms(m) / dble(grid_points(d)-2)
- end do
- rms(m) = dsqrt(rms(m))
- end do
-
- return
- end
-
-
-
- subroutine rhs_norm(rms)
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer c, i, j, k, d, m, error
- double precision rms(5), rms_work(5), add
-
- do m = 1, 5
- rms_work(m) = 0.0d0
- end do
-
- do c = 1, ncells
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- do m = 1, 5
- add = rhs(i,j,k,m,c)
- rms_work(m) = rms_work(m) + add*add
- end do
- end do
- end do
- end do
- end do
-
-
-
- call mpi_allreduce(rms_work, rms, 5, dp_type,
- > MPI_SUM, comm_setup, error)
-
- do m = 1, 5
- do d = 1, 3
- rms(m) = rms(m) / dble(grid_points(d)-2)
- end do
- rms(m) = dsqrt(rms(m))
- end do
-
- return
- end
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine exact_rhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c compute the right hand side based on exact solution
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- double precision dtemp(5), xi, eta, zeta, dtpp
- integer c, m, i, j, k, ip1, im1, jp1,
- > jm1, km1, kp1
-
-c---------------------------------------------------------------------
-c loop over all cells owned by this node
-c---------------------------------------------------------------------
- do c = 1, ncells
-
-c---------------------------------------------------------------------
-c initialize
-c---------------------------------------------------------------------
- do m = 1, 5
- do k= 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- forcing(i,j,k,m,c) = 0.0d0
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c xi-direction flux differences
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- zeta = dble(k+cell_low(3,c)) * dnzm1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- eta = dble(j+cell_low(2,c)) * dnym1
-
- do i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c)
- xi = dble(i+cell_low(1,c)) * dnxm1
-
- call exact_solution(xi, eta, zeta, dtemp)
- do m = 1, 5
- ue(i,m) = dtemp(m)
- end do
-
- dtpp = 1.0d0 / dtemp(1)
-
- do m = 2, 5
- buf(i,m) = dtpp * dtemp(m)
- end do
-
- cuf(i) = buf(i,2) * buf(i,2)
- buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) +
- > buf(i,4) * buf(i,4)
- q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) +
- > buf(i,4)*ue(i,4))
-
- end do
-
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- im1 = i-1
- ip1 = i+1
-
- forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
- > tx2*( ue(ip1,2)-ue(im1,2) )+
- > dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1))
-
- forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tx2 * (
- > (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))-
- > (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+
- > xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+
- > dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2))
-
- forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tx2 * (
- > ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+
- > xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+
- > dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3))
-
- forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tx2*(
- > ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+
- > xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+
- > dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4))
-
- forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tx2*(
- > buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))-
- > buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+
- > 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+
- > buf(im1,1))+
- > xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+
- > xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+
- > dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5))
- end do
-
-c---------------------------------------------------------------------
-c Fourth-order dissipation
-c---------------------------------------------------------------------
- if (start(1,c) .gt. 0) then
- do m = 1, 5
- i = 1
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m))
- i = 2
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) -
- > 4.0d0*ue(i+1,m) + ue(i+2,m))
- end do
- endif
-
- do m = 1, 5
- do i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
- > (ue(i-2,m) - 4.0d0*ue(i-1,m) +
- > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m))
- end do
- end do
-
- if (end(1,c) .gt. 0) then
- do m = 1, 5
- i = cell_size(1,c)-3
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (ue(i-2,m) - 4.0d0*ue(i-1,m) +
- > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m))
- i = cell_size(1,c)-2
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m))
- end do
- endif
-
- end do
- end do
-c---------------------------------------------------------------------
-c eta-direction flux differences
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- zeta = dble(k+cell_low(3,c)) * dnzm1
- do i=start(1,c), cell_size(1,c)-end(1,c)-1
- xi = dble(i+cell_low(1,c)) * dnxm1
-
- do j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c)
- eta = dble(j+cell_low(2,c)) * dnym1
-
- call exact_solution(xi, eta, zeta, dtemp)
- do m = 1, 5
- ue(j,m) = dtemp(m)
- end do
- dtpp = 1.0d0/dtemp(1)
-
- do m = 2, 5
- buf(j,m) = dtpp * dtemp(m)
- end do
-
- cuf(j) = buf(j,3) * buf(j,3)
- buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) +
- > buf(j,4) * buf(j,4)
- q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) +
- > buf(j,4)*ue(j,4))
- end do
-
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- jm1 = j-1
- jp1 = j+1
-
- forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
- > ty2*( ue(jp1,3)-ue(jm1,3) )+
- > dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1))
-
- forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - ty2*(
- > ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+
- > yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+
- > dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2))
-
- forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - ty2*(
- > (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))-
- > (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+
- > yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+
- > dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3))
-
- forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - ty2*(
- > ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+
- > yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+
- > dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4))
-
- forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - ty2*(
- > buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))-
- > buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+
- > 0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+
- > buf(jm1,1))+
- > yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+
- > yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+
- > dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5))
- end do
-
-c---------------------------------------------------------------------
-c Fourth-order dissipation
-c---------------------------------------------------------------------
- if (start(2,c) .gt. 0) then
- do m = 1, 5
- j = 1
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m))
- j = 2
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) -
- > 4.0d0*ue(j+1,m) + ue(j+2,m))
- end do
- endif
-
- do m = 1, 5
- do j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
- > (ue(j-2,m) - 4.0d0*ue(j-1,m) +
- > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m))
- end do
- end do
- if (end(2,c) .gt. 0) then
- do m = 1, 5
- j = cell_size(2,c)-3
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (ue(j-2,m) - 4.0d0*ue(j-1,m) +
- > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m))
- j = cell_size(2,c)-2
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m))
-
- end do
- endif
-
- end do
- end do
-
-c---------------------------------------------------------------------
-c zeta-direction flux differences
-c---------------------------------------------------------------------
- do j=start(2,c), cell_size(2,c)-end(2,c)-1
- eta = dble(j+cell_low(2,c)) * dnym1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- xi = dble(i+cell_low(1,c)) * dnxm1
-
- do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c)
- zeta = dble(k+cell_low(3,c)) * dnzm1
-
- call exact_solution(xi, eta, zeta, dtemp)
- do m = 1, 5
- ue(k,m) = dtemp(m)
- end do
-
- dtpp = 1.0d0/dtemp(1)
-
- do m = 2, 5
- buf(k,m) = dtpp * dtemp(m)
- end do
-
- cuf(k) = buf(k,4) * buf(k,4)
- buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) +
- > buf(k,3) * buf(k,3)
- q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) +
- > buf(k,4)*ue(k,4))
- end do
-
- do k=start(3,c), cell_size(3,c)-end(3,c)-1
- km1 = k-1
- kp1 = k+1
-
- forcing(i,j,k,1,c) = forcing(i,j,k,1,c) -
- > tz2*( ue(kp1,4)-ue(km1,4) )+
- > dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1))
-
- forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tz2 * (
- > ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+
- > zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+
- > dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2))
-
- forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tz2 * (
- > ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+
- > zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+
- > dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3))
-
- forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tz2 * (
- > (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))-
- > (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+
- > zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+
- > dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4))
-
- forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tz2 * (
- > buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))-
- > buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+
- > 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1)
- > +buf(km1,1))+
- > zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+
- > zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+
- > dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5))
- end do
-
-c---------------------------------------------------------------------
-c Fourth-order dissipation
-c---------------------------------------------------------------------
- if (start(3,c) .gt. 0) then
- do m = 1, 5
- k = 1
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m))
- k = 2
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) -
- > 4.0d0*ue(k+1,m) + ue(k+2,m))
- end do
- endif
-
- do m = 1, 5
- do k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp*
- > (ue(k-2,m) - 4.0d0*ue(k-1,m) +
- > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m))
- end do
- end do
-
- if (end(3,c) .gt. 0) then
- do m = 1, 5
- k = cell_size(3,c)-3
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (ue(k-2,m) - 4.0d0*ue(k-1,m) +
- > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m))
- k = cell_size(3,c)-2
- forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp *
- > (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m))
- end do
- endif
-
- end do
- end do
-c---------------------------------------------------------------------
-c now change the sign of the forcing function,
-c---------------------------------------------------------------------
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- forcing(i,j,k,m,c) = -1.d0 * forcing(i,j,k,m,c)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c cell loop
-c---------------------------------------------------------------------
- end do
-
- return
- end
-
-
-
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine exact_solution(xi,eta,zeta,dtemp)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function returns the exact solution at point xi, eta, zeta
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- double precision xi, eta, zeta, dtemp(5)
- integer m
-
- do m = 1, 5
- dtemp(m) = ce(m,1) +
- > xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) +
- > eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+
- > zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) +
- > zeta*ce(m,13))))
- end do
-
- return
- end
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- implicit none
-
-c---------------------------------------------------------------------
-c The following include file is generated automatically by the
-c "setparams" utility. It defines
-c maxcells: the square root of the maximum number of processors
-c problem_size: 12, 64, 102, 162 (for class T, A, B, C)
-c dt_default: default time step for this problem size if no
-c config file
-c niter_default: default number of iterations for this problem size
-c---------------------------------------------------------------------
-
- include 'npbparams.h'
-
- integer ncells, grid_points(3)
- common /global/ ncells, grid_points
-
- double precision tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
- > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4,
- > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt,
- > ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2,
- > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
- > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
- > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
- > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1,
- > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1,
- > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2,
- > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
- > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1,
- > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6,
- > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
-
- common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
- > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4,
- > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt,
- > ce, dxmax, dymax, dzmax, xxcon1, xxcon2,
- > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
- > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
- > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
- > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1,
- > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1,
- > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2,
- > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
- > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1,
- > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6,
- > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
-
- integer EAST, WEST, NORTH, SOUTH,
- > BOTTOM, TOP
-
- parameter (EAST=2000, WEST=3000, NORTH=4000, SOUTH=5000,
- > BOTTOM=6000, TOP=7000)
-
- integer cell_coord (3,maxcells), cell_low (3,maxcells),
- > cell_high (3,maxcells), cell_size(3,maxcells),
- > predecessor(3), slice (3,maxcells),
- > grid_size (3), successor(3),
- > start (3,maxcells), end (3,maxcells)
- common /partition/ cell_coord, cell_low, cell_high, cell_size,
- > grid_size, successor, predecessor, slice,
- > start, end
-
- integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE, IMAXP, JMAXP
-
- parameter (MAX_CELL_DIM = (problem_size/maxcells)+1)
-
- parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM)
- parameter (IMAXP=IMAX/2*2+1,JMAXP=JMAX/2*2+1)
-
-c---------------------------------------------------------------------
-c +1 at end to avoid zero length arrays for 1 node
-c---------------------------------------------------------------------
- parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60*2+1)
-
- double precision
- > u (-2:IMAXP+1,-2:JMAXP+1,-2:KMAX+1, 5,maxcells),
- > us (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > vs (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > ws (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > qs (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > ainv (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > rho_i (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > speed (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > square (-1:IMAX, -1:JMAX, -1:KMAX, maxcells),
- > rhs ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells),
- > forcing ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells),
- > lhs ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1,15,maxcells),
- > in_buffer(BUF_SIZE), out_buffer(BUF_SIZE)
- common /fields/ u, us, vs, ws, qs, ainv, rho_i, speed, square,
- > rhs, forcing, lhs, in_buffer, out_buffer
-
- double precision cv(-2:MAX_CELL_DIM+1), rhon(-2:MAX_CELL_DIM+1),
- > rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1),
- > cuf(-2:MAX_CELL_DIM+1), q(-2:MAX_CELL_DIM+1),
- > ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5)
- common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf
-
- integer west_size, east_size, bottom_size, top_size,
- > north_size, south_size, start_send_west,
- > start_send_east, start_send_south, start_send_north,
- > start_send_bottom, start_send_top, start_recv_west,
- > start_recv_east, start_recv_south, start_recv_north,
- > start_recv_bottom, start_recv_top
- common /box/ west_size, east_size, bottom_size,
- > top_size, north_size, south_size,
- > start_send_west, start_send_east, start_send_south,
- > start_send_north, start_send_bottom, start_send_top,
- > start_recv_west, start_recv_east, start_recv_south,
- > start_recv_north, start_recv_bottom, start_recv_top
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine initialize
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This subroutine initializes the field variable u using
-c tri-linear transfinite interpolation of the boundary values
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer c, i, j, k, m, ii, jj, kk, ix, iy, iz
- double precision xi, eta, zeta, Pface(5,3,2), Pxi, Peta,
- > Pzeta, temp(5)
-
-
-c---------------------------------------------------------------------
-c Later (in compute_rhs) we compute 1/u for every element. A few of
-c the corner elements are not used, but it convenient (and faster)
-c to compute the whole thing with a simple loop. Make sure those
-c values are nonzero by initializing the whole thing here.
-c---------------------------------------------------------------------
- do c = 1, ncells
- do kk = -1, IMAX
- do jj = -1, IMAX
- do ii = -1, IMAX
- u(ii, jj, kk, 1, c) = 1.0
- u(ii, jj, kk, 2, c) = 0.0
- u(ii, jj, kk, 3, c) = 0.0
- u(ii, jj, kk, 4, c) = 0.0
- u(ii, jj, kk, 5, c) = 1.0
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c first store the "interpolated" values everywhere on the grid
-c---------------------------------------------------------------------
- do c=1, ncells
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- ii = 0
- do i = cell_low(1,c), cell_high(1,c)
- xi = dble(i) * dnxm1
-
- do ix = 1, 2
- call exact_solution(dble(ix-1), eta, zeta,
- > Pface(1,1,ix))
- end do
-
- do iy = 1, 2
- call exact_solution(xi, dble(iy-1) , zeta,
- > Pface(1,2,iy))
- end do
-
- do iz = 1, 2
- call exact_solution(xi, eta, dble(iz-1),
- > Pface(1,3,iz))
- end do
-
- do m = 1, 5
- Pxi = xi * Pface(m,1,2) +
- > (1.0d0-xi) * Pface(m,1,1)
- Peta = eta * Pface(m,2,2) +
- > (1.0d0-eta) * Pface(m,2,1)
- Pzeta = zeta * Pface(m,3,2) +
- > (1.0d0-zeta) * Pface(m,3,1)
-
- u(ii,jj,kk,m,c) = Pxi + Peta + Pzeta -
- > Pxi*Peta - Pxi*Pzeta - Peta*Pzeta +
- > Pxi*Peta*Pzeta
-
- end do
- ii = ii + 1
- end do
- jj = jj + 1
- end do
- kk = kk+1
- end do
- end do
-
-c---------------------------------------------------------------------
-c now store the exact values on the boundaries
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c west face
-c---------------------------------------------------------------------
- c = slice(1,1)
- ii = 0
- xi = 0.0d0
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(ii,jj,kk,m,c) = temp(m)
- end do
- jj = jj + 1
- end do
- kk = kk + 1
- end do
-
-c---------------------------------------------------------------------
-c east face
-c---------------------------------------------------------------------
- c = slice(1,ncells)
- ii = cell_size(1,c)-1
- xi = 1.0d0
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(ii,jj,kk,m,c) = temp(m)
- end do
- jj = jj + 1
- end do
- kk = kk + 1
- end do
-
-c---------------------------------------------------------------------
-c south face
-c---------------------------------------------------------------------
- c = slice(2,1)
- jj = 0
- eta = 0.0d0
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- ii = 0
- do i = cell_low(1,c), cell_high(1,c)
- xi = dble(i) * dnxm1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(ii,jj,kk,m,c) = temp(m)
- end do
- ii = ii + 1
- end do
- kk = kk + 1
- end do
-
-
-c---------------------------------------------------------------------
-c north face
-c---------------------------------------------------------------------
- c = slice(2,ncells)
- jj = cell_size(2,c)-1
- eta = 1.0d0
- kk = 0
- do k = cell_low(3,c), cell_high(3,c)
- zeta = dble(k) * dnzm1
- ii = 0
- do i = cell_low(1,c), cell_high(1,c)
- xi = dble(i) * dnxm1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(ii,jj,kk,m,c) = temp(m)
- end do
- ii = ii + 1
- end do
- kk = kk + 1
- end do
-
-c---------------------------------------------------------------------
-c bottom face
-c---------------------------------------------------------------------
- c = slice(3,1)
- kk = 0
- zeta = 0.0d0
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- ii = 0
- do i =cell_low(1,c), cell_high(1,c)
- xi = dble(i) *dnxm1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(ii,jj,kk,m,c) = temp(m)
- end do
- ii = ii + 1
- end do
- jj = jj + 1
- end do
-
-c---------------------------------------------------------------------
-c top face
-c---------------------------------------------------------------------
- c = slice(3,ncells)
- kk = cell_size(3,c)-1
- zeta = 1.0d0
- jj = 0
- do j = cell_low(2,c), cell_high(2,c)
- eta = dble(j) * dnym1
- ii = 0
- do i =cell_low(1,c), cell_high(1,c)
- xi = dble(i) * dnxm1
- call exact_solution(xi, eta, zeta, temp)
- do m = 1, 5
- u(ii,jj,kk,m,c) = temp(m)
- end do
- ii = ii + 1
- end do
- jj = jj + 1
- end do
-
- return
- end
-
-
- subroutine lhsinit
-
- include 'header.h'
-
- integer i, j, k, d, c, n
-
-c---------------------------------------------------------------------
-c loop over all cells
-c---------------------------------------------------------------------
- do c = 1, ncells
-
-c---------------------------------------------------------------------
-c first, initialize the start and end arrays
-c---------------------------------------------------------------------
- do d = 1, 3
- if (cell_coord(d,c) .eq. 1) then
- start(d,c) = 1
- else
- start(d,c) = 0
- endif
- if (cell_coord(d,c) .eq. ncells) then
- end(d,c) = 1
- else
- end(d,c) = 0
- endif
- end do
-
-c---------------------------------------------------------------------
-c zap the whole left hand side for starters
-c---------------------------------------------------------------------
- do n = 1, 15
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- lhs(i,j,k,n,c) = 0.0d0
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c next, set all diagonal values to 1. This is overkill, but convenient
-c---------------------------------------------------------------------
- do n = 1, 3
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- lhs(i,j,k,5*n-2,c) = 1.0d0
- end do
- end do
- end do
- end do
-
- end do
-
- return
- end
-
-
-
+++ /dev/null
-400 number of time steps
-0.0015d0 dt for class A = 0.0015d0. class B = 0.001d0 class C = 0.00067d0
-64 64 64
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine lhsx(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This function computes the left hand side for the three x-factors
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- double precision ru1
- integer i, j, k, c
-
-
-c---------------------------------------------------------------------
-c treat only cell c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c first fill the lhs for the u-eigenvalue
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c)-1, cell_size(1,c)-end(1,c)
- ru1 = c3c4*rho_i(i,j,k,c)
- cv(i) = us(i,j,k,c)
- rhon(i) = dmax1(dx2+con43*ru1,
- > dx5+c1c5*ru1,
- > dxmax+ru1,
- > dx1)
- end do
-
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- lhs(i,j,k,1,c) = 0.0d0
- lhs(i,j,k,2,c) = - dttx2 * cv(i-1) - dttx1 * rhon(i-1)
- lhs(i,j,k,3,c) = 1.0d0 + c2dttx1 * rhon(i)
- lhs(i,j,k,4,c) = dttx2 * cv(i+1) - dttx1 * rhon(i+1)
- lhs(i,j,k,5,c) = 0.0d0
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c add fourth order dissipation
-c---------------------------------------------------------------------
- if (start(1,c) .gt. 0) then
- i = 1
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
- lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
- lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
-
- lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4
- lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz6
- lhs(i+1,j,k,4,c) = lhs(i+1,j,k,4,c) - comz4
- lhs(i+1,j,k,5,c) = lhs(i+1,j,k,5,c) + comz1
- end do
- end do
- endif
-
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i=3*start(1,c), cell_size(1,c)-3*end(1,c)-1
- lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
- lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
- lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
- lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
- lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
- end do
- end do
- end do
-
- if (end(1,c) .gt. 0) then
- i = cell_size(1,c)-3
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
- lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
- lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
- lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-
- lhs(i+1,j,k,1,c) = lhs(i+1,j,k,1,c) + comz1
- lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4
- lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz5
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c subsequently, fill the other factors (u+c), (u-c) by a4ing to
-c the first
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c)
- lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) -
- > dttx2 * speed(i-1,j,k,c)
- lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c)
- lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) +
- > dttx2 * speed(i+1,j,k,c)
- lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
- lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
- lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) +
- > dttx2 * speed(i-1,j,k,c)
- lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
- lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) -
- > dttx2 * speed(i+1,j,k,c)
- lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
- end do
- end do
- end do
-
- return
- end
-
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine lhsy(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This function computes the left hand side for the three y-factors
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- double precision ru1
- integer i, j, k, c
-
-c---------------------------------------------------------------------
-c treat only cell c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c first fill the lhs for the u-eigenvalue
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
-
- do j = start(2,c)-1, cell_size(2,c)-end(2,c)
- ru1 = c3c4*rho_i(i,j,k,c)
- cv(j) = vs(i,j,k,c)
- rhoq(j) = dmax1( dy3 + con43 * ru1,
- > dy5 + c1c5*ru1,
- > dymax + ru1,
- > dy1)
- end do
-
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- lhs(i,j,k,1,c) = 0.0d0
- lhs(i,j,k,2,c) = -dtty2 * cv(j-1) - dtty1 * rhoq(j-1)
- lhs(i,j,k,3,c) = 1.0 + c2dtty1 * rhoq(j)
- lhs(i,j,k,4,c) = dtty2 * cv(j+1) - dtty1 * rhoq(j+1)
- lhs(i,j,k,5,c) = 0.0d0
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c add fourth order dissipation
-c---------------------------------------------------------------------
- if (start(2,c) .gt. 0) then
- j = 1
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
-
- lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
- lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
- lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
-
- lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4
- lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz6
- lhs(i,j+1,k,4,c) = lhs(i,j+1,k,4,c) - comz4
- lhs(i,j+1,k,5,c) = lhs(i,j+1,k,5,c) + comz1
- end do
- end do
- endif
-
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j=3*start(2,c), cell_size(2,c)-3*end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
-
- lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
- lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
- lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
- lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
- lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
- end do
- end do
- end do
-
- if (end(2,c) .gt. 0) then
- j = cell_size(2,c)-3
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
- lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
- lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
- lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-
- lhs(i,j+1,k,1,c) = lhs(i,j+1,k,1,c) + comz1
- lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4
- lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz5
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c subsequently, do the other two factors
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c)
- lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) -
- > dtty2 * speed(i,j-1,k,c)
- lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c)
- lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) +
- > dtty2 * speed(i,j+1,k,c)
- lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
- lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
- lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) +
- > dtty2 * speed(i,j-1,k,c)
- lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
- lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) -
- > dtty2 * speed(i,j+1,k,c)
- lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
- end do
- end do
- end do
-
- return
- end
-
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine lhsz(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This function computes the left hand side for the three z-factors
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- double precision ru1
- integer i, j, k, c
-
-c---------------------------------------------------------------------
-c treat only cell c
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c first fill the lhs for the u-eigenvalue
-c---------------------------------------------------------------------
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
-
- do k = start(3,c)-1, cell_size(3,c)-end(3,c)
- ru1 = c3c4*rho_i(i,j,k,c)
- cv(k) = ws(i,j,k,c)
- rhos(k) = dmax1(dz4 + con43 * ru1,
- > dz5 + c1c5 * ru1,
- > dzmax + ru1,
- > dz1)
- end do
-
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- lhs(i,j,k,1,c) = 0.0d0
- lhs(i,j,k,2,c) = -dttz2 * cv(k-1) - dttz1 * rhos(k-1)
- lhs(i,j,k,3,c) = 1.0 + c2dttz1 * rhos(k)
- lhs(i,j,k,4,c) = dttz2 * cv(k+1) - dttz1 * rhos(k+1)
- lhs(i,j,k,5,c) = 0.0d0
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c add fourth order dissipation
-c---------------------------------------------------------------------
- if (start(3,c) .gt. 0) then
- k = 1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5
- lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
- lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
-
- lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4
- lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz6
- lhs(i,j,k+1,4,c) = lhs(i,j,k+1,4,c) - comz4
- lhs(i,j,k+1,5,c) = lhs(i,j,k+1,5,c) + comz1
- end do
- end do
- endif
-
- do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
- lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
- lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
- lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
- lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1
- end do
- end do
- end do
-
- if (end(3,c) .gt. 0) then
- k = cell_size(3,c)-3
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1
- lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4
- lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6
- lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4
-
- lhs(i,j,k+1,1,c) = lhs(i,j,k+1,1,c) + comz1
- lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4
- lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz5
- end do
- end do
- endif
-
-
-c---------------------------------------------------------------------
-c subsequently, fill the other factors (u+c), (u-c)
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c)
- lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) -
- > dttz2 * speed(i,j,k-1,c)
- lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c)
- lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) +
- > dttz2 * speed(i,j,k+1,c)
- lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c)
- lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c)
- lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) +
- > dttz2 * speed(i,j,k-1,c)
- lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c)
- lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) -
- > dttz2 * speed(i,j,k+1,c)
- lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c)
- end do
- end do
- end do
-
- return
- end
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine make_set
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c This function allocates space for a set of cells and fills the set
-c such that communication between cells on different nodes is only
-c nearest neighbor
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer p, i, j, c, dir, size, excess, ierr,ierrcode
-
-c---------------------------------------------------------------------
-c compute square root; add small number to allow for roundoff
-c (note: this is computed in setup_mpi.f also, but prefer to do
-c it twice because of some include file problems).
-c---------------------------------------------------------------------
- ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0))
-
-c---------------------------------------------------------------------
-c this makes coding easier
-c---------------------------------------------------------------------
- p = ncells
-
-c---------------------------------------------------------------------
-c determine the location of the cell at the bottom of the 3D
-c array of cells
-c---------------------------------------------------------------------
- cell_coord(1,1) = mod(node,p)
- cell_coord(2,1) = node/p
- cell_coord(3,1) = 0
-
-c---------------------------------------------------------------------
-c set the cell_coords for cells in the rest of the z-layers;
-c this comes down to a simple linear numbering in the z-direct-
-c ion, and to the doubly-cyclic numbering in the other dirs
-c---------------------------------------------------------------------
- do c=2, p
- cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p)
- cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p)
- cell_coord(3,c) = c-1
- end do
-
-c---------------------------------------------------------------------
-c offset all the coordinates by 1 to adjust for Fortran arrays
-c---------------------------------------------------------------------
- do dir = 1, 3
- do c = 1, p
- cell_coord(dir,c) = cell_coord(dir,c) + 1
- end do
- end do
-
-c---------------------------------------------------------------------
-c slice(dir,n) contains the sequence number of the cell that is in
-c coordinate plane n in the dir direction
-c---------------------------------------------------------------------
- do dir = 1, 3
- do c = 1, p
- slice(dir,cell_coord(dir,c)) = c
- end do
- end do
-
-
-c---------------------------------------------------------------------
-c fill the predecessor and successor entries, using the indices
-c of the bottom cells (they are the same at each level of k
-c anyway) acting as if full periodicity pertains; note that p is
-c added to those arguments to the mod functions that might
-c otherwise return wrong values when using the modulo function
-c---------------------------------------------------------------------
- i = cell_coord(1,1)-1
- j = cell_coord(2,1)-1
-
- predecessor(1) = mod(i-1+p,p) + p*j
- predecessor(2) = i + p*mod(j-1+p,p)
- predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p)
- successor(1) = mod(i+1,p) + p*j
- successor(2) = i + p*mod(j+1,p)
- successor(3) = mod(i-1+p,p) + p*mod(j+1,p)
-
-c---------------------------------------------------------------------
-c now compute the sizes of the cells
-c---------------------------------------------------------------------
- do dir= 1, 3
-c---------------------------------------------------------------------
-c set cell_coord range for each direction
-c---------------------------------------------------------------------
- size = grid_points(dir)/p
- excess = mod(grid_points(dir),p)
- do c=1, ncells
- if (cell_coord(dir,c) .le. excess) then
- cell_size(dir,c) = size+1
- cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1)
- cell_high(dir,c) = cell_low(dir,c)+size
- else
- cell_size(dir,c) = size
- cell_low(dir,c) = excess*(size+1)+
- > (cell_coord(dir,c)-excess-1)*size
- cell_high(dir,c) = cell_low(dir,c)+size-1
- endif
- if (cell_size(dir, c) .le. 2) then
- write(*,50)
- 50 format(' Error: Cell size too small. Min size is 3')
- call MPI_Abort(mpi_comm_world,ierrcode,ierr)
- stop
- endif
- end do
- end do
-
- return
- end
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'mpif.h'
-
- integer node, no_nodes, total_nodes, root, comm_setup,
- > comm_solve, comm_rhs, dp_type
- logical active
- common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup,
- > comm_solve, comm_rhs, dp_type, active
- integer DEFAULT_TAG
- parameter (DEFAULT_TAG = 0)
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine ninvr(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c block-diagonal matrix-vector multiplication
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer c, i, j, k
- double precision r1, r2, r3, r4, r5, t1, t2
-
-c---------------------------------------------------------------------
-c treat only one cell
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
-
- r1 = rhs(i,j,k,1,c)
- r2 = rhs(i,j,k,2,c)
- r3 = rhs(i,j,k,3,c)
- r4 = rhs(i,j,k,4,c)
- r5 = rhs(i,j,k,5,c)
-
- t1 = bt * r3
- t2 = 0.5d0 * ( r4 + r5 )
-
- rhs(i,j,k,1,c) = -r2
- rhs(i,j,k,2,c) = r1
- rhs(i,j,k,3,c) = bt * ( r4 - r5 )
- rhs(i,j,k,4,c) = -t1 + t2
- rhs(i,j,k,5,c) = t1 + t2
- enddo
- enddo
- enddo
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine pinvr(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c block-diagonal matrix-vector multiplication
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer i, j, k, c
- double precision r1, r2, r3, r4, r5, t1, t2
-
-c---------------------------------------------------------------------
-c treat only one cell
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
-
- r1 = rhs(i,j,k,1,c)
- r2 = rhs(i,j,k,2,c)
- r3 = rhs(i,j,k,3,c)
- r4 = rhs(i,j,k,4,c)
- r5 = rhs(i,j,k,5,c)
-
- t1 = bt * r1
- t2 = 0.5d0 * ( r4 + r5 )
-
- rhs(i,j,k,1,c) = bt * ( r4 - r5 )
- rhs(i,j,k,2,c) = -r3
- rhs(i,j,k,3,c) = r2
- rhs(i,j,k,4,c) = -t1 + t2
- rhs(i,j,k,5,c) = t1 + t2
- end do
- end do
- end do
-
- return
- end
-
-
-
+++ /dev/null
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine compute_rhs
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer c, i, j, k, m
- double precision aux, rho_inv, uijk, up1, um1, vijk, vp1, vm1,
- > wijk, wp1, wm1
-
-
-c---------------------------------------------------------------------
-c loop over all cells owned by this node
-c---------------------------------------------------------------------
- do c = 1, ncells
-
-c---------------------------------------------------------------------
-c compute the reciprocal of density, and the kinetic energy,
-c and the speed of sound.
-c---------------------------------------------------------------------
-
- do k = -1, cell_size(3,c)
- do j = -1, cell_size(2,c)
- do i = -1, cell_size(1,c)
- rho_inv = 1.0d0/u(i,j,k,1,c)
- rho_i(i,j,k,c) = rho_inv
- us(i,j,k,c) = u(i,j,k,2,c) * rho_inv
- vs(i,j,k,c) = u(i,j,k,3,c) * rho_inv
- ws(i,j,k,c) = u(i,j,k,4,c) * rho_inv
- square(i,j,k,c) = 0.5d0* (
- > u(i,j,k,2,c)*u(i,j,k,2,c) +
- > u(i,j,k,3,c)*u(i,j,k,3,c) +
- > u(i,j,k,4,c)*u(i,j,k,4,c) ) * rho_inv
- qs(i,j,k,c) = square(i,j,k,c) * rho_inv
-c---------------------------------------------------------------------
-c (don't need speed and ainx until the lhs computation)
-c---------------------------------------------------------------------
- aux = c1c2*rho_inv* (u(i,j,k,5,c) - square(i,j,k,c))
- aux = dsqrt(aux)
- speed(i,j,k,c) = aux
- ainv(i,j,k,c) = 1.0d0/aux
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c copy the exact forcing term to the right hand side; because
-c this forcing term is known, we can store it on the whole of every
-c cell, including the boundary
-c---------------------------------------------------------------------
-
- do m = 1, 5
- do k = 0, cell_size(3,c)-1
- do j = 0, cell_size(2,c)-1
- do i = 0, cell_size(1,c)-1
- rhs(i,j,k,m,c) = forcing(i,j,k,m,c)
- end do
- end do
- end do
- end do
-
-
-c---------------------------------------------------------------------
-c compute xi-direction fluxes
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- uijk = us(i,j,k,c)
- up1 = us(i+1,j,k,c)
- um1 = us(i-1,j,k,c)
-
- rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dx1tx1 *
- > (u(i+1,j,k,1,c) - 2.0d0*u(i,j,k,1,c) +
- > u(i-1,j,k,1,c)) -
- > tx2 * (u(i+1,j,k,2,c) - u(i-1,j,k,2,c))
-
- rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dx2tx1 *
- > (u(i+1,j,k,2,c) - 2.0d0*u(i,j,k,2,c) +
- > u(i-1,j,k,2,c)) +
- > xxcon2*con43 * (up1 - 2.0d0*uijk + um1) -
- > tx2 * (u(i+1,j,k,2,c)*up1 -
- > u(i-1,j,k,2,c)*um1 +
- > (u(i+1,j,k,5,c)- square(i+1,j,k,c)-
- > u(i-1,j,k,5,c)+ square(i-1,j,k,c))*
- > c2)
-
- rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dx3tx1 *
- > (u(i+1,j,k,3,c) - 2.0d0*u(i,j,k,3,c) +
- > u(i-1,j,k,3,c)) +
- > xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) +
- > vs(i-1,j,k,c)) -
- > tx2 * (u(i+1,j,k,3,c)*up1 -
- > u(i-1,j,k,3,c)*um1)
-
- rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dx4tx1 *
- > (u(i+1,j,k,4,c) - 2.0d0*u(i,j,k,4,c) +
- > u(i-1,j,k,4,c)) +
- > xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) +
- > ws(i-1,j,k,c)) -
- > tx2 * (u(i+1,j,k,4,c)*up1 -
- > u(i-1,j,k,4,c)*um1)
-
- rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dx5tx1 *
- > (u(i+1,j,k,5,c) - 2.0d0*u(i,j,k,5,c) +
- > u(i-1,j,k,5,c)) +
- > xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) +
- > qs(i-1,j,k,c)) +
- > xxcon4 * (up1*up1 - 2.0d0*uijk*uijk +
- > um1*um1) +
- > xxcon5 * (u(i+1,j,k,5,c)*rho_i(i+1,j,k,c) -
- > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
- > u(i-1,j,k,5,c)*rho_i(i-1,j,k,c)) -
- > tx2 * ( (c1*u(i+1,j,k,5,c) -
- > c2*square(i+1,j,k,c))*up1 -
- > (c1*u(i-1,j,k,5,c) -
- > c2*square(i-1,j,k,c))*um1 )
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c add fourth order xi-direction dissipation
-c---------------------------------------------------------------------
- if (start(1,c) .gt. 0) then
- i = 1
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp *
- > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) +
- > u(i+2,j,k,m,c))
- end do
- end do
- end do
-
- i = 2
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > (-4.0d0*u(i-1,j,k,m,c) + 6.0d0*u(i,j,k,m,c) -
- > 4.0d0*u(i+1,j,k,m,c) + u(i+2,j,k,m,c))
- end do
- end do
- end do
- endif
-
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > ( u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) +
- > 6.0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) +
- > u(i+2,j,k,m,c) )
- end do
- end do
- end do
- end do
-
-
- if (end(1,c) .gt. 0) then
- i = cell_size(1,c)-3
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > ( u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) +
- > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) )
- end do
- end do
- end do
-
- i = cell_size(1,c)-2
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > ( u(i-2,j,k,m,c) - 4.d0*u(i-1,j,k,m,c) +
- > 5.d0*u(i,j,k,m,c) )
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c compute eta-direction fluxes
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- vijk = vs(i,j,k,c)
- vp1 = vs(i,j+1,k,c)
- vm1 = vs(i,j-1,k,c)
- rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dy1ty1 *
- > (u(i,j+1,k,1,c) - 2.0d0*u(i,j,k,1,c) +
- > u(i,j-1,k,1,c)) -
- > ty2 * (u(i,j+1,k,3,c) - u(i,j-1,k,3,c))
- rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dy2ty1 *
- > (u(i,j+1,k,2,c) - 2.0d0*u(i,j,k,2,c) +
- > u(i,j-1,k,2,c)) +
- > yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) +
- > us(i,j-1,k,c)) -
- > ty2 * (u(i,j+1,k,2,c)*vp1 -
- > u(i,j-1,k,2,c)*vm1)
- rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dy3ty1 *
- > (u(i,j+1,k,3,c) - 2.0d0*u(i,j,k,3,c) +
- > u(i,j-1,k,3,c)) +
- > yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) -
- > ty2 * (u(i,j+1,k,3,c)*vp1 -
- > u(i,j-1,k,3,c)*vm1 +
- > (u(i,j+1,k,5,c) - square(i,j+1,k,c) -
- > u(i,j-1,k,5,c) + square(i,j-1,k,c))
- > *c2)
- rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dy4ty1 *
- > (u(i,j+1,k,4,c) - 2.0d0*u(i,j,k,4,c) +
- > u(i,j-1,k,4,c)) +
- > yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) +
- > ws(i,j-1,k,c)) -
- > ty2 * (u(i,j+1,k,4,c)*vp1 -
- > u(i,j-1,k,4,c)*vm1)
- rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dy5ty1 *
- > (u(i,j+1,k,5,c) - 2.0d0*u(i,j,k,5,c) +
- > u(i,j-1,k,5,c)) +
- > yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) +
- > qs(i,j-1,k,c)) +
- > yycon4 * (vp1*vp1 - 2.0d0*vijk*vijk +
- > vm1*vm1) +
- > yycon5 * (u(i,j+1,k,5,c)*rho_i(i,j+1,k,c) -
- > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
- > u(i,j-1,k,5,c)*rho_i(i,j-1,k,c)) -
- > ty2 * ((c1*u(i,j+1,k,5,c) -
- > c2*square(i,j+1,k,c)) * vp1 -
- > (c1*u(i,j-1,k,5,c) -
- > c2*square(i,j-1,k,c)) * vm1)
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c add fourth order eta-direction dissipation
-c---------------------------------------------------------------------
- if (start(2,c) .gt. 0) then
- j = 1
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp *
- > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) +
- > u(i,j+2,k,m,c))
- end do
- end do
- end do
-
- j = 2
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > (-4.0d0*u(i,j-1,k,m,c) + 6.0d0*u(i,j,k,m,c) -
- > 4.0d0*u(i,j+1,k,m,c) + u(i,j+2,k,m,c))
- end do
- end do
- end do
- endif
-
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1
- do i = start(1,c),cell_size(1,c)-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > ( u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) +
- > 6.0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) +
- > u(i,j+2,k,m,c) )
- end do
- end do
- end do
- end do
-
- if (end(2,c) .gt. 0) then
- j = cell_size(2,c)-3
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > ( u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) +
- > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) )
- end do
- end do
- end do
-
- j = cell_size(2,c)-2
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > ( u(i,j-2,k,m,c) - 4.d0*u(i,j-1,k,m,c) +
- > 5.d0*u(i,j,k,m,c) )
- end do
- end do
- end do
- endif
-
-
-c---------------------------------------------------------------------
-c compute zeta-direction fluxes
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- wijk = ws(i,j,k,c)
- wp1 = ws(i,j,k+1,c)
- wm1 = ws(i,j,k-1,c)
-
- rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dz1tz1 *
- > (u(i,j,k+1,1,c) - 2.0d0*u(i,j,k,1,c) +
- > u(i,j,k-1,1,c)) -
- > tz2 * (u(i,j,k+1,4,c) - u(i,j,k-1,4,c))
- rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dz2tz1 *
- > (u(i,j,k+1,2,c) - 2.0d0*u(i,j,k,2,c) +
- > u(i,j,k-1,2,c)) +
- > zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) +
- > us(i,j,k-1,c)) -
- > tz2 * (u(i,j,k+1,2,c)*wp1 -
- > u(i,j,k-1,2,c)*wm1)
- rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dz3tz1 *
- > (u(i,j,k+1,3,c) - 2.0d0*u(i,j,k,3,c) +
- > u(i,j,k-1,3,c)) +
- > zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) +
- > vs(i,j,k-1,c)) -
- > tz2 * (u(i,j,k+1,3,c)*wp1 -
- > u(i,j,k-1,3,c)*wm1)
- rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dz4tz1 *
- > (u(i,j,k+1,4,c) - 2.0d0*u(i,j,k,4,c) +
- > u(i,j,k-1,4,c)) +
- > zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) -
- > tz2 * (u(i,j,k+1,4,c)*wp1 -
- > u(i,j,k-1,4,c)*wm1 +
- > (u(i,j,k+1,5,c) - square(i,j,k+1,c) -
- > u(i,j,k-1,5,c) + square(i,j,k-1,c))
- > *c2)
- rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dz5tz1 *
- > (u(i,j,k+1,5,c) - 2.0d0*u(i,j,k,5,c) +
- > u(i,j,k-1,5,c)) +
- > zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) +
- > qs(i,j,k-1,c)) +
- > zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk +
- > wm1*wm1) +
- > zzcon5 * (u(i,j,k+1,5,c)*rho_i(i,j,k+1,c) -
- > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) +
- > u(i,j,k-1,5,c)*rho_i(i,j,k-1,c)) -
- > tz2 * ( (c1*u(i,j,k+1,5,c) -
- > c2*square(i,j,k+1,c))*wp1 -
- > (c1*u(i,j,k-1,5,c) -
- > c2*square(i,j,k-1,c))*wm1)
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c add fourth order zeta-direction dissipation
-c---------------------------------------------------------------------
- if (start(3,c) .gt. 0) then
- k = 1
- do m = 1, 5
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp *
- > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) +
- > u(i,j,k+2,m,c))
- end do
- end do
- end do
-
- k = 2
- do m = 1, 5
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > (-4.0d0*u(i,j,k-1,m,c) + 6.0d0*u(i,j,k,m,c) -
- > 4.0d0*u(i,j,k+1,m,c) + u(i,j,k+2,m,c))
- end do
- end do
- end do
- endif
-
- do m = 1, 5
- do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c),cell_size(1,c)-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > ( u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) +
- > 6.0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) +
- > u(i,j,k+2,m,c) )
- end do
- end do
- end do
- end do
-
- if (end(3,c) .gt. 0) then
- k = cell_size(3,c)-3
- do m = 1, 5
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > ( u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) +
- > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) )
- end do
- end do
- end do
-
- k = cell_size(3,c)-2
- do m = 1, 5
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp *
- > ( u(i,j,k-2,m,c) - 4.d0*u(i,j,k-1,m,c) +
- > 5.d0*u(i,j,k,m,c) )
- end do
- end do
- end do
- endif
-
- do m = 1, 5
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) * dt
- end do
- end do
- end do
- end do
-
- end do
-
- return
- end
-
-
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine set_constants
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- ce(1,1) = 2.0d0
- ce(1,2) = 0.0d0
- ce(1,3) = 0.0d0
- ce(1,4) = 4.0d0
- ce(1,5) = 5.0d0
- ce(1,6) = 3.0d0
- ce(1,7) = 0.5d0
- ce(1,8) = 0.02d0
- ce(1,9) = 0.01d0
- ce(1,10) = 0.03d0
- ce(1,11) = 0.5d0
- ce(1,12) = 0.4d0
- ce(1,13) = 0.3d0
-
- ce(2,1) = 1.0d0
- ce(2,2) = 0.0d0
- ce(2,3) = 0.0d0
- ce(2,4) = 0.0d0
- ce(2,5) = 1.0d0
- ce(2,6) = 2.0d0
- ce(2,7) = 3.0d0
- ce(2,8) = 0.01d0
- ce(2,9) = 0.03d0
- ce(2,10) = 0.02d0
- ce(2,11) = 0.4d0
- ce(2,12) = 0.3d0
- ce(2,13) = 0.5d0
-
- ce(3,1) = 2.0d0
- ce(3,2) = 2.0d0
- ce(3,3) = 0.0d0
- ce(3,4) = 0.0d0
- ce(3,5) = 0.0d0
- ce(3,6) = 2.0d0
- ce(3,7) = 3.0d0
- ce(3,8) = 0.04d0
- ce(3,9) = 0.03d0
- ce(3,10) = 0.05d0
- ce(3,11) = 0.3d0
- ce(3,12) = 0.5d0
- ce(3,13) = 0.4d0
-
- ce(4,1) = 2.0d0
- ce(4,2) = 2.0d0
- ce(4,3) = 0.0d0
- ce(4,4) = 0.0d0
- ce(4,5) = 0.0d0
- ce(4,6) = 2.0d0
- ce(4,7) = 3.0d0
- ce(4,8) = 0.03d0
- ce(4,9) = 0.05d0
- ce(4,10) = 0.04d0
- ce(4,11) = 0.2d0
- ce(4,12) = 0.1d0
- ce(4,13) = 0.3d0
-
- ce(5,1) = 5.0d0
- ce(5,2) = 4.0d0
- ce(5,3) = 3.0d0
- ce(5,4) = 2.0d0
- ce(5,5) = 0.1d0
- ce(5,6) = 0.4d0
- ce(5,7) = 0.3d0
- ce(5,8) = 0.05d0
- ce(5,9) = 0.04d0
- ce(5,10) = 0.03d0
- ce(5,11) = 0.1d0
- ce(5,12) = 0.3d0
- ce(5,13) = 0.2d0
-
- c1 = 1.4d0
- c2 = 0.4d0
- c3 = 0.1d0
- c4 = 1.0d0
- c5 = 1.4d0
-
- bt = dsqrt(0.5d0)
-
- dnxm1 = 1.0d0 / dble(grid_points(1)-1)
- dnym1 = 1.0d0 / dble(grid_points(2)-1)
- dnzm1 = 1.0d0 / dble(grid_points(3)-1)
-
- c1c2 = c1 * c2
- c1c5 = c1 * c5
- c3c4 = c3 * c4
- c1345 = c1c5 * c3c4
-
- conz1 = (1.0d0-c1c5)
-
- tx1 = 1.0d0 / (dnxm1 * dnxm1)
- tx2 = 1.0d0 / (2.0d0 * dnxm1)
- tx3 = 1.0d0 / dnxm1
-
- ty1 = 1.0d0 / (dnym1 * dnym1)
- ty2 = 1.0d0 / (2.0d0 * dnym1)
- ty3 = 1.0d0 / dnym1
-
- tz1 = 1.0d0 / (dnzm1 * dnzm1)
- tz2 = 1.0d0 / (2.0d0 * dnzm1)
- tz3 = 1.0d0 / dnzm1
-
- dx1 = 0.75d0
- dx2 = 0.75d0
- dx3 = 0.75d0
- dx4 = 0.75d0
- dx5 = 0.75d0
-
- dy1 = 0.75d0
- dy2 = 0.75d0
- dy3 = 0.75d0
- dy4 = 0.75d0
- dy5 = 0.75d0
-
- dz1 = 1.0d0
- dz2 = 1.0d0
- dz3 = 1.0d0
- dz4 = 1.0d0
- dz5 = 1.0d0
-
- dxmax = dmax1(dx3, dx4)
- dymax = dmax1(dy2, dy4)
- dzmax = dmax1(dz2, dz3)
-
- dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) )
-
- c4dssp = 4.0d0 * dssp
- c5dssp = 5.0d0 * dssp
-
- dttx1 = dt*tx1
- dttx2 = dt*tx2
- dtty1 = dt*ty1
- dtty2 = dt*ty2
- dttz1 = dt*tz1
- dttz2 = dt*tz2
-
- c2dttx1 = 2.0d0*dttx1
- c2dtty1 = 2.0d0*dtty1
- c2dttz1 = 2.0d0*dttz1
-
- dtdssp = dt*dssp
-
- comz1 = dtdssp
- comz4 = 4.0d0*dtdssp
- comz5 = 5.0d0*dtdssp
- comz6 = 6.0d0*dtdssp
-
- c3c4tx3 = c3c4*tx3
- c3c4ty3 = c3c4*ty3
- c3c4tz3 = c3c4*tz3
-
- dx1tx1 = dx1*tx1
- dx2tx1 = dx2*tx1
- dx3tx1 = dx3*tx1
- dx4tx1 = dx4*tx1
- dx5tx1 = dx5*tx1
-
- dy1ty1 = dy1*ty1
- dy2ty1 = dy2*ty1
- dy3ty1 = dy3*ty1
- dy4ty1 = dy4*ty1
- dy5ty1 = dy5*ty1
-
- dz1tz1 = dz1*tz1
- dz2tz1 = dz2*tz1
- dz3tz1 = dz3*tz1
- dz4tz1 = dz4*tz1
- dz5tz1 = dz5*tz1
-
- c2iv = 2.5d0
- con43 = 4.0d0/3.0d0
- con16 = 1.0d0/6.0d0
-
- xxcon1 = c3c4tx3*con43*tx3
- xxcon2 = c3c4tx3*tx3
- xxcon3 = c3c4tx3*conz1*tx3
- xxcon4 = c3c4tx3*con16*tx3
- xxcon5 = c3c4tx3*c1c5*tx3
-
- yycon1 = c3c4ty3*con43*ty3
- yycon2 = c3c4ty3*ty3
- yycon3 = c3c4ty3*conz1*ty3
- yycon4 = c3c4ty3*con16*ty3
- yycon5 = c3c4ty3*c1c5*ty3
-
- zzcon1 = c3c4tz3*con43*tz3
- zzcon2 = c3c4tz3*tz3
- zzcon3 = c3c4tz3*conz1*tz3
- zzcon4 = c3c4tz3*con16*tz3
- zzcon5 = c3c4tz3*c1c5*tz3
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine setup_mpi
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c set up MPI stuff
-c---------------------------------------------------------------------
-
- implicit none
- include 'mpinpb.h'
- include 'npbparams.h'
- integer error, nc, color
-
- call mpi_init(error)
-
- call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error)
- call mpi_comm_rank(MPI_COMM_WORLD, node, error)
-
- if (.not. convertdouble) then
- dp_type = MPI_DOUBLE_PRECISION
- else
- dp_type = MPI_REAL
- endif
-
-c---------------------------------------------------------------------
-c compute square root; add small number to allow for roundoff
-c---------------------------------------------------------------------
- nc = dint(dsqrt(dble(total_nodes) + 0.00001d0))
-
-c---------------------------------------------------------------------
-c We handle a non-square number of nodes by making the excess nodes
-c inactive. However, we can never handle more cells than were compiled
-c in.
-c---------------------------------------------------------------------
-
- if (nc .gt. maxcells) nc = maxcells
-
- if (node .ge. nc*nc) then
- active = .false.
- color = 1
- else
- active = .true.
- color = 0
- end if
-
- call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error)
- if (.not. active) return
-
- call mpi_comm_size(comm_setup, no_nodes, error)
- call mpi_comm_dup(comm_setup, comm_solve, error)
- call mpi_comm_dup(comm_setup, comm_rhs, error)
-
-c---------------------------------------------------------------------
-c let node 0 be the root for the group (there is only one)
-c---------------------------------------------------------------------
- root = 0
-
- return
- end
-
+++ /dev/null
-!-------------------------------------------------------------------------!
-! !
-! N A S P A R A L L E L B E N C H M A R K S 3.3 !
-! !
-! S P !
-! !
-!-------------------------------------------------------------------------!
-! !
-! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. !
-! It is described in NAS Technical Reports 95-020 and 02-007 !
-! !
-! Permission to use, copy, distribute and modify this software !
-! for any purpose with or without fee is hereby granted. We !
-! request, however, that all derived work reference the NAS !
-! Parallel Benchmarks 3.3. This software is provided "as is" !
-! without express or implied warranty. !
-! !
-! Information on NPB 3.3, including the technical report, the !
-! original specifications, source code, results and information !
-! on how to submit new results, is available at: !
-! !
-! http://www.nas.nasa.gov/Software/NPB/ !
-! !
-! Send comments or suggestions to npb@nas.nasa.gov !
-! !
-! NAS Parallel Benchmarks Group !
-! NASA Ames Research Center !
-! Mail Stop: T27A-1 !
-! Moffett Field, CA 94035-1000 !
-! !
-! E-mail: npb@nas.nasa.gov !
-! Fax: (650) 604-3957 !
-! !
-!-------------------------------------------------------------------------!
-
-
-c---------------------------------------------------------------------
-c
-c Authors: R. F. Van der Wijngaart
-c W. Saphir
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
- program MPSP
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i, niter, step, c, error, fstatus
- external timer_read
- double precision mflops, t, tmax, timer_read
- logical verified
- character class
-
- call setup_mpi
- if (.not. active) goto 999
-
-c---------------------------------------------------------------------
-c Root node reads input file (if it exists) else takes
-c defaults from parameters
-c---------------------------------------------------------------------
- if (node .eq. root) then
-
- write(*, 1000)
- open (unit=2,file='inputsp.data',status='old', iostat=fstatus)
-c
- if (fstatus .eq. 0) then
- write(*,233)
- 233 format(' Reading from input file inputsp.data')
- read (2,*) niter
- read (2,*) dt
- read (2,*) grid_points(1), grid_points(2), grid_points(3)
- close(2)
- else
- write(*,234)
- niter = niter_default
- dt = dt_default
- grid_points(1) = problem_size
- grid_points(2) = problem_size
- grid_points(3) = problem_size
- endif
- 234 format(' No input file inputsp.data. Using compiled defaults')
-
- write(*, 1001) grid_points(1), grid_points(2), grid_points(3)
- write(*, 1002) niter, dt
- if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes
- if (no_nodes .ne. maxcells*maxcells)
- > write(*, 1005) maxcells*maxcells
- write(*, 1003) no_nodes
-
- 1000 format(//,' NAS Parallel Benchmarks 3.3 -- SP Benchmark',/)
- 1001 format(' Size: ', i4, 'x', i4, 'x', i4)
- 1002 format(' Iterations: ', i4, ' dt: ', F11.7)
- 1004 format(' Total number of processes: ', i5)
- 1005 format(' WARNING: compiled for ', i5, ' processes ')
- 1003 format(' Number of active processes: ', i5, /)
-
- endif
-
- call mpi_bcast(niter, 1, MPI_INTEGER,
- > root, comm_setup, error)
-
- call mpi_bcast(dt, 1, dp_type,
- > root, comm_setup, error)
-
- call mpi_bcast(grid_points(1), 3, MPI_INTEGER,
- > root, comm_setup, error)
-
-
- call make_set
-
- do c = 1, ncells
- if ( (cell_size(1,c) .gt. IMAX) .or.
- > (cell_size(2,c) .gt. JMAX) .or.
- > (cell_size(3,c) .gt. KMAX) ) then
- print *,node, c, (cell_size(i,c),i=1,3)
- print *,' Problem size too big for compiled array sizes'
- goto 999
- endif
- end do
-
- call set_constants
-
- call initialize
-
-c call mpi_finalize(error)
-c stop
-
- call lhsinit
-
- call exact_rhs
-
- call compute_buffer_size(5)
-
-c---------------------------------------------------------------------
-c do one time step to touch all code, and reinitialize
-c---------------------------------------------------------------------
- call adi
- call initialize
-
-c---------------------------------------------------------------------
-c Synchronize before placing time stamp
-c---------------------------------------------------------------------
- call mpi_barrier(comm_setup, error)
-
- call timer_clear(1)
- call timer_start(1)
-
- do step = 1, niter
-
- if (node .eq. root) then
- if (mod(step, 20) .eq. 0 .or.
- > step .eq. 1) then
- write(*, 200) step
- 200 format(' Time step ', i4)
- endif
- endif
-
- call adi
-
- end do
-
- call timer_stop(1)
- t = timer_read(1)
-
- call verify(niter, class, verified)
-
- call mpi_reduce(t, tmax, 1,
- > dp_type, MPI_MAX,
- > root, comm_setup, error)
-
- if( node .eq. root ) then
- if( tmax .ne. 0. ) then
- mflops = (881.174*float( problem_size )**3
- > -4683.91*float( problem_size )**2
- > +11484.5*float( problem_size )
- > -19272.4) * float( niter ) / (tmax*1000000.0d0)
- else
- mflops = 0.0
- endif
-
- call print_results('SP', class, grid_points(1),
- > grid_points(2), grid_points(3), niter, maxcells*maxcells,
- > total_nodes, tmax, mflops, ' floating point',
- > verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5,
- > cs6, '(none)')
- endif
-
- 999 continue
- call mpi_barrier(MPI_COMM_WORLD, error)
- call mpi_finalize(error)
-
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine txinvr
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c block-diagonal matrix-vector multiplication
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer c, i, j, k
- double precision t1, t2, t3, ac, ru1, uu, vv, ww, r1, r2, r3,
- > r4, r5, ac2inv
-
-c---------------------------------------------------------------------
-c loop over all cells owned by this node
-c---------------------------------------------------------------------
- do c = 1, ncells
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
-
- ru1 = rho_i(i,j,k,c)
- uu = us(i,j,k,c)
- vv = vs(i,j,k,c)
- ww = ws(i,j,k,c)
- ac = speed(i,j,k,c)
- ac2inv = ainv(i,j,k,c)*ainv(i,j,k,c)
-
- r1 = rhs(i,j,k,1,c)
- r2 = rhs(i,j,k,2,c)
- r3 = rhs(i,j,k,3,c)
- r4 = rhs(i,j,k,4,c)
- r5 = rhs(i,j,k,5,c)
-
- t1 = c2 * ac2inv * ( qs(i,j,k,c)*r1 - uu*r2 -
- > vv*r3 - ww*r4 + r5 )
- t2 = bt * ru1 * ( uu * r1 - r2 )
- t3 = ( bt * ru1 * ac ) * t1
-
- rhs(i,j,k,1,c) = r1 - t1
- rhs(i,j,k,2,c) = - ru1 * ( ww*r1 - r4 )
- rhs(i,j,k,3,c) = ru1 * ( vv*r1 - r3 )
- rhs(i,j,k,4,c) = - t2 + t3
- rhs(i,j,k,5,c) = t2 + t3
- end do
- end do
- end do
- end do
-
- return
- end
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine tzetar(c)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c block-diagonal matrix-vector multiplication
-c---------------------------------------------------------------------
-
- include 'header.h'
-
- integer i, j, k, c
- double precision t1, t2, t3, ac, xvel, yvel, zvel, r1, r2, r3,
- > r4, r5, btuz, acinv, ac2u, uzik1
-
-c---------------------------------------------------------------------
-c treat only one cell
-c---------------------------------------------------------------------
- do k = start(3,c), cell_size(3,c)-end(3,c)-1
- do j = start(2,c), cell_size(2,c)-end(2,c)-1
- do i = start(1,c), cell_size(1,c)-end(1,c)-1
-
- xvel = us(i,j,k,c)
- yvel = vs(i,j,k,c)
- zvel = ws(i,j,k,c)
- ac = speed(i,j,k,c)
- acinv = ainv(i,j,k,c)
-
- ac2u = ac*ac
-
- r1 = rhs(i,j,k,1,c)
- r2 = rhs(i,j,k,2,c)
- r3 = rhs(i,j,k,3,c)
- r4 = rhs(i,j,k,4,c)
- r5 = rhs(i,j,k,5,c)
-
- uzik1 = u(i,j,k,1,c)
- btuz = bt * uzik1
-
- t1 = btuz*acinv * (r4 + r5)
- t2 = r3 + t1
- t3 = btuz * (r4 - r5)
-
- rhs(i,j,k,1,c) = t2
- rhs(i,j,k,2,c) = -uzik1*r2 + xvel*t2
- rhs(i,j,k,3,c) = uzik1*r1 + yvel*t2
- rhs(i,j,k,4,c) = zvel*t2 + t3
- rhs(i,j,k,5,c) = uzik1*(-xvel*r2 + yvel*r1) +
- > qs(i,j,k,c)*t2 + c2iv*ac2u*t1 + zvel*t3
-
- end do
- end do
- end do
-
- return
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine verify(no_time_steps, class, verified)
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c verification routine
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5),
- > epsilon, xce(5), xcr(5), dtref
- integer m, no_time_steps
- character class
- logical verified
-
-c---------------------------------------------------------------------
-c tolerance level
-c---------------------------------------------------------------------
- epsilon = 1.0d-08
-
-
-c---------------------------------------------------------------------
-c compute the error norm and the residual norm, and exit if not printing
-c---------------------------------------------------------------------
- call error_norm(xce)
- call copy_faces
-
- call rhs_norm(xcr)
-
- do m = 1, 5
- xcr(m) = xcr(m) / dt
- enddo
-
- if (node .ne. 0) return
-
- class = 'U'
- verified = .true.
-
- do m = 1,5
- xcrref(m) = 1.0
- xceref(m) = 1.0
- end do
-
-c---------------------------------------------------------------------
-c reference data for 12X12X12 grids after 100 time steps, with DT = 1.50d-02
-c---------------------------------------------------------------------
- if ( (grid_points(1) .eq. 12 ) .and.
- > (grid_points(2) .eq. 12 ) .and.
- > (grid_points(3) .eq. 12 ) .and.
- > (no_time_steps .eq. 100 )) then
-
- class = 'S'
- dtref = 1.5d-2
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 2.7470315451339479d-02
- xcrref(2) = 1.0360746705285417d-02
- xcrref(3) = 1.6235745065095532d-02
- xcrref(4) = 1.5840557224455615d-02
- xcrref(5) = 3.4849040609362460d-02
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
- xceref(1) = 2.7289258557377227d-05
- xceref(2) = 1.0364446640837285d-05
- xceref(3) = 1.6154798287166471d-05
- xceref(4) = 1.5750704994480102d-05
- xceref(5) = 3.4177666183390531d-05
-
-
-c---------------------------------------------------------------------
-c reference data for 36X36X36 grids after 400 time steps, with DT = 1.5d-03
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 36) .and.
- > (grid_points(2) .eq. 36) .and.
- > (grid_points(3) .eq. 36) .and.
- > (no_time_steps . eq. 400) ) then
-
- class = 'W'
- dtref = 1.5d-3
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 0.1893253733584d-02
- xcrref(2) = 0.1717075447775d-03
- xcrref(3) = 0.2778153350936d-03
- xcrref(4) = 0.2887475409984d-03
- xcrref(5) = 0.3143611161242d-02
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
- xceref(1) = 0.7542088599534d-04
- xceref(2) = 0.6512852253086d-05
- xceref(3) = 0.1049092285688d-04
- xceref(4) = 0.1128838671535d-04
- xceref(5) = 0.1212845639773d-03
-
-c---------------------------------------------------------------------
-c reference data for 64X64X64 grids after 400 time steps, with DT = 1.5d-03
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 64) .and.
- > (grid_points(2) .eq. 64) .and.
- > (grid_points(3) .eq. 64) .and.
- > (no_time_steps . eq. 400) ) then
-
- class = 'A'
- dtref = 1.5d-3
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 2.4799822399300195d0
- xcrref(2) = 1.1276337964368832d0
- xcrref(3) = 1.5028977888770491d0
- xcrref(4) = 1.4217816211695179d0
- xcrref(5) = 2.1292113035138280d0
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
- xceref(1) = 1.0900140297820550d-04
- xceref(2) = 3.7343951769282091d-05
- xceref(3) = 5.0092785406541633d-05
- xceref(4) = 4.7671093939528255d-05
- xceref(5) = 1.3621613399213001d-04
-
-c---------------------------------------------------------------------
-c reference data for 102X102X102 grids after 400 time steps,
-c with DT = 1.0d-03
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 102) .and.
- > (grid_points(2) .eq. 102) .and.
- > (grid_points(3) .eq. 102) .and.
- > (no_time_steps . eq. 400) ) then
-
- class = 'B'
- dtref = 1.0d-3
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 0.6903293579998d+02
- xcrref(2) = 0.3095134488084d+02
- xcrref(3) = 0.4103336647017d+02
- xcrref(4) = 0.3864769009604d+02
- xcrref(5) = 0.5643482272596d+02
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
- xceref(1) = 0.9810006190188d-02
- xceref(2) = 0.1022827905670d-02
- xceref(3) = 0.1720597911692d-02
- xceref(4) = 0.1694479428231d-02
- xceref(5) = 0.1847456263981d-01
-
-c---------------------------------------------------------------------
-c reference data for 162X162X162 grids after 400 time steps,
-c with DT = 0.67d-03
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 162) .and.
- > (grid_points(2) .eq. 162) .and.
- > (grid_points(3) .eq. 162) .and.
- > (no_time_steps . eq. 400) ) then
-
- class = 'C'
- dtref = 0.67d-3
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 0.5881691581829d+03
- xcrref(2) = 0.2454417603569d+03
- xcrref(3) = 0.3293829191851d+03
- xcrref(4) = 0.3081924971891d+03
- xcrref(5) = 0.4597223799176d+03
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
- xceref(1) = 0.2598120500183d+00
- xceref(2) = 0.2590888922315d-01
- xceref(3) = 0.5132886416320d-01
- xceref(4) = 0.4806073419454d-01
- xceref(5) = 0.5483377491301d+00
-
-c---------------------------------------------------------------------
-c reference data for 408X408X408 grids after 500 time steps,
-c with DT = 0.3d-03
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 408) .and.
- > (grid_points(2) .eq. 408) .and.
- > (grid_points(3) .eq. 408) .and.
- > (no_time_steps . eq. 500) ) then
-
- class = 'D'
- dtref = 0.30d-3
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 0.1044696216887d+05
- xcrref(2) = 0.3204427762578d+04
- xcrref(3) = 0.4648680733032d+04
- xcrref(4) = 0.4238923283697d+04
- xcrref(5) = 0.7588412036136d+04
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
- xceref(1) = 0.5089471423669d+01
- xceref(2) = 0.5323514855894d+00
- xceref(3) = 0.1187051008971d+01
- xceref(4) = 0.1083734951938d+01
- xceref(5) = 0.1164108338568d+02
-
-c---------------------------------------------------------------------
-c reference data for 1020X1020X1020 grids after 500 time steps,
-c with DT = 0.1d-03
-c---------------------------------------------------------------------
- elseif ( (grid_points(1) .eq. 1020) .and.
- > (grid_points(2) .eq. 1020) .and.
- > (grid_points(3) .eq. 1020) .and.
- > (no_time_steps . eq. 500) ) then
-
- class = 'E'
- dtref = 0.10d-3
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of residual.
-c---------------------------------------------------------------------
- xcrref(1) = 0.6255387422609d+05
- xcrref(2) = 0.1495317020012d+05
- xcrref(3) = 0.2347595750586d+05
- xcrref(4) = 0.2091099783534d+05
- xcrref(5) = 0.4770412841218d+05
-
-c---------------------------------------------------------------------
-c Reference values of RMS-norms of solution error.
-c---------------------------------------------------------------------
- xceref(1) = 0.6742735164909d+02
- xceref(2) = 0.5390656036938d+01
- xceref(3) = 0.1680647196477d+02
- xceref(4) = 0.1536963126457d+02
- xceref(5) = 0.1575330146156d+03
-
- else
- verified = .false.
- endif
-
-c---------------------------------------------------------------------
-c verification test for residuals if gridsize is one of
-c the defined grid sizes above (class .ne. 'U')
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c Compute the difference of solution values and the known reference values.
-c---------------------------------------------------------------------
- do m = 1, 5
-
- xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m))
- xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
-
- enddo
-
-c---------------------------------------------------------------------
-c Output the comparison of computed results to known cases.
-c---------------------------------------------------------------------
-
- if (class .ne. 'U') then
- write(*, 1990) class
- 1990 format(' Verification being performed for class ', a)
- write (*,2000) epsilon
- 2000 format(' accuracy setting for epsilon = ', E20.13)
- verified = (dabs(dt-dtref) .le. epsilon)
- if (.not.verified) then
- class = 'U'
- write (*,1000) dtref
- 1000 format(' DT does not match the reference value of ',
- > E15.8)
- endif
- else
- write(*, 1995)
- 1995 format(' Unknown class')
- endif
-
-
- if (class .ne. 'U') then
- write (*,2001)
- else
- write (*, 2005)
- endif
-
- 2001 format(' Comparison of RMS-norms of residual')
- 2005 format(' RMS-norms of residual')
- do m = 1, 5
- if (class .eq. 'U') then
- write(*, 2015) m, xcr(m)
- else if (xcrdif(m) .le. epsilon) then
- write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
- else
- verified = .false.
- write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
- endif
- enddo
-
- if (class .ne. 'U') then
- write (*,2002)
- else
- write (*,2006)
- endif
- 2002 format(' Comparison of RMS-norms of solution error')
- 2006 format(' RMS-norms of solution error')
-
- do m = 1, 5
- if (class .eq. 'U') then
- write(*, 2015) m, xce(m)
- else if (xcedif(m) .le. epsilon) then
- write (*,2011) m,xce(m),xceref(m),xcedif(m)
- else
- verified = .false.
- write (*,2010) m,xce(m),xceref(m),xcedif(m)
- endif
- enddo
-
- 2010 format(' FAILURE: ', i2, E20.13, E20.13, E20.13)
- 2011 format(' ', i2, E20.13, E20.13, E20.13)
- 2015 format(' ', i2, E20.13)
-
- if (class .eq. 'U') then
- write(*, 2022)
- write(*, 2023)
- 2022 format(' No reference values provided')
- 2023 format(' No verification performed')
- else if (verified) then
- write(*, 2020)
- 2020 format(' Verification Successful')
- else
- write(*, 2021)
- 2021 format(' Verification failed')
- endif
-
- return
-
-
- end
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine x_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function performs the solution of the approximate factorization
-c step in the x-direction for all five matrix components
-c simultaneously. The Thomas algorithm is employed to solve the
-c systems for the x-lines. Boundary conditions are non-periodic
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
-
- integer i, j, k, jp, kp, n, iend, jsize, ksize, i1, i2,
- > buffer_size, c, m, p, istart, stage, error,
- > requests(2), statuses(MPI_STATUS_SIZE, 2)
- double precision r1, r2, d, e, s(5), sm1, sm2,
- > fac1, fac2
-
-
-
-c---------------------------------------------------------------------
-c OK, now we know that there are multiple processors
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
-c on this node in the direction of increasing i for the forward sweep,
-c and after that reversing the direction for the backsubstitution.
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c FORWARD ELIMINATION
-c---------------------------------------------------------------------
- do stage = 1, ncells
- c = slice(1,stage)
-
- istart = 0
- iend = cell_size(1,c)-1
-
- jsize = cell_size(2,c)
- ksize = cell_size(3,c)
- jp = cell_coord(2,c)-1
- kp = cell_coord(3,c)-1
-
- buffer_size = (jsize-start(2,c)-end(2,c)) *
- > (ksize-start(3,c)-end(3,c))
-
- if ( stage .ne. 1) then
-
-c---------------------------------------------------------------------
-c if this is not the first processor in this row of cells,
-c receive data from predecessor containing the right hand
-c sides and the upper diagonal elements of the previous two rows
-c---------------------------------------------------------------------
- call mpi_irecv(in_buffer, 22*buffer_size,
- > dp_type, predecessor(1),
- > DEFAULT_TAG, comm_solve,
- > requests(1), error)
-
-
-c---------------------------------------------------------------------
-c communication has already been started.
-c compute the left hand side while waiting for the msg
-c---------------------------------------------------------------------
- call lhsx(c)
-
-c---------------------------------------------------------------------
-c wait for pending communication to complete
-c This waits on the current receive and on the send
-c from the previous stage. They always come in pairs.
-c---------------------------------------------------------------------
-
- call mpi_waitall(2, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c unpack the buffer
-c---------------------------------------------------------------------
- i = istart
- i1 = istart + 1
- n = 0
-
-c---------------------------------------------------------------------
-c create a running pointer
-c---------------------------------------------------------------------
- p = 0
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
- > in_buffer(p+1) * lhs(i,j,k,n+1,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
- > in_buffer(p+2) * lhs(i,j,k,n+1,c)
- do m = 1, 3
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
- end do
- d = in_buffer(p+6)
- e = in_buffer(p+7)
- do m = 1, 3
- s(m) = in_buffer(p+7+m)
- end do
- r1 = lhs(i,j,k,n+2,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
- lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
- do m = 1, 3
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
- end do
- r2 = lhs(i1,j,k,n+1,c)
- lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
- lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
- do m = 1, 3
- rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2
- end do
- p = p + 10
- end do
- end do
-
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
- > in_buffer(p+1) * lhs(i,j,k,n+1,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
- > in_buffer(p+2) * lhs(i,j,k,n+1,c)
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > in_buffer(p+3) * lhs(i,j,k,n+1,c)
- d = in_buffer(p+4)
- e = in_buffer(p+5)
- s(m) = in_buffer(p+6)
- r1 = lhs(i,j,k,n+2,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
- lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
- r2 = lhs(i1,j,k,n+1,c)
- lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2
- lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2
- rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2
- p = p + 6
- end do
- end do
- end do
-
- else
-
-c---------------------------------------------------------------------
-c if this IS the first cell, we still compute the lhs
-c---------------------------------------------------------------------
- call lhsx(c)
- endif
-
-c---------------------------------------------------------------------
-c perform the Thomas algorithm; first, FORWARD ELIMINATION
-c---------------------------------------------------------------------
- n = 0
-
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- do i = istart, iend-2
- i1 = i + 1
- i2 = i + 2
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- end do
- lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
- > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
- > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
- > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
- end do
- lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
- > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
- lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
- > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
- > lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c The last two rows in this grid block are a bit different,
-c since they do not have two more rows available for the
-c elimination of off-diagonal entries
-c---------------------------------------------------------------------
-
- i = iend - 1
- i1 = iend
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- end do
- lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
- > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
- > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
- > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
- end do
-c---------------------------------------------------------------------
-c scale the last row immediately (some of this is
-c overkill in case this is the last cell)
-c---------------------------------------------------------------------
- fac2 = 1.d0/lhs(i1,j,k,n+3,c)
- lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
- lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)
- do m = 1, 3
- rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c)
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c do the u+c and the u-c factors
-c---------------------------------------------------------------------
-
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- do i = istart, iend-2
- i1 = i + 1
- i2 = i + 2
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
- > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
- > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
- rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
- > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
- lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) -
- > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c)
- lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) -
- > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c)
- rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) -
- > lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c)
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c And again the last two rows separately
-c---------------------------------------------------------------------
- i = iend - 1
- i1 = iend
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) -
- > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) -
- > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c)
- rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
- > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c)
-c---------------------------------------------------------------------
-c Scale the last row immediately (some of this is overkill
-c if this is the last cell)
-c---------------------------------------------------------------------
- fac2 = 1.d0/lhs(i1,j,k,n+3,c)
- lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c)
- lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c)
- rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c)
-
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c send information to the next processor, except when this
-c is the last grid block
-c---------------------------------------------------------------------
- if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c create a running pointer for the send buffer
-c---------------------------------------------------------------------
- p = 0
- n = 0
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- do i = iend-1, iend
- out_buffer(p+1) = lhs(i,j,k,n+4,c)
- out_buffer(p+2) = lhs(i,j,k,n+5,c)
- do m = 1, 3
- out_buffer(p+2+m) = rhs(i,j,k,m,c)
- end do
- p = p+5
- end do
- end do
- end do
-
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- do i = iend-1, iend
- out_buffer(p+1) = lhs(i,j,k,n+4,c)
- out_buffer(p+2) = lhs(i,j,k,n+5,c)
- out_buffer(p+3) = rhs(i,j,k,m,c)
- p = p + 3
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c send data to next phase
-c can't receive data yet because buffer size will be wrong
-c---------------------------------------------------------------------
- call mpi_isend(out_buffer, 22*buffer_size,
- > dp_type, successor(1),
- > DEFAULT_TAG, comm_solve,
- > requests(2), error)
-
- endif
- end do
-
-c---------------------------------------------------------------------
-c now go in the reverse direction
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c BACKSUBSTITUTION
-c---------------------------------------------------------------------
- do stage = ncells, 1, -1
- c = slice(1,stage)
-
- istart = 0
- iend = cell_size(1,c)-1
-
- jsize = cell_size(2,c)
- ksize = cell_size(3,c)
- jp = cell_coord(2,c)-1
- kp = cell_coord(3,c)-1
-
- buffer_size = (jsize-start(2,c)-end(2,c)) *
- > (ksize-start(3,c)-end(3,c))
-
- if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c if this is not the starting cell in this row of cells,
-c wait for a message to be received, containing the
-c solution of the previous two stations
-c---------------------------------------------------------------------
- call mpi_irecv(in_buffer, 10*buffer_size,
- > dp_type, successor(1),
- > DEFAULT_TAG, comm_solve,
- > requests(1), error)
-
-
-c---------------------------------------------------------------------
-c communication has already been started
-c while waiting, do the block-diagonal inversion for the
-c cell that was just finished
-c---------------------------------------------------------------------
-
- call ninvr(slice(1,stage+1))
-
-c---------------------------------------------------------------------
-c wait for pending communication to complete
-c---------------------------------------------------------------------
- call mpi_waitall(2, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c unpack the buffer for the first three factors
-c---------------------------------------------------------------------
- n = 0
- p = 0
- i = iend
- i1 = i - 1
- do m = 1, 3
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- sm1 = in_buffer(p+1)
- sm2 = in_buffer(p+2)
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*sm1 -
- > lhs(i,j,k,n+5,c)*sm2
- rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
- > lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) -
- > lhs(i1,j,k,n+5,c) * sm1
- p = p + 2
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c now unpack the buffer for the remaining two factors
-c---------------------------------------------------------------------
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- sm1 = in_buffer(p+1)
- sm2 = in_buffer(p+2)
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*sm1 -
- > lhs(i,j,k,n+5,c)*sm2
- rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) -
- > lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) -
- > lhs(i1,j,k,n+5,c) * sm1
- p = p + 2
- end do
- end do
- end do
-
- else
-
-c---------------------------------------------------------------------
-c now we know this is the first grid block on the back sweep,
-c so we don't need a message to start the substitution.
-c---------------------------------------------------------------------
- i = iend-1
- i1 = iend
- n = 0
- do m = 1, 3
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
- end do
- end do
- end do
-
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c)
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c Whether or not this is the last processor, we always have
-c to complete the back-substitution
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c The first three factors
-c---------------------------------------------------------------------
- n = 0
- do m = 1, 3
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- do i = iend-2, istart, -1
- i1 = i + 1
- i2 = i + 2
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
- > lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c And the remaining two
-c---------------------------------------------------------------------
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- do i = iend-2, istart, -1
- i1 = i + 1
- i2 = i + 2
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) -
- > lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c send on information to the previous processor, if needed
-c---------------------------------------------------------------------
- if (stage .ne. 1) then
- i = istart
- i1 = istart+1
- p = 0
- do m = 1, 5
- do k = start(3,c), ksize-end(3,c)-1
- do j = start(2,c), jsize-end(2,c)-1
- out_buffer(p+1) = rhs(i,j,k,m,c)
- out_buffer(p+2) = rhs(i1,j,k,m,c)
- p = p + 2
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c pack and send the buffer
-c---------------------------------------------------------------------
- call mpi_isend(out_buffer, 10*buffer_size,
- > dp_type, predecessor(1),
- > DEFAULT_TAG, comm_solve,
- > requests(2), error)
-
- endif
-
-c---------------------------------------------------------------------
-c If this was the last stage, do the block-diagonal inversion
-c---------------------------------------------------------------------
- if (stage .eq. 1) call ninvr(c)
-
- end do
-
- return
- end
-
-
-
-
-
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine y_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function performs the solution of the approximate factorization
-c step in the y-direction for all five matrix components
-c simultaneously. The Thomas algorithm is employed to solve the
-c systems for the y-lines. Boundary conditions are non-periodic
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i, j, k, stage, ip, kp, n, isize, jend, ksize, j1, j2,
- > buffer_size, c, m, p, jstart, error,
- > requests(2), statuses(MPI_STATUS_SIZE, 2)
- double precision r1, r2, d, e, s(5), sm1, sm2,
- > fac1, fac2
-
-
-c---------------------------------------------------------------------
-c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
-c on this node in the direction of increasing i for the forward sweep,
-c and after that reversing the direction for the backsubstitution
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c FORWARD ELIMINATION
-c---------------------------------------------------------------------
- do stage = 1, ncells
- c = slice(2,stage)
-
- jstart = 0
- jend = cell_size(2,c)-1
-
- isize = cell_size(1,c)
- ksize = cell_size(3,c)
- ip = cell_coord(1,c)-1
- kp = cell_coord(3,c)-1
-
- buffer_size = (isize-start(1,c)-end(1,c)) *
- > (ksize-start(3,c)-end(3,c))
-
- if ( stage .ne. 1) then
-
-c---------------------------------------------------------------------
-c if this is not the first processor in this row of cells,
-c receive data from predecessor containing the right hand
-c sides and the upper diagonal elements of the previous two rows
-c---------------------------------------------------------------------
-
- call mpi_irecv(in_buffer, 22*buffer_size,
- > dp_type, predecessor(2),
- > DEFAULT_TAG, comm_solve,
- > requests(1), error)
-
-c---------------------------------------------------------------------
-c communication has already been started.
-c compute the left hand side while waiting for the msg
-c---------------------------------------------------------------------
- call lhsy(c)
-
-c---------------------------------------------------------------------
-c wait for pending communication to complete
-c This waits on the current receive and on the send
-c from the previous stage. They always come in pairs.
-c---------------------------------------------------------------------
- call mpi_waitall(2, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c unpack the buffer
-c---------------------------------------------------------------------
- j = jstart
- j1 = jstart + 1
- n = 0
-c---------------------------------------------------------------------
-c create a running pointer
-c---------------------------------------------------------------------
- p = 0
- do k = start(3,c), ksize-end(3,c)-1
- do i = start(1,c), isize-end(1,c)-1
- lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
- > in_buffer(p+1) * lhs(i,j,k,n+1,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
- > in_buffer(p+2) * lhs(i,j,k,n+1,c)
- do m = 1, 3
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
- end do
- d = in_buffer(p+6)
- e = in_buffer(p+7)
- do m = 1, 3
- s(m) = in_buffer(p+7+m)
- end do
- r1 = lhs(i,j,k,n+2,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
- lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
- do m = 1, 3
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
- end do
- r2 = lhs(i,j1,k,n+1,c)
- lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
- lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
- do m = 1, 3
- rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2
- end do
- p = p + 10
- end do
- end do
-
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do i = start(1,c), isize-end(1,c)-1
- lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
- > in_buffer(p+1) * lhs(i,j,k,n+1,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
- > in_buffer(p+2) * lhs(i,j,k,n+1,c)
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > in_buffer(p+3) * lhs(i,j,k,n+1,c)
- d = in_buffer(p+4)
- e = in_buffer(p+5)
- s(m) = in_buffer(p+6)
- r1 = lhs(i,j,k,n+2,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
- lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
- r2 = lhs(i,j1,k,n+1,c)
- lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2
- lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2
- rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2
- p = p + 6
- end do
- end do
- end do
-
- else
-
-c---------------------------------------------------------------------
-c if this IS the first cell, we still compute the lhs
-c---------------------------------------------------------------------
- call lhsy(c)
- endif
-
-c---------------------------------------------------------------------
-c perform the Thomas algorithm; first, FORWARD ELIMINATION
-c---------------------------------------------------------------------
- n = 0
-
- do k = start(3,c), ksize-end(3,c)-1
- do j = jstart, jend-2
- do i = start(1,c), isize-end(1,c)-1
- j1 = j + 1
- j2 = j + 2
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- end do
- lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
- > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
- > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
- > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
- end do
- lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
- > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
- lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
- > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
- > lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c The last two rows in this grid block are a bit different,
-c since they do not have two more rows available for the
-c elimination of off-diagonal entries
-c---------------------------------------------------------------------
-
- j = jend - 1
- j1 = jend
- do k = start(3,c), ksize-end(3,c)-1
- do i = start(1,c), isize-end(1,c)-1
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- end do
- lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
- > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
- > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
- > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
- end do
-c---------------------------------------------------------------------
-c scale the last row immediately (some of this is
-c overkill in case this is the last cell)
-c---------------------------------------------------------------------
- fac2 = 1.d0/lhs(i,j1,k,n+3,c)
- lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
- lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)
- do m = 1, 3
- rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c)
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c do the u+c and the u-c factors
-c---------------------------------------------------------------------
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do j = jstart, jend-2
- do i = start(1,c), isize-end(1,c)-1
- j1 = j + 1
- j2 = j + 2
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
- > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
- > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
- rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
- > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
- lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) -
- > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c)
- lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) -
- > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c)
- rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) -
- > lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c)
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c And again the last two rows separately
-c---------------------------------------------------------------------
- j = jend - 1
- j1 = jend
- do k = start(3,c), ksize-end(3,c)-1
- do i = start(1,c), isize-end(1,c)-1
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) -
- > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) -
- > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c)
- rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
- > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c)
-c---------------------------------------------------------------------
-c Scale the last row immediately (some of this is overkill
-c if this is the last cell)
-c---------------------------------------------------------------------
- fac2 = 1.d0/lhs(i,j1,k,n+3,c)
- lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c)
- lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c)
- rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c)
-
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c send information to the next processor, except when this
-c is the last grid block;
-c---------------------------------------------------------------------
-
- if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c create a running pointer for the send buffer
-c---------------------------------------------------------------------
- p = 0
- n = 0
- do k = start(3,c), ksize-end(3,c)-1
- do i = start(1,c), isize-end(1,c)-1
- do j = jend-1, jend
- out_buffer(p+1) = lhs(i,j,k,n+4,c)
- out_buffer(p+2) = lhs(i,j,k,n+5,c)
- do m = 1, 3
- out_buffer(p+2+m) = rhs(i,j,k,m,c)
- end do
- p = p+5
- end do
- end do
- end do
-
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do i = start(1,c), isize-end(1,c)-1
- do j = jend-1, jend
- out_buffer(p+1) = lhs(i,j,k,n+4,c)
- out_buffer(p+2) = lhs(i,j,k,n+5,c)
- out_buffer(p+3) = rhs(i,j,k,m,c)
- p = p + 3
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c pack and send the buffer
-c---------------------------------------------------------------------
- call mpi_isend(out_buffer, 22*buffer_size,
- > dp_type, successor(2),
- > DEFAULT_TAG, comm_solve,
- > requests(2), error)
-
- endif
- end do
-
-c---------------------------------------------------------------------
-c now go in the reverse direction
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c BACKSUBSTITUTION
-c---------------------------------------------------------------------
- do stage = ncells, 1, -1
- c = slice(2,stage)
-
- jstart = 0
- jend = cell_size(2,c)-1
-
- isize = cell_size(1,c)
- ksize = cell_size(3,c)
- ip = cell_coord(1,c)-1
- kp = cell_coord(3,c)-1
-
- buffer_size = (isize-start(1,c)-end(1,c)) *
- > (ksize-start(3,c)-end(3,c))
-
- if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c if this is not the starting cell in this row of cells,
-c wait for a message to be received, containing the
-c solution of the previous two stations
-c---------------------------------------------------------------------
-
- call mpi_irecv(in_buffer, 10*buffer_size,
- > dp_type, successor(2),
- > DEFAULT_TAG, comm_solve,
- > requests(1), error)
-
-
-c---------------------------------------------------------------------
-c communication has already been started
-c while waiting, do the block-diagonal inversion for the
-c cell that was just finished
-c---------------------------------------------------------------------
-
- call pinvr(slice(2,stage+1))
-
-c---------------------------------------------------------------------
-c wait for pending communication to complete
-c---------------------------------------------------------------------
- call mpi_waitall(2, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c unpack the buffer for the first three factors
-c---------------------------------------------------------------------
- n = 0
- p = 0
- j = jend
- j1 = j - 1
- do m = 1, 3
- do k = start(3,c), ksize-end(3,c)-1
- do i = start(1,c), isize-end(1,c)-1
- sm1 = in_buffer(p+1)
- sm2 = in_buffer(p+2)
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*sm1 -
- > lhs(i,j,k,n+5,c)*sm2
- rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
- > lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) -
- > lhs(i,j1,k,n+5,c) * sm1
- p = p + 2
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c now unpack the buffer for the remaining two factors
-c---------------------------------------------------------------------
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do i = start(1,c), isize-end(1,c)-1
- sm1 = in_buffer(p+1)
- sm2 = in_buffer(p+2)
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*sm1 -
- > lhs(i,j,k,n+5,c)*sm2
- rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) -
- > lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) -
- > lhs(i,j1,k,n+5,c) * sm1
- p = p + 2
- end do
- end do
- end do
-
- else
-c---------------------------------------------------------------------
-c now we know this is the first grid block on the back sweep,
-c so we don't need a message to start the substitution.
-c---------------------------------------------------------------------
-
- j = jend - 1
- j1 = jend
- n = 0
- do m = 1, 3
- do k = start(3,c), ksize-end(3,c)-1
- do i = start(1,c), isize-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
- end do
- end do
- end do
-
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do i = start(1,c), isize-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c)
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c Whether or not this is the last processor, we always have
-c to complete the back-substitution
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c The first three factors
-c---------------------------------------------------------------------
- n = 0
- do m = 1, 3
- do k = start(3,c), ksize-end(3,c)-1
- do j = jend-2, jstart, -1
- do i = start(1,c), isize-end(1,c)-1
- j1 = j + 1
- j2 = j + 2
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
- > lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c And the remaining two
-c---------------------------------------------------------------------
- do m = 4, 5
- n = (m-3)*5
- do k = start(3,c), ksize-end(3,c)-1
- do j = jend-2, jstart, -1
- do i = start(1,c), isize-end(1,c)-1
- j1 = j + 1
- j2 = j1 + 1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) -
- > lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c send on information to the previous processor, if needed
-c---------------------------------------------------------------------
- if (stage .ne. 1) then
- j = jstart
- j1 = jstart + 1
- p = 0
- do m = 1, 5
- do k = start(3,c), ksize-end(3,c)-1
- do i = start(1,c), isize-end(1,c)-1
- out_buffer(p+1) = rhs(i,j,k,m,c)
- out_buffer(p+2) = rhs(i,j1,k,m,c)
- p = p + 2
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c pack and send the buffer
-c---------------------------------------------------------------------
-
- call mpi_isend(out_buffer, 10*buffer_size,
- > dp_type, predecessor(2),
- > DEFAULT_TAG, comm_solve,
- > requests(2), error)
-
- endif
-
-c---------------------------------------------------------------------
-c If this was the last stage, do the block-diagonal inversion
-c---------------------------------------------------------------------
- if (stage .eq. 1) call pinvr(c)
-
- end do
-
- return
- end
-
-
-
-
-
-
-
+++ /dev/null
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
- subroutine z_solve
-
-c---------------------------------------------------------------------
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c this function performs the solution of the approximate factorization
-c step in the z-direction for all five matrix components
-c simultaneously. The Thomas algorithm is employed to solve the
-c systems for the z-lines. Boundary conditions are non-periodic
-c---------------------------------------------------------------------
-
- include 'header.h'
- include 'mpinpb.h'
-
- integer i, j, k, stage, ip, jp, n, isize, jsize, kend, k1, k2,
- > buffer_size, c, m, p, kstart, error,
- > requests(2), statuses(MPI_STATUS_SIZE, 2)
- double precision r1, r2, d, e, s(5), sm1, sm2,
- > fac1, fac2
-
-c---------------------------------------------------------------------
-c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells
-c on this node in the direction of increasing i for the forward sweep,
-c and after that reversing the direction for the backsubstitution
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c FORWARD ELIMINATION
-c---------------------------------------------------------------------
- do stage = 1, ncells
- c = slice(3,stage)
-
- kstart = 0
- kend = cell_size(3,c)-1
-
- isize = cell_size(1,c)
- jsize = cell_size(2,c)
- ip = cell_coord(1,c)-1
- jp = cell_coord(2,c)-1
-
- buffer_size = (isize-start(1,c)-end(1,c)) *
- > (jsize-start(2,c)-end(2,c))
-
- if (stage .ne. 1) then
-
-
-c---------------------------------------------------------------------
-c if this is not the first processor in this row of cells,
-c receive data from predecessor containing the right hand
-c sides and the upper diagonal elements of the previous two rows
-c---------------------------------------------------------------------
-
- call mpi_irecv(in_buffer, 22*buffer_size,
- > dp_type, predecessor(3),
- > DEFAULT_TAG, comm_solve,
- > requests(1), error)
-
-
-c---------------------------------------------------------------------
-c communication has already been started.
-c compute the left hand side while waiting for the msg
-c---------------------------------------------------------------------
- call lhsz(c)
-
-c---------------------------------------------------------------------
-c wait for pending communication to complete
-c---------------------------------------------------------------------
- call mpi_waitall(2, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c unpack the buffer
-c---------------------------------------------------------------------
- k = kstart
- k1 = kstart + 1
- n = 0
-
-c---------------------------------------------------------------------
-c create a running pointer
-c---------------------------------------------------------------------
- p = 0
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
- > in_buffer(p+1) * lhs(i,j,k,n+1,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
- > in_buffer(p+2) * lhs(i,j,k,n+1,c)
- do m = 1, 3
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > in_buffer(p+2+m) * lhs(i,j,k,n+1,c)
- end do
- d = in_buffer(p+6)
- e = in_buffer(p+7)
- do m = 1, 3
- s(m) = in_buffer(p+7+m)
- end do
- r1 = lhs(i,j,k,n+2,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
- lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
- do m = 1, 3
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
- end do
- r2 = lhs(i,j,k1,n+1,c)
- lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
- lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
- do m = 1, 3
- rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2
- end do
- p = p + 10
- end do
- end do
-
- do m = 4, 5
- n = (m-3)*5
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) -
- > in_buffer(p+1) * lhs(i,j,k,n+1,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) -
- > in_buffer(p+2) * lhs(i,j,k,n+1,c)
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > in_buffer(p+3) * lhs(i,j,k,n+1,c)
- d = in_buffer(p+4)
- e = in_buffer(p+5)
- s(m) = in_buffer(p+6)
- r1 = lhs(i,j,k,n+2,c)
- lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1
- lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1
- r2 = lhs(i,j,k1,n+1,c)
- lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2
- lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2
- rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2
- p = p + 6
- end do
- end do
- end do
-
- else
-
-c---------------------------------------------------------------------
-c if this IS the first cell, we still compute the lhs
-c---------------------------------------------------------------------
- call lhsz(c)
- endif
-
-c---------------------------------------------------------------------
-c perform the Thomas algorithm; first, FORWARD ELIMINATION
-c---------------------------------------------------------------------
- n = 0
-
- do k = kstart, kend-2
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- k1 = k + 1
- k2 = k + 2
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- end do
- lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
- > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
- > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
- > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
- end do
- lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
- > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
- lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
- > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
- > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c The last two rows in this grid block are a bit different,
-c since they do not have two more rows available for the
-c elimination of off-diagonal entries
-c---------------------------------------------------------------------
- k = kend - 1
- k1 = kend
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- end do
- lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
- > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
- > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
- do m = 1, 3
- rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
- > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
- end do
-c---------------------------------------------------------------------
-c scale the last row immediately (some of this is
-c overkill in case this is the last cell)
-c---------------------------------------------------------------------
- fac2 = 1.d0/lhs(i,j,k1,n+3,c)
- lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
- lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)
- do m = 1, 3
- rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c)
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c do the u+c and the u-c factors
-c---------------------------------------------------------------------
- do m = 4, 5
- n = (m-3)*5
- do k = kstart, kend-2
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- k1 = k + 1
- k2 = k + 2
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
- > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
- > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
- rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
- > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
- lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) -
- > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c)
- lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) -
- > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c)
- rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) -
- > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c)
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c And again the last two rows separately
-c---------------------------------------------------------------------
- k = kend - 1
- k1 = kend
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- fac1 = 1.d0/lhs(i,j,k,n+3,c)
- lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c)
- lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c)
- rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c)
- lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) -
- > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c)
- lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) -
- > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c)
- rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
- > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c)
-c---------------------------------------------------------------------
-c Scale the last row immediately (some of this is overkill
-c if this is the last cell)
-c---------------------------------------------------------------------
- fac2 = 1.d0/lhs(i,j,k1,n+3,c)
- lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c)
- lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c)
- rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c)
-
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c send information to the next processor, except when this
-c is the last grid block,
-c---------------------------------------------------------------------
-
- if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c create a running pointer for the send buffer
-c---------------------------------------------------------------------
- p = 0
- n = 0
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- do k = kend-1, kend
- out_buffer(p+1) = lhs(i,j,k,n+4,c)
- out_buffer(p+2) = lhs(i,j,k,n+5,c)
- do m = 1, 3
- out_buffer(p+2+m) = rhs(i,j,k,m,c)
- end do
- p = p+5
- end do
- end do
- end do
-
- do m = 4, 5
- n = (m-3)*5
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- do k = kend-1, kend
- out_buffer(p+1) = lhs(i,j,k,n+4,c)
- out_buffer(p+2) = lhs(i,j,k,n+5,c)
- out_buffer(p+3) = rhs(i,j,k,m,c)
- p = p + 3
- end do
- end do
- end do
- end do
-
-
- call mpi_isend(out_buffer, 22*buffer_size,
- > dp_type, successor(3),
- > DEFAULT_TAG, comm_solve,
- > requests(2), error)
-
- endif
- end do
-
-c---------------------------------------------------------------------
-c now go in the reverse direction
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c BACKSUBSTITUTION
-c---------------------------------------------------------------------
- do stage = ncells, 1, -1
- c = slice(3,stage)
-
- kstart = 0
- kend = cell_size(3,c)-1
-
- isize = cell_size(1,c)
- jsize = cell_size(2,c)
- ip = cell_coord(1,c)-1
- jp = cell_coord(2,c)-1
-
- buffer_size = (isize-start(1,c)-end(1,c)) *
- > (jsize-start(2,c)-end(2,c))
-
- if (stage .ne. ncells) then
-
-c---------------------------------------------------------------------
-c if this is not the starting cell in this row of cells,
-c wait for a message to be received, containing the
-c solution of the previous two stations
-c---------------------------------------------------------------------
-
- call mpi_irecv(in_buffer, 10*buffer_size,
- > dp_type, successor(3),
- > DEFAULT_TAG, comm_solve,
- > requests(1), error)
-
-
-c---------------------------------------------------------------------
-c communication has already been started
-c while waiting, do the block-diagonal inversion for the
-c cell that was just finished
-c---------------------------------------------------------------------
-
- call tzetar(slice(3,stage+1))
-
-c---------------------------------------------------------------------
-c wait for pending communication to complete
-c---------------------------------------------------------------------
- call mpi_waitall(2, requests, statuses, error)
-
-c---------------------------------------------------------------------
-c unpack the buffer for the first three factors
-c---------------------------------------------------------------------
- n = 0
- p = 0
- k = kend
- k1 = k - 1
- do m = 1, 3
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- sm1 = in_buffer(p+1)
- sm2 = in_buffer(p+2)
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*sm1 -
- > lhs(i,j,k,n+5,c)*sm2
- rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
- > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) -
- > lhs(i,j,k1,n+5,c) * sm1
- p = p + 2
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c now unpack the buffer for the remaining two factors
-c---------------------------------------------------------------------
- do m = 4, 5
- n = (m-3)*5
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- sm1 = in_buffer(p+1)
- sm2 = in_buffer(p+2)
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*sm1 -
- > lhs(i,j,k,n+5,c)*sm2
- rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) -
- > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) -
- > lhs(i,j,k1,n+5,c) * sm1
- p = p + 2
- end do
- end do
- end do
-
- else
-
-c---------------------------------------------------------------------
-c now we know this is the first grid block on the back sweep,
-c so we don't need a message to start the substitution.
-c---------------------------------------------------------------------
-
- k = kend - 1
- k1 = kend
- n = 0
- do m = 1, 3
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
- end do
- end do
- end do
-
- do m = 4, 5
- n = (m-3)*5
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c)
- end do
- end do
- end do
- endif
-
-c---------------------------------------------------------------------
-c Whether or not this is the last processor, we always have
-c to complete the back-substitution
-c---------------------------------------------------------------------
-
-c---------------------------------------------------------------------
-c The first three factors
-c---------------------------------------------------------------------
- n = 0
- do m = 1, 3
- do k = kend-2, kstart, -1
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- k1 = k + 1
- k2 = k + 2
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
- > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c And the remaining two
-c---------------------------------------------------------------------
- do m = 4, 5
- n = (m-3)*5
- do k = kend-2, kstart, -1
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- k1 = k + 1
- k2 = k + 2
- rhs(i,j,k,m,c) = rhs(i,j,k,m,c) -
- > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) -
- > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c)
- end do
- end do
- end do
- end do
-
-c---------------------------------------------------------------------
-c send on information to the previous processor, if needed
-c---------------------------------------------------------------------
- if (stage .ne. 1) then
- k = kstart
- k1 = kstart + 1
- p = 0
- do m = 1, 5
- do j = start(2,c), jsize-end(2,c)-1
- do i = start(1,c), isize-end(1,c)-1
- out_buffer(p+1) = rhs(i,j,k,m,c)
- out_buffer(p+2) = rhs(i,j,k1,m,c)
- p = p + 2
- end do
- end do
- end do
-
- call mpi_isend(out_buffer, 10*buffer_size,
- > dp_type, predecessor(3),
- > DEFAULT_TAG, comm_solve,
- > requests(2), error)
-
- endif
-
-c---------------------------------------------------------------------
-c If this was the last stage, do the block-diagonal inversion
-c---------------------------------------------------------------------
- if (stage .eq. 1) call tzetar(c)
-
- end do
-
- return
- end
-
-
-
-
-
-
-