From: Martin Quinson Date: Thu, 13 Nov 2014 20:59:09 +0000 (+0100) Subject: Remove the unmodified NAS examples as they are really useless nowadays X-Git-Tag: v3_12~732^2~211 X-Git-Url: http://info.iut-bm.univ-fcomte.fr/pub/gitweb/simgrid.git/commitdiff_plain/d8983d99631ddba747941cadb391ce80243a5529 Remove the unmodified NAS examples as they are really useless nowadays I'm still unsure of what to do with the modified ones. I vote for removing them if we have enough examples already. --- diff --git a/examples/smpi/NAS/BT/Makefile b/examples/smpi/NAS/BT/Makefile deleted file mode 100644 index dd27503221..0000000000 --- a/examples/smpi/NAS/BT/Makefile +++ /dev/null @@ -1,106 +0,0 @@ -SHELL=/bin/sh -BENCHMARK=bt -BENCHMARKU=BT -VEC= - -include ../config/make.def - - -OBJS = bt.o make_set.o initialize.o exact_solution.o exact_rhs.o \ - set_constants.o adi.o define.o copy_faces.o rhs.o solve_subs.o \ - x_solve$(VEC).o y_solve$(VEC).o z_solve$(VEC).o add.o error.o \ - verify.o setup_mpi.o \ - ${COMMON}/print_results.o ${COMMON}/timers.o - -include ../sys/make.common - -# npbparams.h is included by header.h -# The following rule should do the trick but many make programs (not gmake) -# will do the wrong thing and rebuild the world every time (because the -# mod time on header.h is not changed. One solution would be to -# touch header.h but this might cause confusion if someone has -# accidentally deleted it. Instead, make the dependency on npbparams.h -# explicit in all the lines below (even though dependence is indirect). - -# header.h: npbparams.h - -${PROGRAM}: config - @if [ x$(VERSION) = xvec ] ; then \ - ${MAKE} VEC=_vec exec; \ - elif [ x$(VERSION) = xVEC ] ; then \ - ${MAKE} VEC=_vec exec; \ - else \ - ${MAKE} exec; \ - fi - -exec: $(OBJS) - @if [ x$(SUBTYPE) = xfull ] ; then \ - ${MAKE} bt-full; \ - elif [ x$(SUBTYPE) = xFULL ] ; then \ - ${MAKE} bt-full; \ - elif [ x$(SUBTYPE) = xsimple ] ; then \ - ${MAKE} bt-simple; \ - elif [ x$(SUBTYPE) = xSIMPLE ] ; then \ - ${MAKE} bt-simple; \ - elif [ x$(SUBTYPE) = xfortran ] ; then \ - ${MAKE} bt-fortran; \ - elif [ x$(SUBTYPE) = xFORTRAN ] ; then \ - ${MAKE} bt-fortran; \ - elif [ x$(SUBTYPE) = xepio ] ; then \ - ${MAKE} bt-epio; \ - elif [ x$(SUBTYPE) = xEPIO ] ; then \ - ${MAKE} bt-epio; \ - else \ - ${MAKE} bt-bt; \ - fi - -bt-bt: ${OBJS} btio.o - ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} btio.o ${FMPI_LIB} - -bt-full: ${OBJS} full_mpiio.o btio_common.o - ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_full ${OBJS} btio_common.o full_mpiio.o ${FMPI_LIB} - -bt-simple: ${OBJS} simple_mpiio.o btio_common.o - ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_simple ${OBJS} btio_common.o simple_mpiio.o ${FMPI_LIB} - -bt-fortran: ${OBJS} fortran_io.o btio_common.o - ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.fortran_io ${OBJS} btio_common.o fortran_io.o ${FMPI_LIB} - -bt-epio: ${OBJS} epio.o btio_common.o - ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.ep_io ${OBJS} btio_common.o epio.o ${FMPI_LIB} - -.f.o: - ${FCOMPILE} $< - -.c.o: - ${CCOMPILE} $< - - -bt.o: bt.f header.h npbparams.h mpinpb.h -make_set.o: make_set.f header.h npbparams.h mpinpb.h -initialize.o: initialize.f header.h npbparams.h -exact_solution.o: exact_solution.f header.h npbparams.h -exact_rhs.o: exact_rhs.f header.h npbparams.h -set_constants.o: set_constants.f header.h npbparams.h -adi.o: adi.f header.h npbparams.h -define.o: define.f header.h npbparams.h -copy_faces.o: copy_faces.f header.h npbparams.h mpinpb.h -rhs.o: rhs.f header.h npbparams.h -x_solve$(VEC).o: x_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h -y_solve$(VEC).o: y_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h -z_solve$(VEC).o: z_solve$(VEC).f header.h work_lhs$(VEC).h npbparams.h mpinpb.h -solve_subs.o: solve_subs.f npbparams.h -add.o: add.f header.h npbparams.h -error.o: error.f header.h npbparams.h mpinpb.h -verify.o: verify.f header.h npbparams.h mpinpb.h -setup_mpi.o: setup_mpi.f mpinpb.h npbparams.h -btio.o: btio.f header.h npbparams.h -btio_common.o: btio_common.f mpinpb.h npbparams.h -fortran_io.o: fortran_io.f mpinpb.h npbparams.h -simple_mpiio.o: simple_mpiio.f mpinpb.h npbparams.h -full_mpiio.o: full_mpiio.f mpinpb.h npbparams.h -epio.o: epio.f mpinpb.h npbparams.h - -clean: - - rm -f *.o *~ mputil* - - rm -f npbparams.h core diff --git a/examples/smpi/NAS/BT/add.f b/examples/smpi/NAS/BT/add.f deleted file mode 100644 index e14cde46ef..0000000000 --- a/examples/smpi/NAS/BT/add.f +++ /dev/null @@ -1,30 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine add - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c addition of update to the vector u -c--------------------------------------------------------------------- - - include 'header.h' - - integer c, i, j, k, m - - do c = 1, ncells - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - u(m,i,j,k,c) = u(m,i,j,k,c) + rhs(m,i,j,k,c) - enddo - enddo - enddo - enddo - enddo - - return - end diff --git a/examples/smpi/NAS/BT/adi.f b/examples/smpi/NAS/BT/adi.f deleted file mode 100644 index 58450c028e..0000000000 --- a/examples/smpi/NAS/BT/adi.f +++ /dev/null @@ -1,21 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine adi - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - call copy_faces - - call x_solve - - call y_solve - - call z_solve - - call add - - return - end - diff --git a/examples/smpi/NAS/BT/bt.f b/examples/smpi/NAS/BT/bt.f deleted file mode 100644 index 36e50781b3..0000000000 --- a/examples/smpi/NAS/BT/bt.f +++ /dev/null @@ -1,275 +0,0 @@ -!-------------------------------------------------------------------------! -! ! -! N A S P A R A L L E L B E N C H M A R K S 3.3 ! -! ! -! B T ! -! ! -!-------------------------------------------------------------------------! -! ! -! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! -! It is described in NAS Technical Reports 95-020 and 02-007. ! -! ! -! Permission to use, copy, distribute and modify this software ! -! for any purpose with or without fee is hereby granted. We ! -! request, however, that all derived work reference the NAS ! -! Parallel Benchmarks 3.3. This software is provided "as is" ! -! without express or implied warranty. ! -! ! -! Information on NPB 3.3, including the technical report, the ! -! original specifications, source code, results and information ! -! on how to submit new results, is available at: ! -! ! -! http://www.nas.nasa.gov/Software/NPB/ ! -! ! -! Send comments or suggestions to npb@nas.nasa.gov ! -! ! -! NAS Parallel Benchmarks Group ! -! NASA Ames Research Center ! -! Mail Stop: T27A-1 ! -! Moffett Field, CA 94035-1000 ! -! ! -! E-mail: npb@nas.nasa.gov ! -! Fax: (650) 604-3957 ! -! ! -!-------------------------------------------------------------------------! - -c--------------------------------------------------------------------- -c -c Authors: R. F. Van der Wijngaart -c T. Harris -c M. Yarrow -c -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- - program MPBT -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i, niter, step, c, error, fstatus - double precision navg, mflops, mbytes, n3 - - external timer_read - double precision t, tmax, tiominv, tpc, timer_read - logical verified - character class, cbuff*40 - - integer wr_interval - - call setup_mpi - if (.not. active) goto 999 - -c--------------------------------------------------------------------- -c Root node reads input file (if it exists) else takes -c defaults from parameters -c--------------------------------------------------------------------- - if (node .eq. root) then - - write(*, 1000) - open (unit=2,file='inputbt.data',status='old', iostat=fstatus) -c - rd_interval = 0 - if (fstatus .eq. 0) then - write(*,233) - 233 format(' Reading from input file inputbt.data') - read (2,*) niter - read (2,*) dt - read (2,*) grid_points(1), grid_points(2), grid_points(3) - if (iotype .ne. 0) then - read (2,'(A)') cbuff - read (cbuff,*,iostat=i) wr_interval, rd_interval - if (i .ne. 0) rd_interval = 0 - if (wr_interval .le. 0) wr_interval = wr_default - endif - if (iotype .eq. 1) then - read (2,*) collbuf_nodes, collbuf_size - write(*,*) 'collbuf_nodes ', collbuf_nodes - write(*,*) 'collbuf_size ', collbuf_size - endif - close(2) - else - write(*,234) - niter = niter_default - dt = dt_default - grid_points(1) = problem_size - grid_points(2) = problem_size - grid_points(3) = problem_size - wr_interval = wr_default - if (iotype .eq. 1) then -c set number of nodes involved in collective buffering to 4, -c unless total number of nodes is smaller than that. -c set buffer size for collective buffering to 1MB per node -c collbuf_nodes = min(4,no_nodes) -c set default to No-File-Hints with a value of 0 - collbuf_nodes = 0 - collbuf_size = 1000000 - endif - endif - 234 format(' No input file inputbt.data. Using compiled defaults') - - write(*, 1001) grid_points(1), grid_points(2), grid_points(3) - write(*, 1002) niter, dt - if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes - if (no_nodes .ne. maxcells*maxcells) - > write(*, 1005) maxcells*maxcells - write(*, 1003) no_nodes - - if (iotype .eq. 1) write(*, 1006) 'FULL MPI-IO', wr_interval - if (iotype .eq. 2) write(*, 1006) 'SIMPLE MPI-IO', wr_interval - if (iotype .eq. 3) write(*, 1006) 'EPIO', wr_interval - if (iotype .eq. 4) write(*, 1006) 'FORTRAN IO', wr_interval - - 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- BT Benchmark ',/) - 1001 format(' Size: ', i4, 'x', i4, 'x', i4) - 1002 format(' Iterations: ', i4, ' dt: ', F11.7) - 1004 format(' Total number of processes: ', i5) - 1005 format(' WARNING: compiled for ', i5, ' processes ') - 1003 format(' Number of active processes: ', i5, /) - 1006 format(' BTIO -- ', A, ' write interval: ', i3 /) - - endif - - call mpi_bcast(niter, 1, MPI_INTEGER, - > root, comm_setup, error) - - call mpi_bcast(dt, 1, dp_type, - > root, comm_setup, error) - - call mpi_bcast(grid_points(1), 3, MPI_INTEGER, - > root, comm_setup, error) - - call mpi_bcast(wr_interval, 1, MPI_INTEGER, - > root, comm_setup, error) - - call mpi_bcast(rd_interval, 1, MPI_INTEGER, - > root, comm_setup, error) - - call make_set - - do c = 1, maxcells - if ( (cell_size(1,c) .gt. IMAX) .or. - > (cell_size(2,c) .gt. JMAX) .or. - > (cell_size(3,c) .gt. KMAX) ) then - print *,node, c, (cell_size(i,c),i=1,3) - print *,' Problem size too big for compiled array sizes' - goto 999 - endif - end do - - call set_constants - - call initialize - - call setup_btio - idump = 0 - - call lhsinit - - call exact_rhs - - call compute_buffer_size(5) - -c--------------------------------------------------------------------- -c do one time step to touch all code, and reinitialize -c--------------------------------------------------------------------- - call adi - call initialize - - call timer_clear(2) - -c--------------------------------------------------------------------- -c Synchronize before placing time stamp -c--------------------------------------------------------------------- - call mpi_barrier(comm_setup, error) - - call timer_clear(1) - call timer_start(1) - - do step = 1, niter - - if (node .eq. root) then - if (mod(step, 20) .eq. 0 .or. step .eq. niter .or. - > step .eq. 1) then - write(*, 200) step - 200 format(' Time step ', i4) - endif - endif - - call adi - - if (iotype .ne. 0) then - call timer_start(2) - if (mod(step, wr_interval).eq.0 .or. step .eq. niter) then - if (node .eq. root) then - print *, 'Writing data set, time step', step - endif - if (step .eq. niter .and. rd_interval .gt. 1) then - rd_interval = 1 - endif - call output_timestep - idump = idump + 1 - endif - call timer_stop(2) - endif - end do - - call btio_cleanup - - call timer_stop(1) - t = timer_read(1) - - call verify(niter, class, verified) - - call mpi_reduce(t, tmax, 1, - > dp_type, MPI_MAX, - > root, comm_setup, error) - - if (iotype .ne. 0) then - t = timer_read(2) - if (t .ne. 0.d0) t = 1.0d0 / t - call mpi_reduce(t, tiominv, 1, - > dp_type, MPI_SUM, - > root, comm_setup, error) - endif - - if( node .eq. root ) then - n3 = 1.0d0*grid_points(1)*grid_points(2)*grid_points(3) - navg = (grid_points(1)+grid_points(2)+grid_points(3))/3.0 - if( tmax .ne. 0. ) then - mflops = 1.0e-6*float(niter)* - > (3478.8*n3-17655.7*navg**2+28023.7*navg) - > / tmax - else - mflops = 0.0 - endif - - if (iotype .ne. 0) then - mbytes = n3 * 40.0 * idump * 1.0d-6 - tiominv = tiominv / no_nodes - t = 0.0 - if (tiominv .ne. 0.) t = 1.d0 / tiominv - tpc = 0.0 - if (tmax .ne. 0.) tpc = t * 100.0 / tmax - write(*,1100) t, tpc, mbytes, mbytes*tiominv - 1100 format(/' BTIO -- statistics:'/ - > ' I/O timing in seconds : ', f14.2/ - > ' I/O timing percentage : ', f14.2/ - > ' Total data written (MB) : ', f14.2/ - > ' I/O data rate (MB/sec) : ', f14.2) - endif - - call print_results('BT', class, grid_points(1), - > grid_points(2), grid_points(3), niter, maxcells*maxcells, - > total_nodes, tmax, mflops, ' floating point', - > verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5, - > cs6, '(none)') - endif - - 999 continue - call mpi_barrier(MPI_COMM_WORLD, error) - call mpi_finalize(error) - - end - diff --git a/examples/smpi/NAS/BT/btio.f b/examples/smpi/NAS/BT/btio.f deleted file mode 100644 index 1fb730b1f6..0000000000 --- a/examples/smpi/NAS/BT/btio.f +++ /dev/null @@ -1,72 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine setup_btio - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine output_timestep - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine btio_cleanup - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine btio_verify(verified) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - logical verified - - verified = .true. - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine accumulate_norms(xce_acc) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - double precision xce_acc(5) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine checksum_timestep - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - return - end diff --git a/examples/smpi/NAS/BT/btio_common.f b/examples/smpi/NAS/BT/btio_common.f deleted file mode 100644 index 9227a12b70..0000000000 --- a/examples/smpi/NAS/BT/btio_common.f +++ /dev/null @@ -1,30 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine clear_timestep - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer cio, kio, jio, ix - - do cio=1,ncells - do kio=0, cell_size(3,cio)-1 - do jio=0, cell_size(2,cio)-1 - do ix=0,cell_size(1,cio)-1 - u(1,ix, jio,kio,cio) = 0 - u(2,ix, jio,kio,cio) = 0 - u(3,ix, jio,kio,cio) = 0 - u(4,ix, jio,kio,cio) = 0 - u(5,ix, jio,kio,cio) = 0 - enddo - enddo - enddo - enddo - - return - end - diff --git a/examples/smpi/NAS/BT/copy_faces.f b/examples/smpi/NAS/BT/copy_faces.f deleted file mode 100644 index 14b82caf83..0000000000 --- a/examples/smpi/NAS/BT/copy_faces.f +++ /dev/null @@ -1,316 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine copy_faces - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c This function copies the face values of a variable defined on a set -c of cells to the overlap locations of the adjacent sets of cells. -c Because a set of cells interfaces in each direction with exactly one -c other set, we only need to fill six different buffers. We could try to -c overlap communication with computation, by computing -c some internal values while communicating boundary values, but this -c adds so much overhead that it's not clearly useful. -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i, j, k, c, m, requests(0:11), p0, p1, - > p2, p3, p4, p5, b_size(0:5), ss(0:5), - > sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11) - -c--------------------------------------------------------------------- -c exit immediately if there are no faces to be copied -c--------------------------------------------------------------------- - if (no_nodes .eq. 1) then - call compute_rhs - return - endif - - ss(0) = start_send_east - ss(1) = start_send_west - ss(2) = start_send_north - ss(3) = start_send_south - ss(4) = start_send_top - ss(5) = start_send_bottom - - sr(0) = start_recv_east - sr(1) = start_recv_west - sr(2) = start_recv_north - sr(3) = start_recv_south - sr(4) = start_recv_top - sr(5) = start_recv_bottom - - b_size(0) = east_size - b_size(1) = west_size - b_size(2) = north_size - b_size(3) = south_size - b_size(4) = top_size - b_size(5) = bottom_size - -c--------------------------------------------------------------------- -c because the difference stencil for the diagonalized scheme is -c orthogonal, we do not have to perform the staged copying of faces, -c but can send all face information simultaneously to the neighboring -c cells in all directions -c--------------------------------------------------------------------- - p0 = 0 - p1 = 0 - p2 = 0 - p3 = 0 - p4 = 0 - p5 = 0 - - do c = 1, ncells - -c--------------------------------------------------------------------- -c fill the buffer to be sent to eastern neighbors (i-dir) -c--------------------------------------------------------------------- - if (cell_coord(1,c) .ne. ncells) then - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = cell_size(1,c)-2, cell_size(1,c)-1 - do m = 1, 5 - out_buffer(ss(0)+p0) = u(m,i,j,k,c) - p0 = p0 + 1 - end do - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c fill the buffer to be sent to western neighbors -c--------------------------------------------------------------------- - if (cell_coord(1,c) .ne. 1) then - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = 0, 1 - do m = 1, 5 - out_buffer(ss(1)+p1) = u(m,i,j,k,c) - p1 = p1 + 1 - end do - end do - end do - end do - - endif - -c--------------------------------------------------------------------- -c fill the buffer to be sent to northern neighbors (j_dir) -c--------------------------------------------------------------------- - if (cell_coord(2,c) .ne. ncells) then - do k = 0, cell_size(3,c)-1 - do j = cell_size(2,c)-2, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - do m = 1, 5 - out_buffer(ss(2)+p2) = u(m,i,j,k,c) - p2 = p2 + 1 - end do - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c fill the buffer to be sent to southern neighbors -c--------------------------------------------------------------------- - if (cell_coord(2,c).ne. 1) then - do k = 0, cell_size(3,c)-1 - do j = 0, 1 - do i = 0, cell_size(1,c)-1 - do m = 1, 5 - out_buffer(ss(3)+p3) = u(m,i,j,k,c) - p3 = p3 + 1 - end do - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c fill the buffer to be sent to top neighbors (k-dir) -c--------------------------------------------------------------------- - if (cell_coord(3,c) .ne. ncells) then - do k = cell_size(3,c)-2, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - do m = 1, 5 - out_buffer(ss(4)+p4) = u(m,i,j,k,c) - p4 = p4 + 1 - end do - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c fill the buffer to be sent to bottom neighbors -c--------------------------------------------------------------------- - if (cell_coord(3,c).ne. 1) then - do k=0, 1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - do m = 1, 5 - out_buffer(ss(5)+p5) = u(m,i,j,k,c) - p5 = p5 + 1 - end do - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c cell loop -c--------------------------------------------------------------------- - end do - - call mpi_irecv(in_buffer(sr(0)), b_size(0), - > dp_type, successor(1), WEST, - > comm_rhs, requests(0), error) - call mpi_irecv(in_buffer(sr(1)), b_size(1), - > dp_type, predecessor(1), EAST, - > comm_rhs, requests(1), error) - call mpi_irecv(in_buffer(sr(2)), b_size(2), - > dp_type, successor(2), SOUTH, - > comm_rhs, requests(2), error) - call mpi_irecv(in_buffer(sr(3)), b_size(3), - > dp_type, predecessor(2), NORTH, - > comm_rhs, requests(3), error) - call mpi_irecv(in_buffer(sr(4)), b_size(4), - > dp_type, successor(3), BOTTOM, - > comm_rhs, requests(4), error) - call mpi_irecv(in_buffer(sr(5)), b_size(5), - > dp_type, predecessor(3), TOP, - > comm_rhs, requests(5), error) - - call mpi_isend(out_buffer(ss(0)), b_size(0), - > dp_type, successor(1), EAST, - > comm_rhs, requests(6), error) - call mpi_isend(out_buffer(ss(1)), b_size(1), - > dp_type, predecessor(1), WEST, - > comm_rhs, requests(7), error) - call mpi_isend(out_buffer(ss(2)), b_size(2), - > dp_type,successor(2), NORTH, - > comm_rhs, requests(8), error) - call mpi_isend(out_buffer(ss(3)), b_size(3), - > dp_type,predecessor(2), SOUTH, - > comm_rhs, requests(9), error) - call mpi_isend(out_buffer(ss(4)), b_size(4), - > dp_type,successor(3), TOP, - > comm_rhs, requests(10), error) - call mpi_isend(out_buffer(ss(5)), b_size(5), - > dp_type,predecessor(3), BOTTOM, - > comm_rhs,requests(11), error) - - - call mpi_waitall(12, requests, statuses, error) - -c--------------------------------------------------------------------- -c unpack the data that has just been received; -c--------------------------------------------------------------------- - p0 = 0 - p1 = 0 - p2 = 0 - p3 = 0 - p4 = 0 - p5 = 0 - - do c = 1, ncells - - if (cell_coord(1,c) .ne. 1) then - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = -2, -1 - do m = 1, 5 - u(m,i,j,k,c) = in_buffer(sr(1)+p0) - p0 = p0 + 1 - end do - end do - end do - end do - endif - - if (cell_coord(1,c) .ne. ncells) then - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = cell_size(1,c), cell_size(1,c)+1 - do m = 1, 5 - u(m,i,j,k,c) = in_buffer(sr(0)+p1) - p1 = p1 + 1 - end do - end do - end do - end do - end if - - if (cell_coord(2,c) .ne. 1) then - do k = 0, cell_size(3,c)-1 - do j = -2, -1 - do i = 0, cell_size(1,c)-1 - do m = 1, 5 - u(m,i,j,k,c) = in_buffer(sr(3)+p2) - p2 = p2 + 1 - end do - end do - end do - end do - - endif - - if (cell_coord(2,c) .ne. ncells) then - do k = 0, cell_size(3,c)-1 - do j = cell_size(2,c), cell_size(2,c)+1 - do i = 0, cell_size(1,c)-1 - do m = 1, 5 - u(m,i,j,k,c) = in_buffer(sr(2)+p3) - p3 = p3 + 1 - end do - end do - end do - end do - endif - - if (cell_coord(3,c) .ne. 1) then - do k = -2, -1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - do m = 1, 5 - u(m,i,j,k,c) = in_buffer(sr(5)+p4) - p4 = p4 + 1 - end do - end do - end do - end do - endif - - if (cell_coord(3,c) .ne. ncells) then - do k = cell_size(3,c), cell_size(3,c)+1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - do m = 1, 5 - u(m,i,j,k,c) = in_buffer(sr(4)+p5) - p5 = p5 + 1 - end do - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c cells loop -c--------------------------------------------------------------------- - end do - -c--------------------------------------------------------------------- -c do the rest of the rhs that uses the copied face values -c--------------------------------------------------------------------- - call compute_rhs - - return - end diff --git a/examples/smpi/NAS/BT/define.f b/examples/smpi/NAS/BT/define.f deleted file mode 100644 index 03c4c6edd7..0000000000 --- a/examples/smpi/NAS/BT/define.f +++ /dev/null @@ -1,64 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine compute_buffer_size(dim) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - - integer c, dim, face_size - - if (ncells .eq. 1) return - -c--------------------------------------------------------------------- -c compute the actual sizes of the buffers; note that there is -c always one cell face that doesn't need buffer space, because it -c is at the boundary of the grid -c--------------------------------------------------------------------- - west_size = 0 - east_size = 0 - - do c = 1, ncells - face_size = cell_size(2,c) * cell_size(3,c) * dim * 2 - if (cell_coord(1,c).ne.1) west_size = west_size + face_size - if (cell_coord(1,c).ne.ncells) east_size = east_size + - > face_size - end do - - north_size = 0 - south_size = 0 - do c = 1, ncells - face_size = cell_size(1,c)*cell_size(3,c) * dim * 2 - if (cell_coord(2,c).ne.1) south_size = south_size + face_size - if (cell_coord(2,c).ne.ncells) north_size = north_size + - > face_size - end do - - top_size = 0 - bottom_size = 0 - do c = 1, ncells - face_size = cell_size(1,c) * cell_size(2,c) * dim * 2 - if (cell_coord(3,c).ne.1) bottom_size = bottom_size + - > face_size - if (cell_coord(3,c).ne.ncells) top_size = top_size + - > face_size - end do - - start_send_west = 1 - start_send_east = start_send_west + west_size - start_send_south = start_send_east + east_size - start_send_north = start_send_south + south_size - start_send_bottom = start_send_north + north_size - start_send_top = start_send_bottom + bottom_size - start_recv_west = 1 - start_recv_east = start_recv_west + west_size - start_recv_south = start_recv_east + east_size - start_recv_north = start_recv_south + south_size - start_recv_bottom = start_recv_north + north_size - start_recv_top = start_recv_bottom + bottom_size - - return - end - diff --git a/examples/smpi/NAS/BT/epio.f b/examples/smpi/NAS/BT/epio.f deleted file mode 100644 index 52b630999d..0000000000 --- a/examples/smpi/NAS/BT/epio.f +++ /dev/null @@ -1,165 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine setup_btio - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - character*(128) newfilenm - integer m - - if (node .lt. 10000) then - write (newfilenm, 996) filenm,node - else - print *, 'error generating file names (> 10000 nodes)' - stop - endif - -996 format (a,'.',i4.4) - - open (unit=99, file=newfilenm, form='unformatted', - $ status='unknown') - - do m = 1, 5 - xce_sub(m) = 0.d0 - end do - - idump_sub = 0 - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine output_timestep - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - include 'header.h' - include 'mpinpb.h' - - integer ix, iio, jio, kio, cio, aio - - do cio=1,ncells - write(99) - $ ((((u(aio,ix, jio,kio,cio),aio=1,5), - $ ix=0, cell_size(1,cio)-1), - $ jio=0, cell_size(2,cio)-1), - $ kio=0, cell_size(3,cio)-1) - enddo - - idump_sub = idump_sub + 1 - if (rd_interval .gt. 0) then - if (idump_sub .ge. rd_interval) then - - rewind(99) - call acc_sub_norms(idump+1) - - rewind(99) - idump_sub = 0 - endif - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine acc_sub_norms(idump_cur) - - include 'header.h' - include 'mpinpb.h' - - integer idump_cur - - integer ix, jio, kio, cio, ii, m, ichunk - double precision xce_single(5) - - ichunk = idump_cur - idump_sub + 1 - do ii=0, idump_sub-1 - do cio=1,ncells - read(99) - $ ((((u(m,ix, jio,kio,cio),m=1,5), - $ ix=0, cell_size(1,cio)-1), - $ jio=0, cell_size(2,cio)-1), - $ kio=0, cell_size(3,cio)-1) - enddo - - if (node .eq. root) print *, 'Reading data set ', ii+ichunk - - call error_norm(xce_single) - do m = 1, 5 - xce_sub(m) = xce_sub(m) + xce_single(m) - end do - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine btio_cleanup - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - close(unit=99) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine accumulate_norms(xce_acc) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - double precision xce_acc(5) - - character*(128) newfilenm - integer m - - if (rd_interval .gt. 0) goto 20 - - if (node .lt. 10000) then - write (newfilenm, 996) filenm,node - else - print *, 'error generating file names (> 10000 nodes)' - stop - endif - -996 format (a,'.',i4.4) - - open (unit=99, file=newfilenm, - $ form='unformatted') - -c clear the last time step - - call clear_timestep - -c read back the time steps and accumulate norms - - call acc_sub_norms(idump) - - close(unit=99) - - 20 continue - do m = 1, 5 - xce_acc(m) = xce_sub(m) / dble(idump) - end do - - return - end diff --git a/examples/smpi/NAS/BT/error.f b/examples/smpi/NAS/BT/error.f deleted file mode 100644 index 147a582b58..0000000000 --- a/examples/smpi/NAS/BT/error.f +++ /dev/null @@ -1,106 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine error_norm(rms) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c this function computes the norm of the difference between the -c computed solution and the exact solution -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer c, i, j, k, m, ii, jj, kk, d, error - double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5), - > add - - do m = 1, 5 - rms_work(m) = 0.0d0 - enddo - - do c = 1, ncells - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - ii = 0 - do i = cell_low(1,c), cell_high(1,c) - xi = dble(i) * dnxm1 - call exact_solution(xi, eta, zeta, u_exact) - - do m = 1, 5 - add = u(m,ii,jj,kk,c)-u_exact(m) - rms_work(m) = rms_work(m) + add*add - enddo - ii = ii + 1 - enddo - jj = jj + 1 - enddo - kk = kk + 1 - enddo - enddo - - call mpi_allreduce(rms_work, rms, 5, dp_type, - > MPI_SUM, comm_setup, error) - - do m = 1, 5 - do d = 1, 3 - rms(m) = rms(m) / dble(grid_points(d)-2) - enddo - rms(m) = dsqrt(rms(m)) - enddo - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine rhs_norm(rms) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer c, i, j, k, d, m, error - double precision rms(5), rms_work(5), add - - do m = 1, 5 - rms_work(m) = 0.0d0 - enddo - - do c = 1, ncells - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - add = rhs(m,i,j,k,c) - rms_work(m) = rms_work(m) + add*add - enddo - enddo - enddo - enddo - enddo - - call mpi_allreduce(rms_work, rms, 5, dp_type, - > MPI_SUM, comm_setup, error) - - do m = 1, 5 - do d = 1, 3 - rms(m) = rms(m) / dble(grid_points(d)-2) - enddo - rms(m) = dsqrt(rms(m)) - enddo - - return - end - diff --git a/examples/smpi/NAS/BT/exact_rhs.f b/examples/smpi/NAS/BT/exact_rhs.f deleted file mode 100644 index 26a2871d20..0000000000 --- a/examples/smpi/NAS/BT/exact_rhs.f +++ /dev/null @@ -1,360 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine exact_rhs - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c compute the right hand side based on exact solution -c--------------------------------------------------------------------- - - include 'header.h' - - double precision dtemp(5), xi, eta, zeta, dtpp - integer c, m, i, j, k, ip1, im1, jp1, - > jm1, km1, kp1 - - -c--------------------------------------------------------------------- -c loop over all cells owned by this node -c--------------------------------------------------------------------- - do c = 1, ncells - -c--------------------------------------------------------------------- -c initialize -c--------------------------------------------------------------------- - do k= 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - do m = 1, 5 - forcing(m,i,j,k,c) = 0.0d0 - enddo - enddo - enddo - enddo - -c--------------------------------------------------------------------- -c xi-direction flux differences -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - zeta = dble(k+cell_low(3,c)) * dnzm1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - eta = dble(j+cell_low(2,c)) * dnym1 - - do i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c) - xi = dble(i+cell_low(1,c)) * dnxm1 - - call exact_solution(xi, eta, zeta, dtemp) - do m = 1, 5 - ue(i,m) = dtemp(m) - enddo - - dtpp = 1.0d0 / dtemp(1) - - do m = 2, 5 - buf(i,m) = dtpp * dtemp(m) - enddo - - cuf(i) = buf(i,2) * buf(i,2) - buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) + - > buf(i,4) * buf(i,4) - q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) + - > buf(i,4)*ue(i,4)) - - enddo - - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - im1 = i-1 - ip1 = i+1 - - forcing(1,i,j,k,c) = forcing(1,i,j,k,c) - - > tx2*( ue(ip1,2)-ue(im1,2) )+ - > dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1)) - - forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tx2 * ( - > (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))- - > (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+ - > xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+ - > dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2)) - - forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tx2 * ( - > ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+ - > xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+ - > dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3)) - - forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tx2*( - > ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+ - > xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+ - > dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4)) - - forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tx2*( - > buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))- - > buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+ - > 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+ - > buf(im1,1))+ - > xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+ - > xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+ - > dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5)) - enddo - -c--------------------------------------------------------------------- -c Fourth-order dissipation -c--------------------------------------------------------------------- - if (start(1,c) .gt. 0) then - do m = 1, 5 - i = 1 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m)) - i = 2 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) - - > 4.0d0*ue(i+1,m) + ue(i+2,m)) - enddo - endif - - do i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1 - do m = 1, 5 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp* - > (ue(i-2,m) - 4.0d0*ue(i-1,m) + - > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m)) - enddo - enddo - - if (end(1,c) .gt. 0) then - do m = 1, 5 - i = cell_size(1,c)-3 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (ue(i-2,m) - 4.0d0*ue(i-1,m) + - > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m)) - i = cell_size(1,c)-2 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m)) - enddo - endif - - enddo - enddo - -c--------------------------------------------------------------------- -c eta-direction flux differences -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - zeta = dble(k+cell_low(3,c)) * dnzm1 - do i=start(1,c), cell_size(1,c)-end(1,c)-1 - xi = dble(i+cell_low(1,c)) * dnxm1 - - do j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c) - eta = dble(j+cell_low(2,c)) * dnym1 - - call exact_solution(xi, eta, zeta, dtemp) - do m = 1, 5 - ue(j,m) = dtemp(m) - enddo - - dtpp = 1.0d0/dtemp(1) - - do m = 2, 5 - buf(j,m) = dtpp * dtemp(m) - enddo - - cuf(j) = buf(j,3) * buf(j,3) - buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) + - > buf(j,4) * buf(j,4) - q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) + - > buf(j,4)*ue(j,4)) - enddo - - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - jm1 = j-1 - jp1 = j+1 - - forcing(1,i,j,k,c) = forcing(1,i,j,k,c) - - > ty2*( ue(jp1,3)-ue(jm1,3) )+ - > dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1)) - - forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - ty2*( - > ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+ - > yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+ - > dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2)) - - forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - ty2*( - > (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))- - > (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+ - > yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+ - > dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3)) - - forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - ty2*( - > ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+ - > yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+ - > dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4)) - - forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - ty2*( - > buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))- - > buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+ - > 0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+ - > buf(jm1,1))+ - > yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+ - > yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+ - > dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5)) - enddo - -c--------------------------------------------------------------------- -c Fourth-order dissipation -c--------------------------------------------------------------------- - if (start(2,c) .gt. 0) then - do m = 1, 5 - j = 1 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m)) - j = 2 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) - - > 4.0d0*ue(j+1,m) + ue(j+2,m)) - enddo - endif - - do j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1 - do m = 1, 5 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp* - > (ue(j-2,m) - 4.0d0*ue(j-1,m) + - > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m)) - enddo - enddo - - if (end(2,c) .gt. 0) then - do m = 1, 5 - j = cell_size(2,c)-3 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (ue(j-2,m) - 4.0d0*ue(j-1,m) + - > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m)) - j = cell_size(2,c)-2 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m)) - - enddo - endif - - enddo - enddo - -c--------------------------------------------------------------------- -c zeta-direction flux differences -c--------------------------------------------------------------------- - do j=start(2,c), cell_size(2,c)-end(2,c)-1 - eta = dble(j+cell_low(2,c)) * dnym1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - xi = dble(i+cell_low(1,c)) * dnxm1 - - do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c) - zeta = dble(k+cell_low(3,c)) * dnzm1 - - call exact_solution(xi, eta, zeta, dtemp) - do m = 1, 5 - ue(k,m) = dtemp(m) - enddo - - dtpp = 1.0d0/dtemp(1) - - do m = 2, 5 - buf(k,m) = dtpp * dtemp(m) - enddo - - cuf(k) = buf(k,4) * buf(k,4) - buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) + - > buf(k,3) * buf(k,3) - q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) + - > buf(k,4)*ue(k,4)) - enddo - - do k=start(3,c), cell_size(3,c)-end(3,c)-1 - km1 = k-1 - kp1 = k+1 - - forcing(1,i,j,k,c) = forcing(1,i,j,k,c) - - > tz2*( ue(kp1,4)-ue(km1,4) )+ - > dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1)) - - forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tz2 * ( - > ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+ - > zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+ - > dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2)) - - forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tz2 * ( - > ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+ - > zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+ - > dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3)) - - forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tz2 * ( - > (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))- - > (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+ - > zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+ - > dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4)) - - forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tz2 * ( - > buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))- - > buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+ - > 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1) - > +buf(km1,1))+ - > zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+ - > zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+ - > dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5)) - enddo - -c--------------------------------------------------------------------- -c Fourth-order dissipation -c--------------------------------------------------------------------- - if (start(3,c) .gt. 0) then - do m = 1, 5 - k = 1 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m)) - k = 2 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) - - > 4.0d0*ue(k+1,m) + ue(k+2,m)) - enddo - endif - - do k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1 - do m = 1, 5 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp* - > (ue(k-2,m) - 4.0d0*ue(k-1,m) + - > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m)) - enddo - enddo - - if (end(3,c) .gt. 0) then - do m = 1, 5 - k = cell_size(3,c)-3 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (ue(k-2,m) - 4.0d0*ue(k-1,m) + - > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m)) - k = cell_size(3,c)-2 - forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp * - > (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m)) - enddo - endif - - enddo - enddo - -c--------------------------------------------------------------------- -c now change the sign of the forcing function, -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - forcing(m,i,j,k,c) = -1.d0 * forcing(m,i,j,k,c) - enddo - enddo - enddo - enddo - - enddo - - return - end diff --git a/examples/smpi/NAS/BT/exact_solution.f b/examples/smpi/NAS/BT/exact_solution.f deleted file mode 100644 index b093b46d16..0000000000 --- a/examples/smpi/NAS/BT/exact_solution.f +++ /dev/null @@ -1,29 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine exact_solution(xi,eta,zeta,dtemp) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c this function returns the exact solution at point xi, eta, zeta -c--------------------------------------------------------------------- - - include 'header.h' - - double precision xi, eta, zeta, dtemp(5) - integer m - - do m = 1, 5 - dtemp(m) = ce(m,1) + - > xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) + - > eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+ - > zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) + - > zeta*ce(m,13)))) - enddo - - return - end - - diff --git a/examples/smpi/NAS/BT/fortran_io.f b/examples/smpi/NAS/BT/fortran_io.f deleted file mode 100644 index d3085a030a..0000000000 --- a/examples/smpi/NAS/BT/fortran_io.f +++ /dev/null @@ -1,174 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine setup_btio - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - character*(128) newfilenm - integer m, ierr - - if (node.eq.root) record_length = 40/fortran_rec_sz - call mpi_bcast(record_length, 1, MPI_INTEGER, - > root, comm_setup, ierr) - - open (unit=99, file=filenm, - $ form='unformatted', access='direct', - $ recl=record_length) - - do m = 1, 5 - xce_sub(m) = 0.d0 - end do - - idump_sub = 0 - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine output_timestep - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - include 'header.h' - include 'mpinpb.h' - - integer ix, jio, kio, cio - - do cio=1,ncells - do kio=0, cell_size(3,cio)-1 - do jio=0, cell_size(2,cio)-1 - iseek=(cell_low(1,cio) + - $ PROBLEM_SIZE*((cell_low(2,cio)+jio) + - $ PROBLEM_SIZE*((cell_low(3,cio)+kio) + - $ PROBLEM_SIZE*idump_sub))) - - do ix=0,cell_size(1,cio)-1 - write(99, rec=iseek+ix+1) - $ u(1,ix, jio,kio,cio), - $ u(2,ix, jio,kio,cio), - $ u(3,ix, jio,kio,cio), - $ u(4,ix, jio,kio,cio), - $ u(5,ix, jio,kio,cio) - enddo - enddo - enddo - enddo - - idump_sub = idump_sub + 1 - if (rd_interval .gt. 0) then - if (idump_sub .ge. rd_interval) then - - call acc_sub_norms(idump+1) - - idump_sub = 0 - endif - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine acc_sub_norms(idump_cur) - - include 'header.h' - include 'mpinpb.h' - - integer idump_cur - - integer ix, jio, kio, cio, ii, m, ichunk - double precision xce_single(5) - - ichunk = idump_cur - idump_sub + 1 - do ii=0, idump_sub-1 - do cio=1,ncells - do kio=0, cell_size(3,cio)-1 - do jio=0, cell_size(2,cio)-1 - iseek=(cell_low(1,cio) + - $ PROBLEM_SIZE*((cell_low(2,cio)+jio) + - $ PROBLEM_SIZE*((cell_low(3,cio)+kio) + - $ PROBLEM_SIZE*ii))) - - - do ix=0,cell_size(1,cio)-1 - read(99, rec=iseek+ix+1) - $ u(1,ix, jio,kio,cio), - $ u(2,ix, jio,kio,cio), - $ u(3,ix, jio,kio,cio), - $ u(4,ix, jio,kio,cio), - $ u(5,ix, jio,kio,cio) - enddo - enddo - enddo - enddo - - if (node .eq. root) print *, 'Reading data set ', ii+ichunk - - call error_norm(xce_single) - do m = 1, 5 - xce_sub(m) = xce_sub(m) + xce_single(m) - end do - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine btio_cleanup - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - close(unit=99) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine accumulate_norms(xce_acc) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - include 'header.h' - include 'mpinpb.h' - - double precision xce_acc(5) - integer m - - if (rd_interval .gt. 0) goto 20 - - open (unit=99, file=filenm, - $ form='unformatted', access='direct', - $ recl=record_length) - -c clear the last time step - - call clear_timestep - -c read back the time steps and accumulate norms - - call acc_sub_norms(idump) - - close(unit=99) - - 20 continue - do m = 1, 5 - xce_acc(m) = xce_sub(m) / dble(idump) - end do - - return - end diff --git a/examples/smpi/NAS/BT/full_mpiio.f b/examples/smpi/NAS/BT/full_mpiio.f deleted file mode 100644 index ecfd41ca73..0000000000 --- a/examples/smpi/NAS/BT/full_mpiio.f +++ /dev/null @@ -1,307 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine setup_btio - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer ierr - integer mstatus(MPI_STATUS_SIZE) - integer sizes(4), starts(4), subsizes(4) - integer cell_btype(maxcells), cell_ftype(maxcells) - integer cell_blength(maxcells) - integer info - character*20 cb_nodes, cb_size - integer c, m - integer cell_disp(maxcells) - - call mpi_bcast(collbuf_nodes, 1, MPI_INTEGER, - > root, comm_setup, ierr) - - call mpi_bcast(collbuf_size, 1, MPI_INTEGER, - > root, comm_setup, ierr) - - if (collbuf_nodes .eq. 0) then - info = MPI_INFO_NULL - else - write (cb_nodes,*) collbuf_nodes - write (cb_size,*) collbuf_size - call MPI_Info_create(info, ierr) - call MPI_Info_set(info, 'cb_nodes', cb_nodes, ierr) - call MPI_Info_set(info, 'cb_buffer_size', cb_size, ierr) - call MPI_Info_set(info, 'collective_buffering', 'true', ierr) - endif - - call MPI_Type_contiguous(5, MPI_DOUBLE_PRECISION, - $ element, ierr) - call MPI_Type_commit(element, ierr) - call MPI_Type_extent(element, eltext, ierr) - - do c = 1, ncells -c -c Outer array dimensions ar same for every cell -c - sizes(1) = IMAX+4 - sizes(2) = JMAX+4 - sizes(3) = KMAX+4 -c -c 4th dimension is cell number, total of maxcells cells -c - sizes(4) = maxcells -c -c Internal dimensions of cells can differ slightly between cells -c - subsizes(1) = cell_size(1, c) - subsizes(2) = cell_size(2, c) - subsizes(3) = cell_size(3, c) -c -c Cell is 4th dimension, 1 cell per cell type to handle varying -c cell sub-array sizes -c - subsizes(4) = 1 - -c -c type constructors use 0-based start addresses -c - starts(1) = 2 - starts(2) = 2 - starts(3) = 2 - starts(4) = c-1 - -c -c Create buftype for a cell -c - call MPI_Type_create_subarray(4, sizes, subsizes, - $ starts, MPI_ORDER_FORTRAN, element, - $ cell_btype(c), ierr) -c -c block length and displacement for joining cells - -c 1 cell buftype per block, cell buftypes have own displacment -c generated from cell number (4th array dimension) -c - cell_blength(c) = 1 - cell_disp(c) = 0 - - enddo -c -c Create combined buftype for all cells -c - call MPI_Type_struct(ncells, cell_blength, cell_disp, - $ cell_btype, combined_btype, ierr) - call MPI_Type_commit(combined_btype, ierr) - - do c = 1, ncells -c -c Entire array size -c - sizes(1) = PROBLEM_SIZE - sizes(2) = PROBLEM_SIZE - sizes(3) = PROBLEM_SIZE - -c -c Size of c'th cell -c - subsizes(1) = cell_size(1, c) - subsizes(2) = cell_size(2, c) - subsizes(3) = cell_size(3, c) - -c -c Starting point in full array of c'th cell -c - starts(1) = cell_low(1,c) - starts(2) = cell_low(2,c) - starts(3) = cell_low(3,c) - - call MPI_Type_create_subarray(3, sizes, subsizes, - $ starts, MPI_ORDER_FORTRAN, - $ element, cell_ftype(c), ierr) - cell_blength(c) = 1 - cell_disp(c) = 0 - enddo - - call MPI_Type_struct(ncells, cell_blength, cell_disp, - $ cell_ftype, combined_ftype, ierr) - call MPI_Type_commit(combined_ftype, ierr) - - iseek=0 - if (node .eq. root) then - call MPI_File_delete(filenm, MPI_INFO_NULL, ierr) - endif - - - call MPI_Barrier(comm_solve, ierr) - - call MPI_File_open(comm_solve, - $ filenm, - $ MPI_MODE_RDWR+MPI_MODE_CREATE, - $ MPI_INFO_NULL, fp, ierr) - - if (ierr .ne. MPI_SUCCESS) then - print *, 'Error opening file' - stop - endif - - call MPI_File_set_view(fp, iseek, element, - $ combined_ftype, 'native', info, ierr) - - if (ierr .ne. MPI_SUCCESS) then - print *, 'Error setting file view' - stop - endif - - do m = 1, 5 - xce_sub(m) = 0.d0 - end do - - idump_sub = 0 - - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine output_timestep - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - include 'header.h' - include 'mpinpb.h' - - integer mstatus(MPI_STATUS_SIZE) - integer ierr - - call MPI_File_write_at_all(fp, iseek, u, - $ 1, combined_btype, mstatus, ierr) - if (ierr .ne. MPI_SUCCESS) then - print *, 'Error writing to file' - stop - endif - - call MPI_Type_size(combined_btype, iosize, ierr) - iseek = iseek + iosize/eltext - - idump_sub = idump_sub + 1 - if (rd_interval .gt. 0) then - if (idump_sub .ge. rd_interval) then - - iseek = 0 - call acc_sub_norms(idump+1) - - iseek = 0 - idump_sub = 0 - endif - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine acc_sub_norms(idump_cur) - - include 'header.h' - include 'mpinpb.h' - - integer idump_cur - - integer ii, m, ichunk - integer ierr - integer mstatus(MPI_STATUS_SIZE) - double precision xce_single(5) - - ichunk = idump_cur - idump_sub + 1 - do ii=0, idump_sub-1 - - call MPI_File_read_at_all(fp, iseek, u, - $ 1, combined_btype, mstatus, ierr) - if (ierr .ne. MPI_SUCCESS) then - print *, 'Error reading back file' - call MPI_File_close(fp, ierr) - stop - endif - - call MPI_Type_size(combined_btype, iosize, ierr) - iseek = iseek + iosize/eltext - - if (node .eq. root) print *, 'Reading data set ', ii+ichunk - - call error_norm(xce_single) - do m = 1, 5 - xce_sub(m) = xce_sub(m) + xce_single(m) - end do - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine btio_cleanup - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - include 'header.h' - include 'mpinpb.h' - - integer ierr - - call MPI_File_close(fp, ierr) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - - subroutine accumulate_norms(xce_acc) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - double precision xce_acc(5) - integer m, ierr - - if (rd_interval .gt. 0) goto 20 - - call MPI_File_open(comm_solve, - $ filenm, - $ MPI_MODE_RDONLY, - $ MPI_INFO_NULL, - $ fp, - $ ierr) - - iseek = 0 - call MPI_File_set_view(fp, iseek, element, combined_ftype, - $ 'native', MPI_INFO_NULL, ierr) - -c clear the last time step - - call clear_timestep - -c read back the time steps and accumulate norms - - call acc_sub_norms(idump) - - call MPI_File_close(fp, ierr) - - 20 continue - do m = 1, 5 - xce_acc(m) = xce_sub(m) / dble(idump) - end do - - return - end - diff --git a/examples/smpi/NAS/BT/header.h b/examples/smpi/NAS/BT/header.h deleted file mode 100644 index 47719da674..0000000000 --- a/examples/smpi/NAS/BT/header.h +++ /dev/null @@ -1,137 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- -c -c header.h -c -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - -c--------------------------------------------------------------------- -c The following include file is generated automatically by the -c "setparams" utility. It defines -c maxcells: the square root of the maximum number of processors -c problem_size: 12, 64, 102, 162 (for class T, A, B, C) -c dt_default: default time step for this problem size if no -c config file -c niter_default: default number of iterations for this problem size -c--------------------------------------------------------------------- - - include 'npbparams.h' - - integer aa, bb, cc, BLOCK_SIZE - parameter (aa=1, bb=2, cc=3, BLOCK_SIZE=5) - - integer ncells, grid_points(3) - double precision elapsed_time - common /global/ elapsed_time, ncells, grid_points - - double precision tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, - > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, - > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, - > ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2, - > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1, - > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4, - > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1, - > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, - > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, - > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, - > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt, - > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, - > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, - > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16 - - common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, - > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, - > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, - > ce, dxmax, dymax, dzmax, xxcon1, xxcon2, - > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1, - > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4, - > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1, - > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, - > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, - > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, - > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt, - > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, - > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, - > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16 - - integer EAST, WEST, NORTH, SOUTH, - > BOTTOM, TOP - - parameter (EAST=2000, WEST=3000, NORTH=4000, SOUTH=5000, - > BOTTOM=6000, TOP=7000) - - integer cell_coord (3,maxcells), cell_low (3,maxcells), - > cell_high (3,maxcells), cell_size(3,maxcells), - > predecessor(3), slice (3,maxcells), - > grid_size (3), successor(3) , - > start (3,maxcells), end (3,maxcells) - common /partition/ cell_coord, cell_low, cell_high, cell_size, - > grid_size, successor, predecessor, slice, - > start, end - - integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE - - parameter (MAX_CELL_DIM = (problem_size/maxcells)+1) - - parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM) - - parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60+1) - - double precision - > us ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > vs ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > ws ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > qs ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > rho_i ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > square ( -1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > forcing (5, 0:IMAX-1, 0:JMAX-1, 0:KMAX-1, maxcells), - > u (5, -2:IMAX+1,-2:JMAX+1,-2:KMAX+1, maxcells), - > rhs (5, -1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells), - > lhsc (5,5,-1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells), - > backsub_info (5, 0:MAX_CELL_DIM, 0:MAX_CELL_DIM, maxcells), - > in_buffer(BUF_SIZE), out_buffer(BUF_SIZE) - common /fields/ u, us, vs, ws, qs, rho_i, square, - > rhs, forcing, lhsc, in_buffer, out_buffer, - > backsub_info - - double precision cv(-2:MAX_CELL_DIM+1), rhon(-2:MAX_CELL_DIM+1), - > rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1), - > cuf(-2:MAX_CELL_DIM+1), q(-2:MAX_CELL_DIM+1), - > ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5) - common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf - - integer west_size, east_size, bottom_size, top_size, - > north_size, south_size, start_send_west, - > start_send_east, start_send_south, start_send_north, - > start_send_bottom, start_send_top, start_recv_west, - > start_recv_east, start_recv_south, start_recv_north, - > start_recv_bottom, start_recv_top - common /box/ west_size, east_size, bottom_size, - > top_size, north_size, south_size, - > start_send_west, start_send_east, start_send_south, - > start_send_north, start_send_bottom, start_send_top, - > start_recv_west, start_recv_east, start_recv_south, - > start_recv_north, start_recv_bottom, start_recv_top - - double precision tmp_block(5,5), b_inverse(5,5), tmp_vec(5) - common /work_solve/ tmp_block, b_inverse, tmp_vec - -c -c These are used by btio -c - integer collbuf_nodes, collbuf_size, iosize, eltext, - $ combined_btype, fp, idump, record_length, element, - $ combined_ftype, idump_sub, rd_interval - common /btio/ collbuf_nodes, collbuf_size, iosize, eltext, - $ combined_btype, fp, idump, record_length, - $ idump_sub, rd_interval - double precision sum(niter_default), xce_sub(5) - common /btio/ sum, xce_sub - integer*8 iseek - common /btio/ iseek, element, combined_ftype - - - diff --git a/examples/smpi/NAS/BT/initialize.f b/examples/smpi/NAS/BT/initialize.f deleted file mode 100644 index 274cdb1899..0000000000 --- a/examples/smpi/NAS/BT/initialize.f +++ /dev/null @@ -1,308 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine initialize - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c This subroutine initializes the field variable u using -c tri-linear transfinite interpolation of the boundary values -c--------------------------------------------------------------------- - - include 'header.h' - - integer c, i, j, k, m, ii, jj, kk, ix, iy, iz - double precision xi, eta, zeta, Pface(5,3,2), Pxi, Peta, - > Pzeta, temp(5) - -c--------------------------------------------------------------------- -c Later (in compute_rhs) we compute 1/u for every element. A few of -c the corner elements are not used, but it convenient (and faster) -c to compute the whole thing with a simple loop. Make sure those -c values are nonzero by initializing the whole thing here. -c--------------------------------------------------------------------- - do c = 1, ncells - do kk = -1, KMAX - do jj = -1, JMAX - do ii = -1, IMAX - do m = 1, 5 - u(m, ii, jj, kk, c) = 1.0 - end do - end do - end do - end do - end do -c--------------------------------------------------------------------- - - - -c--------------------------------------------------------------------- -c first store the "interpolated" values everywhere on the grid -c--------------------------------------------------------------------- - do c=1, ncells - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - ii = 0 - do i = cell_low(1,c), cell_high(1,c) - xi = dble(i) * dnxm1 - - do ix = 1, 2 - call exact_solution(dble(ix-1), eta, zeta, - > Pface(1,1,ix)) - enddo - - do iy = 1, 2 - call exact_solution(xi, dble(iy-1) , zeta, - > Pface(1,2,iy)) - enddo - - do iz = 1, 2 - call exact_solution(xi, eta, dble(iz-1), - > Pface(1,3,iz)) - enddo - - do m = 1, 5 - Pxi = xi * Pface(m,1,2) + - > (1.0d0-xi) * Pface(m,1,1) - Peta = eta * Pface(m,2,2) + - > (1.0d0-eta) * Pface(m,2,1) - Pzeta = zeta * Pface(m,3,2) + - > (1.0d0-zeta) * Pface(m,3,1) - - u(m,ii,jj,kk,c) = Pxi + Peta + Pzeta - - > Pxi*Peta - Pxi*Pzeta - Peta*Pzeta + - > Pxi*Peta*Pzeta - - enddo - ii = ii + 1 - enddo - jj = jj + 1 - enddo - kk = kk+1 - enddo - enddo - -c--------------------------------------------------------------------- -c now store the exact values on the boundaries -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c west face -c--------------------------------------------------------------------- - c = slice(1,1) - ii = 0 - xi = 0.0d0 - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(m,ii,jj,kk,c) = temp(m) - enddo - jj = jj + 1 - enddo - kk = kk + 1 - enddo - -c--------------------------------------------------------------------- -c east face -c--------------------------------------------------------------------- - c = slice(1,ncells) - ii = cell_size(1,c)-1 - xi = 1.0d0 - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(m,ii,jj,kk,c) = temp(m) - enddo - jj = jj + 1 - enddo - kk = kk + 1 - enddo - -c--------------------------------------------------------------------- -c south face -c--------------------------------------------------------------------- - c = slice(2,1) - jj = 0 - eta = 0.0d0 - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - ii = 0 - do i = cell_low(1,c), cell_high(1,c) - xi = dble(i) * dnxm1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(m,ii,jj,kk,c) = temp(m) - enddo - ii = ii + 1 - enddo - kk = kk + 1 - enddo - - -c--------------------------------------------------------------------- -c north face -c--------------------------------------------------------------------- - c = slice(2,ncells) - jj = cell_size(2,c)-1 - eta = 1.0d0 - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - ii = 0 - do i = cell_low(1,c), cell_high(1,c) - xi = dble(i) * dnxm1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(m,ii,jj,kk,c) = temp(m) - enddo - ii = ii + 1 - enddo - kk = kk + 1 - enddo - -c--------------------------------------------------------------------- -c bottom face -c--------------------------------------------------------------------- - c = slice(3,1) - kk = 0 - zeta = 0.0d0 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - ii = 0 - do i =cell_low(1,c), cell_high(1,c) - xi = dble(i) *dnxm1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(m,ii,jj,kk,c) = temp(m) - enddo - ii = ii + 1 - enddo - jj = jj + 1 - enddo - -c--------------------------------------------------------------------- -c top face -c--------------------------------------------------------------------- - c = slice(3,ncells) - kk = cell_size(3,c)-1 - zeta = 1.0d0 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - ii = 0 - do i =cell_low(1,c), cell_high(1,c) - xi = dble(i) * dnxm1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(m,ii,jj,kk,c) = temp(m) - enddo - ii = ii + 1 - enddo - jj = jj + 1 - enddo - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine lhsinit - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - - integer i, j, k, d, c, m, n - -c--------------------------------------------------------------------- -c loop over all cells -c--------------------------------------------------------------------- - do c = 1, ncells - -c--------------------------------------------------------------------- -c first, initialize the start and end arrays -c--------------------------------------------------------------------- - do d = 1, 3 - if (cell_coord(d,c) .eq. 1) then - start(d,c) = 1 - else - start(d,c) = 0 - endif - if (cell_coord(d,c) .eq. ncells) then - end(d,c) = 1 - else - end(d,c) = 0 - endif - enddo - -c--------------------------------------------------------------------- -c zero the whole left hand side for starters -c--------------------------------------------------------------------- - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - do m = 1,5 - do n = 1, 5 - lhsc(m,n,i,j,k,c) = 0.0d0 - enddo - enddo - enddo - enddo - enddo - - enddo - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine lhsabinit(lhsa, lhsb, size) - implicit none - - integer size - double precision lhsa(5, 5, -1:size), lhsb(5, 5, -1:size) - - integer i, m, n - -c--------------------------------------------------------------------- -c next, set all diagonal values to 1. This is overkill, but convenient -c--------------------------------------------------------------------- - do i = 0, size - do m = 1, 5 - do n = 1, 5 - lhsa(m,n,i) = 0.0d0 - lhsb(m,n,i) = 0.0d0 - enddo - lhsb(m,m,i) = 1.0d0 - enddo - enddo - - return - end - - - diff --git a/examples/smpi/NAS/BT/inputbt.data.sample b/examples/smpi/NAS/BT/inputbt.data.sample deleted file mode 100644 index 776654e8d0..0000000000 --- a/examples/smpi/NAS/BT/inputbt.data.sample +++ /dev/null @@ -1,5 +0,0 @@ -200 number of time steps -0.0008d0 dt for class A = 0.0008d0. class B = 0.0003d0 class C = 0.0001d0 -64 64 64 -5 0 write interval (optional read interval) for BTIO -0 1000000 number of nodes in collective buffering and buffer size for BTIO diff --git a/examples/smpi/NAS/BT/make_set.f b/examples/smpi/NAS/BT/make_set.f deleted file mode 100644 index b8d90c65a4..0000000000 --- a/examples/smpi/NAS/BT/make_set.f +++ /dev/null @@ -1,124 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine make_set - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c This function allocates space for a set of cells and fills the set -c such that communication between cells on different nodes is only -c nearest neighbor -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - - integer p, i, j, c, dir, size, excess, ierr,ierrcode - -c--------------------------------------------------------------------- -c compute square root; add small number to allow for roundoff -c (note: this is computed in setup_mpi.f also, but prefer to do -c it twice because of some include file problems). -c--------------------------------------------------------------------- - ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0)) - -c--------------------------------------------------------------------- -c this makes coding easier -c--------------------------------------------------------------------- - p = ncells - -c--------------------------------------------------------------------- -c determine the location of the cell at the bottom of the 3D -c array of cells -c--------------------------------------------------------------------- - cell_coord(1,1) = mod(node,p) - cell_coord(2,1) = node/p - cell_coord(3,1) = 0 - -c--------------------------------------------------------------------- -c set the cell_coords for cells in the rest of the z-layers; -c this comes down to a simple linear numbering in the z-direct- -c ion, and to the doubly-cyclic numbering in the other dirs -c--------------------------------------------------------------------- - do c=2, p - cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p) - cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p) - cell_coord(3,c) = c-1 - end do - -c--------------------------------------------------------------------- -c offset all the coordinates by 1 to adjust for Fortran arrays -c--------------------------------------------------------------------- - do dir = 1, 3 - do c = 1, p - cell_coord(dir,c) = cell_coord(dir,c) + 1 - end do - end do - -c--------------------------------------------------------------------- -c slice(dir,n) contains the sequence number of the cell that is in -c coordinate plane n in the dir direction -c--------------------------------------------------------------------- - do dir = 1, 3 - do c = 1, p - slice(dir,cell_coord(dir,c)) = c - end do - end do - - -c--------------------------------------------------------------------- -c fill the predecessor and successor entries, using the indices -c of the bottom cells (they are the same at each level of k -c anyway) acting as if full periodicity pertains; note that p is -c added to those arguments to the mod functions that might -c otherwise return wrong values when using the modulo function -c--------------------------------------------------------------------- - i = cell_coord(1,1)-1 - j = cell_coord(2,1)-1 - - predecessor(1) = mod(i-1+p,p) + p*j - predecessor(2) = i + p*mod(j-1+p,p) - predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p) - successor(1) = mod(i+1,p) + p*j - successor(2) = i + p*mod(j+1,p) - successor(3) = mod(i-1+p,p) + p*mod(j+1,p) - -c--------------------------------------------------------------------- -c now compute the sizes of the cells -c--------------------------------------------------------------------- - do dir= 1, 3 -c--------------------------------------------------------------------- -c set cell_coord range for each direction -c--------------------------------------------------------------------- - size = grid_points(dir)/p - excess = mod(grid_points(dir),p) - do c=1, ncells - if (cell_coord(dir,c) .le. excess) then - cell_size(dir,c) = size+1 - cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1) - cell_high(dir,c) = cell_low(dir,c)+size - else - cell_size(dir,c) = size - cell_low(dir,c) = excess*(size+1)+ - > (cell_coord(dir,c)-excess-1)*size - cell_high(dir,c) = cell_low(dir,c)+size-1 - endif - if (cell_size(dir, c) .le. 2) then - write(*,50) - 50 format(' Error: Cell size too small. Min size is 3') - call MPI_Abort(mpi_comm_world,ierrcode,ierr) - stop - endif - end do - end do - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - diff --git a/examples/smpi/NAS/BT/mpinpb.h b/examples/smpi/NAS/BT/mpinpb.h deleted file mode 100644 index f621f08b64..0000000000 --- a/examples/smpi/NAS/BT/mpinpb.h +++ /dev/null @@ -1,12 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'mpif.h' - - integer node, no_nodes, total_nodes, root, comm_setup, - > comm_solve, comm_rhs, dp_type - logical active - common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup, - > comm_solve, comm_rhs, dp_type, active - diff --git a/examples/smpi/NAS/BT/rhs.f b/examples/smpi/NAS/BT/rhs.f deleted file mode 100644 index 89171a6741..0000000000 --- a/examples/smpi/NAS/BT/rhs.f +++ /dev/null @@ -1,425 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine compute_rhs - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - - integer c, i, j, k, m - double precision rho_inv, uijk, up1, um1, vijk, vp1, vm1, - > wijk, wp1, wm1 - - -c--------------------------------------------------------------------- -c loop over all cells owned by this node -c--------------------------------------------------------------------- - do c = 1, ncells - -c--------------------------------------------------------------------- -c compute the reciprocal of density, and the kinetic energy, -c and the speed of sound. -c--------------------------------------------------------------------- - do k = -1, cell_size(3,c) - do j = -1, cell_size(2,c) - do i = -1, cell_size(1,c) - rho_inv = 1.0d0/u(1,i,j,k,c) - rho_i(i,j,k,c) = rho_inv - us(i,j,k,c) = u(2,i,j,k,c) * rho_inv - vs(i,j,k,c) = u(3,i,j,k,c) * rho_inv - ws(i,j,k,c) = u(4,i,j,k,c) * rho_inv - square(i,j,k,c) = 0.5d0* ( - > u(2,i,j,k,c)*u(2,i,j,k,c) + - > u(3,i,j,k,c)*u(3,i,j,k,c) + - > u(4,i,j,k,c)*u(4,i,j,k,c) ) * rho_inv - qs(i,j,k,c) = square(i,j,k,c) * rho_inv - enddo - enddo - enddo - -c--------------------------------------------------------------------- -c copy the exact forcing term to the right hand side; because -c this forcing term is known, we can store it on the whole of every -c cell, including the boundary -c--------------------------------------------------------------------- - - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = forcing(m,i,j,k,c) - enddo - enddo - enddo - enddo - - -c--------------------------------------------------------------------- -c compute xi-direction fluxes -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - uijk = us(i,j,k,c) - up1 = us(i+1,j,k,c) - um1 = us(i-1,j,k,c) - - rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dx1tx1 * - > (u(1,i+1,j,k,c) - 2.0d0*u(1,i,j,k,c) + - > u(1,i-1,j,k,c)) - - > tx2 * (u(2,i+1,j,k,c) - u(2,i-1,j,k,c)) - - rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dx2tx1 * - > (u(2,i+1,j,k,c) - 2.0d0*u(2,i,j,k,c) + - > u(2,i-1,j,k,c)) + - > xxcon2*con43 * (up1 - 2.0d0*uijk + um1) - - > tx2 * (u(2,i+1,j,k,c)*up1 - - > u(2,i-1,j,k,c)*um1 + - > (u(5,i+1,j,k,c)- square(i+1,j,k,c)- - > u(5,i-1,j,k,c)+ square(i-1,j,k,c))* - > c2) - - rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dx3tx1 * - > (u(3,i+1,j,k,c) - 2.0d0*u(3,i,j,k,c) + - > u(3,i-1,j,k,c)) + - > xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) + - > vs(i-1,j,k,c)) - - > tx2 * (u(3,i+1,j,k,c)*up1 - - > u(3,i-1,j,k,c)*um1) - - rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dx4tx1 * - > (u(4,i+1,j,k,c) - 2.0d0*u(4,i,j,k,c) + - > u(4,i-1,j,k,c)) + - > xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) + - > ws(i-1,j,k,c)) - - > tx2 * (u(4,i+1,j,k,c)*up1 - - > u(4,i-1,j,k,c)*um1) - - rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dx5tx1 * - > (u(5,i+1,j,k,c) - 2.0d0*u(5,i,j,k,c) + - > u(5,i-1,j,k,c)) + - > xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) + - > qs(i-1,j,k,c)) + - > xxcon4 * (up1*up1 - 2.0d0*uijk*uijk + - > um1*um1) + - > xxcon5 * (u(5,i+1,j,k,c)*rho_i(i+1,j,k,c) - - > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) + - > u(5,i-1,j,k,c)*rho_i(i-1,j,k,c)) - - > tx2 * ( (c1*u(5,i+1,j,k,c) - - > c2*square(i+1,j,k,c))*up1 - - > (c1*u(5,i-1,j,k,c) - - > c2*square(i-1,j,k,c))*um1 ) - enddo - enddo - enddo - -c--------------------------------------------------------------------- -c add fourth order xi-direction dissipation -c--------------------------------------------------------------------- - if (start(1,c) .gt. 0) then - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - i = 1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * - > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) + - > u(m,i+2,j,k,c)) - enddo - - i = 2 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > (-4.0d0*u(m,i-1,j,k,c) + 6.0d0*u(m,i,j,k,c) - - > 4.0d0*u(m,i+1,j,k,c) + u(m,i+2,j,k,c)) - enddo - enddo - enddo - endif - - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > ( u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) + - > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) + - > u(m,i+2,j,k,c) ) - enddo - enddo - enddo - enddo - - - if (end(1,c) .gt. 0) then - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - i = cell_size(1,c)-3 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > ( u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) + - > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) ) - enddo - - i = cell_size(1,c)-2 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > ( u(m,i-2,j,k,c) - 4.d0*u(m,i-1,j,k,c) + - > 5.d0*u(m,i,j,k,c) ) - enddo - enddo - enddo - endif - -c--------------------------------------------------------------------- -c compute eta-direction fluxes -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - vijk = vs(i,j,k,c) - vp1 = vs(i,j+1,k,c) - vm1 = vs(i,j-1,k,c) - rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dy1ty1 * - > (u(1,i,j+1,k,c) - 2.0d0*u(1,i,j,k,c) + - > u(1,i,j-1,k,c)) - - > ty2 * (u(3,i,j+1,k,c) - u(3,i,j-1,k,c)) - rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dy2ty1 * - > (u(2,i,j+1,k,c) - 2.0d0*u(2,i,j,k,c) + - > u(2,i,j-1,k,c)) + - > yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) + - > us(i,j-1,k,c)) - - > ty2 * (u(2,i,j+1,k,c)*vp1 - - > u(2,i,j-1,k,c)*vm1) - rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dy3ty1 * - > (u(3,i,j+1,k,c) - 2.0d0*u(3,i,j,k,c) + - > u(3,i,j-1,k,c)) + - > yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) - - > ty2 * (u(3,i,j+1,k,c)*vp1 - - > u(3,i,j-1,k,c)*vm1 + - > (u(5,i,j+1,k,c) - square(i,j+1,k,c) - - > u(5,i,j-1,k,c) + square(i,j-1,k,c)) - > *c2) - rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dy4ty1 * - > (u(4,i,j+1,k,c) - 2.0d0*u(4,i,j,k,c) + - > u(4,i,j-1,k,c)) + - > yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) + - > ws(i,j-1,k,c)) - - > ty2 * (u(4,i,j+1,k,c)*vp1 - - > u(4,i,j-1,k,c)*vm1) - rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dy5ty1 * - > (u(5,i,j+1,k,c) - 2.0d0*u(5,i,j,k,c) + - > u(5,i,j-1,k,c)) + - > yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) + - > qs(i,j-1,k,c)) + - > yycon4 * (vp1*vp1 - 2.0d0*vijk*vijk + - > vm1*vm1) + - > yycon5 * (u(5,i,j+1,k,c)*rho_i(i,j+1,k,c) - - > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) + - > u(5,i,j-1,k,c)*rho_i(i,j-1,k,c)) - - > ty2 * ((c1*u(5,i,j+1,k,c) - - > c2*square(i,j+1,k,c)) * vp1 - - > (c1*u(5,i,j-1,k,c) - - > c2*square(i,j-1,k,c)) * vm1) - enddo - enddo - enddo - -c--------------------------------------------------------------------- -c add fourth order eta-direction dissipation -c--------------------------------------------------------------------- - if (start(2,c) .gt. 0) then - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - j = 1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * - > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) + - > u(m,i,j+2,k,c)) - enddo - enddo - - j = 2 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > (-4.0d0*u(m,i,j-1,k,c) + 6.0d0*u(m,i,j,k,c) - - > 4.0d0*u(m,i,j+1,k,c) + u(m,i,j+2,k,c)) - enddo - enddo - enddo - endif - - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1 - do i = start(1,c),cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > ( u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) + - > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) + - > u(m,i,j+2,k,c) ) - enddo - enddo - enddo - enddo - - if (end(2,c) .gt. 0) then - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - j = cell_size(2,c)-3 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > ( u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) + - > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) ) - enddo - enddo - - j = cell_size(2,c)-2 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > ( u(m,i,j-2,k,c) - 4.d0*u(m,i,j-1,k,c) + - > 5.d0*u(m,i,j,k,c) ) - enddo - enddo - enddo - endif - -c--------------------------------------------------------------------- -c compute zeta-direction fluxes -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - wijk = ws(i,j,k,c) - wp1 = ws(i,j,k+1,c) - wm1 = ws(i,j,k-1,c) - - rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dz1tz1 * - > (u(1,i,j,k+1,c) - 2.0d0*u(1,i,j,k,c) + - > u(1,i,j,k-1,c)) - - > tz2 * (u(4,i,j,k+1,c) - u(4,i,j,k-1,c)) - rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dz2tz1 * - > (u(2,i,j,k+1,c) - 2.0d0*u(2,i,j,k,c) + - > u(2,i,j,k-1,c)) + - > zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) + - > us(i,j,k-1,c)) - - > tz2 * (u(2,i,j,k+1,c)*wp1 - - > u(2,i,j,k-1,c)*wm1) - rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dz3tz1 * - > (u(3,i,j,k+1,c) - 2.0d0*u(3,i,j,k,c) + - > u(3,i,j,k-1,c)) + - > zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) + - > vs(i,j,k-1,c)) - - > tz2 * (u(3,i,j,k+1,c)*wp1 - - > u(3,i,j,k-1,c)*wm1) - rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dz4tz1 * - > (u(4,i,j,k+1,c) - 2.0d0*u(4,i,j,k,c) + - > u(4,i,j,k-1,c)) + - > zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) - - > tz2 * (u(4,i,j,k+1,c)*wp1 - - > u(4,i,j,k-1,c)*wm1 + - > (u(5,i,j,k+1,c) - square(i,j,k+1,c) - - > u(5,i,j,k-1,c) + square(i,j,k-1,c)) - > *c2) - rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dz5tz1 * - > (u(5,i,j,k+1,c) - 2.0d0*u(5,i,j,k,c) + - > u(5,i,j,k-1,c)) + - > zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) + - > qs(i,j,k-1,c)) + - > zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk + - > wm1*wm1) + - > zzcon5 * (u(5,i,j,k+1,c)*rho_i(i,j,k+1,c) - - > 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) + - > u(5,i,j,k-1,c)*rho_i(i,j,k-1,c)) - - > tz2 * ( (c1*u(5,i,j,k+1,c) - - > c2*square(i,j,k+1,c))*wp1 - - > (c1*u(5,i,j,k-1,c) - - > c2*square(i,j,k-1,c))*wm1) - enddo - enddo - enddo - -c--------------------------------------------------------------------- -c add fourth order zeta-direction dissipation -c--------------------------------------------------------------------- - if (start(3,c) .gt. 0) then - k = 1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * - > ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) + - > u(m,i,j,k+2,c)) - enddo - enddo - enddo - - k = 2 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > (-4.0d0*u(m,i,j,k-1,c) + 6.0d0*u(m,i,j,k,c) - - > 4.0d0*u(m,i,j,k+1,c) + u(m,i,j,k+2,c)) - enddo - enddo - enddo - endif - - do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c),cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > ( u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) + - > 6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) + - > u(m,i,j,k+2,c) ) - enddo - enddo - enddo - enddo - - if (end(3,c) .gt. 0) then - k = cell_size(3,c)-3 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > ( u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) + - > 6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) ) - enddo - enddo - enddo - - k = cell_size(3,c)-2 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * - > ( u(m,i,j,k-2,c) - 4.d0*u(m,i,j,k-1,c) + - > 5.d0*u(m,i,j,k,c) ) - enddo - enddo - enddo - endif - - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) * dt - enddo - enddo - enddo - enddo - - enddo - - return - end - - - - diff --git a/examples/smpi/NAS/BT/set_constants.f b/examples/smpi/NAS/BT/set_constants.f deleted file mode 100644 index 81397d4bcf..0000000000 --- a/examples/smpi/NAS/BT/set_constants.f +++ /dev/null @@ -1,202 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine set_constants - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - - ce(1,1) = 2.0d0 - ce(1,2) = 0.0d0 - ce(1,3) = 0.0d0 - ce(1,4) = 4.0d0 - ce(1,5) = 5.0d0 - ce(1,6) = 3.0d0 - ce(1,7) = 0.5d0 - ce(1,8) = 0.02d0 - ce(1,9) = 0.01d0 - ce(1,10) = 0.03d0 - ce(1,11) = 0.5d0 - ce(1,12) = 0.4d0 - ce(1,13) = 0.3d0 - - ce(2,1) = 1.0d0 - ce(2,2) = 0.0d0 - ce(2,3) = 0.0d0 - ce(2,4) = 0.0d0 - ce(2,5) = 1.0d0 - ce(2,6) = 2.0d0 - ce(2,7) = 3.0d0 - ce(2,8) = 0.01d0 - ce(2,9) = 0.03d0 - ce(2,10) = 0.02d0 - ce(2,11) = 0.4d0 - ce(2,12) = 0.3d0 - ce(2,13) = 0.5d0 - - ce(3,1) = 2.0d0 - ce(3,2) = 2.0d0 - ce(3,3) = 0.0d0 - ce(3,4) = 0.0d0 - ce(3,5) = 0.0d0 - ce(3,6) = 2.0d0 - ce(3,7) = 3.0d0 - ce(3,8) = 0.04d0 - ce(3,9) = 0.03d0 - ce(3,10) = 0.05d0 - ce(3,11) = 0.3d0 - ce(3,12) = 0.5d0 - ce(3,13) = 0.4d0 - - ce(4,1) = 2.0d0 - ce(4,2) = 2.0d0 - ce(4,3) = 0.0d0 - ce(4,4) = 0.0d0 - ce(4,5) = 0.0d0 - ce(4,6) = 2.0d0 - ce(4,7) = 3.0d0 - ce(4,8) = 0.03d0 - ce(4,9) = 0.05d0 - ce(4,10) = 0.04d0 - ce(4,11) = 0.2d0 - ce(4,12) = 0.1d0 - ce(4,13) = 0.3d0 - - ce(5,1) = 5.0d0 - ce(5,2) = 4.0d0 - ce(5,3) = 3.0d0 - ce(5,4) = 2.0d0 - ce(5,5) = 0.1d0 - ce(5,6) = 0.4d0 - ce(5,7) = 0.3d0 - ce(5,8) = 0.05d0 - ce(5,9) = 0.04d0 - ce(5,10) = 0.03d0 - ce(5,11) = 0.1d0 - ce(5,12) = 0.3d0 - ce(5,13) = 0.2d0 - - c1 = 1.4d0 - c2 = 0.4d0 - c3 = 0.1d0 - c4 = 1.0d0 - c5 = 1.4d0 - - bt = dsqrt(0.5d0) - - dnxm1 = 1.0d0 / dble(grid_points(1)-1) - dnym1 = 1.0d0 / dble(grid_points(2)-1) - dnzm1 = 1.0d0 / dble(grid_points(3)-1) - - c1c2 = c1 * c2 - c1c5 = c1 * c5 - c3c4 = c3 * c4 - c1345 = c1c5 * c3c4 - - conz1 = (1.0d0-c1c5) - - tx1 = 1.0d0 / (dnxm1 * dnxm1) - tx2 = 1.0d0 / (2.0d0 * dnxm1) - tx3 = 1.0d0 / dnxm1 - - ty1 = 1.0d0 / (dnym1 * dnym1) - ty2 = 1.0d0 / (2.0d0 * dnym1) - ty3 = 1.0d0 / dnym1 - - tz1 = 1.0d0 / (dnzm1 * dnzm1) - tz2 = 1.0d0 / (2.0d0 * dnzm1) - tz3 = 1.0d0 / dnzm1 - - dx1 = 0.75d0 - dx2 = 0.75d0 - dx3 = 0.75d0 - dx4 = 0.75d0 - dx5 = 0.75d0 - - dy1 = 0.75d0 - dy2 = 0.75d0 - dy3 = 0.75d0 - dy4 = 0.75d0 - dy5 = 0.75d0 - - dz1 = 1.0d0 - dz2 = 1.0d0 - dz3 = 1.0d0 - dz4 = 1.0d0 - dz5 = 1.0d0 - - dxmax = dmax1(dx3, dx4) - dymax = dmax1(dy2, dy4) - dzmax = dmax1(dz2, dz3) - - dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) ) - - c4dssp = 4.0d0 * dssp - c5dssp = 5.0d0 * dssp - - dttx1 = dt*tx1 - dttx2 = dt*tx2 - dtty1 = dt*ty1 - dtty2 = dt*ty2 - dttz1 = dt*tz1 - dttz2 = dt*tz2 - - c2dttx1 = 2.0d0*dttx1 - c2dtty1 = 2.0d0*dtty1 - c2dttz1 = 2.0d0*dttz1 - - dtdssp = dt*dssp - - comz1 = dtdssp - comz4 = 4.0d0*dtdssp - comz5 = 5.0d0*dtdssp - comz6 = 6.0d0*dtdssp - - c3c4tx3 = c3c4*tx3 - c3c4ty3 = c3c4*ty3 - c3c4tz3 = c3c4*tz3 - - dx1tx1 = dx1*tx1 - dx2tx1 = dx2*tx1 - dx3tx1 = dx3*tx1 - dx4tx1 = dx4*tx1 - dx5tx1 = dx5*tx1 - - dy1ty1 = dy1*ty1 - dy2ty1 = dy2*ty1 - dy3ty1 = dy3*ty1 - dy4ty1 = dy4*ty1 - dy5ty1 = dy5*ty1 - - dz1tz1 = dz1*tz1 - dz2tz1 = dz2*tz1 - dz3tz1 = dz3*tz1 - dz4tz1 = dz4*tz1 - dz5tz1 = dz5*tz1 - - c2iv = 2.5d0 - con43 = 4.0d0/3.0d0 - con16 = 1.0d0/6.0d0 - - xxcon1 = c3c4tx3*con43*tx3 - xxcon2 = c3c4tx3*tx3 - xxcon3 = c3c4tx3*conz1*tx3 - xxcon4 = c3c4tx3*con16*tx3 - xxcon5 = c3c4tx3*c1c5*tx3 - - yycon1 = c3c4ty3*con43*ty3 - yycon2 = c3c4ty3*ty3 - yycon3 = c3c4ty3*conz1*ty3 - yycon4 = c3c4ty3*con16*ty3 - yycon5 = c3c4ty3*c1c5*ty3 - - zzcon1 = c3c4tz3*con43*tz3 - zzcon2 = c3c4tz3*tz3 - zzcon3 = c3c4tz3*conz1*tz3 - zzcon4 = c3c4tz3*con16*tz3 - zzcon5 = c3c4tz3*c1c5*tz3 - - return - end diff --git a/examples/smpi/NAS/BT/setup_mpi.f b/examples/smpi/NAS/BT/setup_mpi.f deleted file mode 100644 index 987c6bfba4..0000000000 --- a/examples/smpi/NAS/BT/setup_mpi.f +++ /dev/null @@ -1,64 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine setup_mpi - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c set up MPI stuff -c--------------------------------------------------------------------- - - implicit none - include 'mpinpb.h' - include 'npbparams.h' - integer error, color, nc - - call mpi_init(error) - - call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error) - call mpi_comm_rank(MPI_COMM_WORLD, node, error) - - if (.not. convertdouble) then - dp_type = MPI_DOUBLE_PRECISION - else - dp_type = MPI_REAL - endif - -c--------------------------------------------------------------------- -c compute square root; add small number to allow for roundoff -c--------------------------------------------------------------------- - nc = dint(dsqrt(dble(total_nodes) + 0.00001d0)) - -c--------------------------------------------------------------------- -c We handle a non-square number of nodes by making the excess nodes -c inactive. However, we can never handle more cells than were compiled -c in. -c--------------------------------------------------------------------- - - if (nc .gt. maxcells) nc = maxcells - if (node .ge. nc*nc) then - active = .false. - color = 1 - else - active = .true. - color = 0 - end if - - call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error) - if (.not. active) return - - call mpi_comm_size(comm_setup, no_nodes, error) - call mpi_comm_dup(comm_setup, comm_solve, error) - call mpi_comm_dup(comm_setup, comm_rhs, error) - -c--------------------------------------------------------------------- -c let node 0 be the root for the group (there is only one) -c--------------------------------------------------------------------- - root = 0 - - return - end - diff --git a/examples/smpi/NAS/BT/simple_mpiio.f b/examples/smpi/NAS/BT/simple_mpiio.f deleted file mode 100644 index 02e2700177..0000000000 --- a/examples/smpi/NAS/BT/simple_mpiio.f +++ /dev/null @@ -1,213 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine setup_btio - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer m, ierr - - iseek=0 - - if (node .eq. root) then - call MPI_File_delete(filenm, MPI_INFO_NULL, ierr) - endif - - call MPI_Barrier(comm_solve, ierr) - - call MPI_File_open(comm_solve, - $ filenm, - $ MPI_MODE_RDWR + MPI_MODE_CREATE, - $ MPI_INFO_NULL, - $ fp, - $ ierr) - - call MPI_File_set_view(fp, - $ iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION, - $ 'native', MPI_INFO_NULL, ierr) - - if (ierr .ne. MPI_SUCCESS) then - print *, 'Error opening file' - stop - endif - - do m = 1, 5 - xce_sub(m) = 0.d0 - end do - - idump_sub = 0 - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine output_timestep - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - include 'header.h' - include 'mpinpb.h' - - integer count, jio, kio, cio, aio - integer ierr - integer mstatus(MPI_STATUS_SIZE) - - do cio=1,ncells - do kio=0, cell_size(3,cio)-1 - do jio=0, cell_size(2,cio)-1 - iseek=5*(cell_low(1,cio) + - $ PROBLEM_SIZE*((cell_low(2,cio)+jio) + - $ PROBLEM_SIZE*((cell_low(3,cio)+kio) + - $ PROBLEM_SIZE*idump_sub))) - - count=5*cell_size(1,cio) - - call MPI_File_write_at(fp, iseek, - $ u(1,0,jio,kio,cio), - $ count, MPI_DOUBLE_PRECISION, - $ mstatus, ierr) - - if (ierr .ne. MPI_SUCCESS) then - print *, 'Error writing to file' - stop - endif - enddo - enddo - enddo - - idump_sub = idump_sub + 1 - if (rd_interval .gt. 0) then - if (idump_sub .ge. rd_interval) then - - call acc_sub_norms(idump+1) - - idump_sub = 0 - endif - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine acc_sub_norms(idump_cur) - - include 'header.h' - include 'mpinpb.h' - - integer idump_cur - - integer count, jio, kio, cio, ii, m, ichunk - integer ierr - integer mstatus(MPI_STATUS_SIZE) - double precision xce_single(5) - - ichunk = idump_cur - idump_sub + 1 - do ii=0, idump_sub-1 - do cio=1,ncells - do kio=0, cell_size(3,cio)-1 - do jio=0, cell_size(2,cio)-1 - iseek=5*(cell_low(1,cio) + - $ PROBLEM_SIZE*((cell_low(2,cio)+jio) + - $ PROBLEM_SIZE*((cell_low(3,cio)+kio) + - $ PROBLEM_SIZE*ii))) - - count=5*cell_size(1,cio) - - call MPI_File_read_at(fp, iseek, - $ u(1,0,jio,kio,cio), - $ count, MPI_DOUBLE_PRECISION, - $ mstatus, ierr) - - if (ierr .ne. MPI_SUCCESS) then - print *, 'Error reading back file' - call MPI_File_close(fp, ierr) - stop - endif - enddo - enddo - enddo - - if (node .eq. root) print *, 'Reading data set ', ii+ichunk - - call error_norm(xce_single) - do m = 1, 5 - xce_sub(m) = xce_sub(m) + xce_single(m) - end do - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine btio_cleanup - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer ierr - - call MPI_File_close(fp, ierr) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine accumulate_norms(xce_acc) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - double precision xce_acc(5) - integer m, ierr - - if (rd_interval .gt. 0) goto 20 - - call MPI_File_open(comm_solve, - $ filenm, - $ MPI_MODE_RDONLY, - $ MPI_INFO_NULL, - $ fp, - $ ierr) - - iseek = 0 - call MPI_File_set_view(fp, - $ iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION, - $ 'native', MPI_INFO_NULL, ierr) - -c clear the last time step - - call clear_timestep - -c read back the time steps and accumulate norms - - call acc_sub_norms(idump) - - call MPI_File_close(fp, ierr) - - 20 continue - do m = 1, 5 - xce_acc(m) = xce_sub(m) / dble(idump) - end do - - return - end - diff --git a/examples/smpi/NAS/BT/solve_subs.f b/examples/smpi/NAS/BT/solve_subs.f deleted file mode 100644 index 351489a313..0000000000 --- a/examples/smpi/NAS/BT/solve_subs.f +++ /dev/null @@ -1,642 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine matvec_sub(ablock,avec,bvec) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c subtracts bvec=bvec - ablock*avec -c--------------------------------------------------------------------- - - implicit none - - double precision ablock,avec,bvec - dimension ablock(5,5),avec(5),bvec(5) - -c--------------------------------------------------------------------- -c rhs(i,ic,jc,kc,ccell) = rhs(i,ic,jc,kc,ccell) -c $ - lhs(i,1,ablock,ia,ja,ka,acell)* -c--------------------------------------------------------------------- - bvec(1) = bvec(1) - ablock(1,1)*avec(1) - > - ablock(1,2)*avec(2) - > - ablock(1,3)*avec(3) - > - ablock(1,4)*avec(4) - > - ablock(1,5)*avec(5) - bvec(2) = bvec(2) - ablock(2,1)*avec(1) - > - ablock(2,2)*avec(2) - > - ablock(2,3)*avec(3) - > - ablock(2,4)*avec(4) - > - ablock(2,5)*avec(5) - bvec(3) = bvec(3) - ablock(3,1)*avec(1) - > - ablock(3,2)*avec(2) - > - ablock(3,3)*avec(3) - > - ablock(3,4)*avec(4) - > - ablock(3,5)*avec(5) - bvec(4) = bvec(4) - ablock(4,1)*avec(1) - > - ablock(4,2)*avec(2) - > - ablock(4,3)*avec(3) - > - ablock(4,4)*avec(4) - > - ablock(4,5)*avec(5) - bvec(5) = bvec(5) - ablock(5,1)*avec(1) - > - ablock(5,2)*avec(2) - > - ablock(5,3)*avec(3) - > - ablock(5,4)*avec(4) - > - ablock(5,5)*avec(5) - - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine matmul_sub(ablock, bblock, cblock) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c subtracts a(i,j,k) X b(i,j,k) from c(i,j,k) -c--------------------------------------------------------------------- - - implicit none - - double precision ablock, bblock, cblock - dimension ablock(5,5), bblock(5,5), cblock(5,5) - - - cblock(1,1) = cblock(1,1) - ablock(1,1)*bblock(1,1) - > - ablock(1,2)*bblock(2,1) - > - ablock(1,3)*bblock(3,1) - > - ablock(1,4)*bblock(4,1) - > - ablock(1,5)*bblock(5,1) - cblock(2,1) = cblock(2,1) - ablock(2,1)*bblock(1,1) - > - ablock(2,2)*bblock(2,1) - > - ablock(2,3)*bblock(3,1) - > - ablock(2,4)*bblock(4,1) - > - ablock(2,5)*bblock(5,1) - cblock(3,1) = cblock(3,1) - ablock(3,1)*bblock(1,1) - > - ablock(3,2)*bblock(2,1) - > - ablock(3,3)*bblock(3,1) - > - ablock(3,4)*bblock(4,1) - > - ablock(3,5)*bblock(5,1) - cblock(4,1) = cblock(4,1) - ablock(4,1)*bblock(1,1) - > - ablock(4,2)*bblock(2,1) - > - ablock(4,3)*bblock(3,1) - > - ablock(4,4)*bblock(4,1) - > - ablock(4,5)*bblock(5,1) - cblock(5,1) = cblock(5,1) - ablock(5,1)*bblock(1,1) - > - ablock(5,2)*bblock(2,1) - > - ablock(5,3)*bblock(3,1) - > - ablock(5,4)*bblock(4,1) - > - ablock(5,5)*bblock(5,1) - cblock(1,2) = cblock(1,2) - ablock(1,1)*bblock(1,2) - > - ablock(1,2)*bblock(2,2) - > - ablock(1,3)*bblock(3,2) - > - ablock(1,4)*bblock(4,2) - > - ablock(1,5)*bblock(5,2) - cblock(2,2) = cblock(2,2) - ablock(2,1)*bblock(1,2) - > - ablock(2,2)*bblock(2,2) - > - ablock(2,3)*bblock(3,2) - > - ablock(2,4)*bblock(4,2) - > - ablock(2,5)*bblock(5,2) - cblock(3,2) = cblock(3,2) - ablock(3,1)*bblock(1,2) - > - ablock(3,2)*bblock(2,2) - > - ablock(3,3)*bblock(3,2) - > - ablock(3,4)*bblock(4,2) - > - ablock(3,5)*bblock(5,2) - cblock(4,2) = cblock(4,2) - ablock(4,1)*bblock(1,2) - > - ablock(4,2)*bblock(2,2) - > - ablock(4,3)*bblock(3,2) - > - ablock(4,4)*bblock(4,2) - > - ablock(4,5)*bblock(5,2) - cblock(5,2) = cblock(5,2) - ablock(5,1)*bblock(1,2) - > - ablock(5,2)*bblock(2,2) - > - ablock(5,3)*bblock(3,2) - > - ablock(5,4)*bblock(4,2) - > - ablock(5,5)*bblock(5,2) - cblock(1,3) = cblock(1,3) - ablock(1,1)*bblock(1,3) - > - ablock(1,2)*bblock(2,3) - > - ablock(1,3)*bblock(3,3) - > - ablock(1,4)*bblock(4,3) - > - ablock(1,5)*bblock(5,3) - cblock(2,3) = cblock(2,3) - ablock(2,1)*bblock(1,3) - > - ablock(2,2)*bblock(2,3) - > - ablock(2,3)*bblock(3,3) - > - ablock(2,4)*bblock(4,3) - > - ablock(2,5)*bblock(5,3) - cblock(3,3) = cblock(3,3) - ablock(3,1)*bblock(1,3) - > - ablock(3,2)*bblock(2,3) - > - ablock(3,3)*bblock(3,3) - > - ablock(3,4)*bblock(4,3) - > - ablock(3,5)*bblock(5,3) - cblock(4,3) = cblock(4,3) - ablock(4,1)*bblock(1,3) - > - ablock(4,2)*bblock(2,3) - > - ablock(4,3)*bblock(3,3) - > - ablock(4,4)*bblock(4,3) - > - ablock(4,5)*bblock(5,3) - cblock(5,3) = cblock(5,3) - ablock(5,1)*bblock(1,3) - > - ablock(5,2)*bblock(2,3) - > - ablock(5,3)*bblock(3,3) - > - ablock(5,4)*bblock(4,3) - > - ablock(5,5)*bblock(5,3) - cblock(1,4) = cblock(1,4) - ablock(1,1)*bblock(1,4) - > - ablock(1,2)*bblock(2,4) - > - ablock(1,3)*bblock(3,4) - > - ablock(1,4)*bblock(4,4) - > - ablock(1,5)*bblock(5,4) - cblock(2,4) = cblock(2,4) - ablock(2,1)*bblock(1,4) - > - ablock(2,2)*bblock(2,4) - > - ablock(2,3)*bblock(3,4) - > - ablock(2,4)*bblock(4,4) - > - ablock(2,5)*bblock(5,4) - cblock(3,4) = cblock(3,4) - ablock(3,1)*bblock(1,4) - > - ablock(3,2)*bblock(2,4) - > - ablock(3,3)*bblock(3,4) - > - ablock(3,4)*bblock(4,4) - > - ablock(3,5)*bblock(5,4) - cblock(4,4) = cblock(4,4) - ablock(4,1)*bblock(1,4) - > - ablock(4,2)*bblock(2,4) - > - ablock(4,3)*bblock(3,4) - > - ablock(4,4)*bblock(4,4) - > - ablock(4,5)*bblock(5,4) - cblock(5,4) = cblock(5,4) - ablock(5,1)*bblock(1,4) - > - ablock(5,2)*bblock(2,4) - > - ablock(5,3)*bblock(3,4) - > - ablock(5,4)*bblock(4,4) - > - ablock(5,5)*bblock(5,4) - cblock(1,5) = cblock(1,5) - ablock(1,1)*bblock(1,5) - > - ablock(1,2)*bblock(2,5) - > - ablock(1,3)*bblock(3,5) - > - ablock(1,4)*bblock(4,5) - > - ablock(1,5)*bblock(5,5) - cblock(2,5) = cblock(2,5) - ablock(2,1)*bblock(1,5) - > - ablock(2,2)*bblock(2,5) - > - ablock(2,3)*bblock(3,5) - > - ablock(2,4)*bblock(4,5) - > - ablock(2,5)*bblock(5,5) - cblock(3,5) = cblock(3,5) - ablock(3,1)*bblock(1,5) - > - ablock(3,2)*bblock(2,5) - > - ablock(3,3)*bblock(3,5) - > - ablock(3,4)*bblock(4,5) - > - ablock(3,5)*bblock(5,5) - cblock(4,5) = cblock(4,5) - ablock(4,1)*bblock(1,5) - > - ablock(4,2)*bblock(2,5) - > - ablock(4,3)*bblock(3,5) - > - ablock(4,4)*bblock(4,5) - > - ablock(4,5)*bblock(5,5) - cblock(5,5) = cblock(5,5) - ablock(5,1)*bblock(1,5) - > - ablock(5,2)*bblock(2,5) - > - ablock(5,3)*bblock(3,5) - > - ablock(5,4)*bblock(4,5) - > - ablock(5,5)*bblock(5,5) - - - return - end - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine binvcrhs( lhs,c,r ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c--------------------------------------------------------------------- - - implicit none - - double precision pivot, coeff, lhs - dimension lhs(5,5) - double precision c(5,5), r(5) - -c--------------------------------------------------------------------- -c -c--------------------------------------------------------------------- - - pivot = 1.00d0/lhs(1,1) - lhs(1,2) = lhs(1,2)*pivot - lhs(1,3) = lhs(1,3)*pivot - lhs(1,4) = lhs(1,4)*pivot - lhs(1,5) = lhs(1,5)*pivot - c(1,1) = c(1,1)*pivot - c(1,2) = c(1,2)*pivot - c(1,3) = c(1,3)*pivot - c(1,4) = c(1,4)*pivot - c(1,5) = c(1,5)*pivot - r(1) = r(1) *pivot - - coeff = lhs(2,1) - lhs(2,2)= lhs(2,2) - coeff*lhs(1,2) - lhs(2,3)= lhs(2,3) - coeff*lhs(1,3) - lhs(2,4)= lhs(2,4) - coeff*lhs(1,4) - lhs(2,5)= lhs(2,5) - coeff*lhs(1,5) - c(2,1) = c(2,1) - coeff*c(1,1) - c(2,2) = c(2,2) - coeff*c(1,2) - c(2,3) = c(2,3) - coeff*c(1,3) - c(2,4) = c(2,4) - coeff*c(1,4) - c(2,5) = c(2,5) - coeff*c(1,5) - r(2) = r(2) - coeff*r(1) - - coeff = lhs(3,1) - lhs(3,2)= lhs(3,2) - coeff*lhs(1,2) - lhs(3,3)= lhs(3,3) - coeff*lhs(1,3) - lhs(3,4)= lhs(3,4) - coeff*lhs(1,4) - lhs(3,5)= lhs(3,5) - coeff*lhs(1,5) - c(3,1) = c(3,1) - coeff*c(1,1) - c(3,2) = c(3,2) - coeff*c(1,2) - c(3,3) = c(3,3) - coeff*c(1,3) - c(3,4) = c(3,4) - coeff*c(1,4) - c(3,5) = c(3,5) - coeff*c(1,5) - r(3) = r(3) - coeff*r(1) - - coeff = lhs(4,1) - lhs(4,2)= lhs(4,2) - coeff*lhs(1,2) - lhs(4,3)= lhs(4,3) - coeff*lhs(1,3) - lhs(4,4)= lhs(4,4) - coeff*lhs(1,4) - lhs(4,5)= lhs(4,5) - coeff*lhs(1,5) - c(4,1) = c(4,1) - coeff*c(1,1) - c(4,2) = c(4,2) - coeff*c(1,2) - c(4,3) = c(4,3) - coeff*c(1,3) - c(4,4) = c(4,4) - coeff*c(1,4) - c(4,5) = c(4,5) - coeff*c(1,5) - r(4) = r(4) - coeff*r(1) - - coeff = lhs(5,1) - lhs(5,2)= lhs(5,2) - coeff*lhs(1,2) - lhs(5,3)= lhs(5,3) - coeff*lhs(1,3) - lhs(5,4)= lhs(5,4) - coeff*lhs(1,4) - lhs(5,5)= lhs(5,5) - coeff*lhs(1,5) - c(5,1) = c(5,1) - coeff*c(1,1) - c(5,2) = c(5,2) - coeff*c(1,2) - c(5,3) = c(5,3) - coeff*c(1,3) - c(5,4) = c(5,4) - coeff*c(1,4) - c(5,5) = c(5,5) - coeff*c(1,5) - r(5) = r(5) - coeff*r(1) - - - pivot = 1.00d0/lhs(2,2) - lhs(2,3) = lhs(2,3)*pivot - lhs(2,4) = lhs(2,4)*pivot - lhs(2,5) = lhs(2,5)*pivot - c(2,1) = c(2,1)*pivot - c(2,2) = c(2,2)*pivot - c(2,3) = c(2,3)*pivot - c(2,4) = c(2,4)*pivot - c(2,5) = c(2,5)*pivot - r(2) = r(2) *pivot - - coeff = lhs(1,2) - lhs(1,3)= lhs(1,3) - coeff*lhs(2,3) - lhs(1,4)= lhs(1,4) - coeff*lhs(2,4) - lhs(1,5)= lhs(1,5) - coeff*lhs(2,5) - c(1,1) = c(1,1) - coeff*c(2,1) - c(1,2) = c(1,2) - coeff*c(2,2) - c(1,3) = c(1,3) - coeff*c(2,3) - c(1,4) = c(1,4) - coeff*c(2,4) - c(1,5) = c(1,5) - coeff*c(2,5) - r(1) = r(1) - coeff*r(2) - - coeff = lhs(3,2) - lhs(3,3)= lhs(3,3) - coeff*lhs(2,3) - lhs(3,4)= lhs(3,4) - coeff*lhs(2,4) - lhs(3,5)= lhs(3,5) - coeff*lhs(2,5) - c(3,1) = c(3,1) - coeff*c(2,1) - c(3,2) = c(3,2) - coeff*c(2,2) - c(3,3) = c(3,3) - coeff*c(2,3) - c(3,4) = c(3,4) - coeff*c(2,4) - c(3,5) = c(3,5) - coeff*c(2,5) - r(3) = r(3) - coeff*r(2) - - coeff = lhs(4,2) - lhs(4,3)= lhs(4,3) - coeff*lhs(2,3) - lhs(4,4)= lhs(4,4) - coeff*lhs(2,4) - lhs(4,5)= lhs(4,5) - coeff*lhs(2,5) - c(4,1) = c(4,1) - coeff*c(2,1) - c(4,2) = c(4,2) - coeff*c(2,2) - c(4,3) = c(4,3) - coeff*c(2,3) - c(4,4) = c(4,4) - coeff*c(2,4) - c(4,5) = c(4,5) - coeff*c(2,5) - r(4) = r(4) - coeff*r(2) - - coeff = lhs(5,2) - lhs(5,3)= lhs(5,3) - coeff*lhs(2,3) - lhs(5,4)= lhs(5,4) - coeff*lhs(2,4) - lhs(5,5)= lhs(5,5) - coeff*lhs(2,5) - c(5,1) = c(5,1) - coeff*c(2,1) - c(5,2) = c(5,2) - coeff*c(2,2) - c(5,3) = c(5,3) - coeff*c(2,3) - c(5,4) = c(5,4) - coeff*c(2,4) - c(5,5) = c(5,5) - coeff*c(2,5) - r(5) = r(5) - coeff*r(2) - - - pivot = 1.00d0/lhs(3,3) - lhs(3,4) = lhs(3,4)*pivot - lhs(3,5) = lhs(3,5)*pivot - c(3,1) = c(3,1)*pivot - c(3,2) = c(3,2)*pivot - c(3,3) = c(3,3)*pivot - c(3,4) = c(3,4)*pivot - c(3,5) = c(3,5)*pivot - r(3) = r(3) *pivot - - coeff = lhs(1,3) - lhs(1,4)= lhs(1,4) - coeff*lhs(3,4) - lhs(1,5)= lhs(1,5) - coeff*lhs(3,5) - c(1,1) = c(1,1) - coeff*c(3,1) - c(1,2) = c(1,2) - coeff*c(3,2) - c(1,3) = c(1,3) - coeff*c(3,3) - c(1,4) = c(1,4) - coeff*c(3,4) - c(1,5) = c(1,5) - coeff*c(3,5) - r(1) = r(1) - coeff*r(3) - - coeff = lhs(2,3) - lhs(2,4)= lhs(2,4) - coeff*lhs(3,4) - lhs(2,5)= lhs(2,5) - coeff*lhs(3,5) - c(2,1) = c(2,1) - coeff*c(3,1) - c(2,2) = c(2,2) - coeff*c(3,2) - c(2,3) = c(2,3) - coeff*c(3,3) - c(2,4) = c(2,4) - coeff*c(3,4) - c(2,5) = c(2,5) - coeff*c(3,5) - r(2) = r(2) - coeff*r(3) - - coeff = lhs(4,3) - lhs(4,4)= lhs(4,4) - coeff*lhs(3,4) - lhs(4,5)= lhs(4,5) - coeff*lhs(3,5) - c(4,1) = c(4,1) - coeff*c(3,1) - c(4,2) = c(4,2) - coeff*c(3,2) - c(4,3) = c(4,3) - coeff*c(3,3) - c(4,4) = c(4,4) - coeff*c(3,4) - c(4,5) = c(4,5) - coeff*c(3,5) - r(4) = r(4) - coeff*r(3) - - coeff = lhs(5,3) - lhs(5,4)= lhs(5,4) - coeff*lhs(3,4) - lhs(5,5)= lhs(5,5) - coeff*lhs(3,5) - c(5,1) = c(5,1) - coeff*c(3,1) - c(5,2) = c(5,2) - coeff*c(3,2) - c(5,3) = c(5,3) - coeff*c(3,3) - c(5,4) = c(5,4) - coeff*c(3,4) - c(5,5) = c(5,5) - coeff*c(3,5) - r(5) = r(5) - coeff*r(3) - - - pivot = 1.00d0/lhs(4,4) - lhs(4,5) = lhs(4,5)*pivot - c(4,1) = c(4,1)*pivot - c(4,2) = c(4,2)*pivot - c(4,3) = c(4,3)*pivot - c(4,4) = c(4,4)*pivot - c(4,5) = c(4,5)*pivot - r(4) = r(4) *pivot - - coeff = lhs(1,4) - lhs(1,5)= lhs(1,5) - coeff*lhs(4,5) - c(1,1) = c(1,1) - coeff*c(4,1) - c(1,2) = c(1,2) - coeff*c(4,2) - c(1,3) = c(1,3) - coeff*c(4,3) - c(1,4) = c(1,4) - coeff*c(4,4) - c(1,5) = c(1,5) - coeff*c(4,5) - r(1) = r(1) - coeff*r(4) - - coeff = lhs(2,4) - lhs(2,5)= lhs(2,5) - coeff*lhs(4,5) - c(2,1) = c(2,1) - coeff*c(4,1) - c(2,2) = c(2,2) - coeff*c(4,2) - c(2,3) = c(2,3) - coeff*c(4,3) - c(2,4) = c(2,4) - coeff*c(4,4) - c(2,5) = c(2,5) - coeff*c(4,5) - r(2) = r(2) - coeff*r(4) - - coeff = lhs(3,4) - lhs(3,5)= lhs(3,5) - coeff*lhs(4,5) - c(3,1) = c(3,1) - coeff*c(4,1) - c(3,2) = c(3,2) - coeff*c(4,2) - c(3,3) = c(3,3) - coeff*c(4,3) - c(3,4) = c(3,4) - coeff*c(4,4) - c(3,5) = c(3,5) - coeff*c(4,5) - r(3) = r(3) - coeff*r(4) - - coeff = lhs(5,4) - lhs(5,5)= lhs(5,5) - coeff*lhs(4,5) - c(5,1) = c(5,1) - coeff*c(4,1) - c(5,2) = c(5,2) - coeff*c(4,2) - c(5,3) = c(5,3) - coeff*c(4,3) - c(5,4) = c(5,4) - coeff*c(4,4) - c(5,5) = c(5,5) - coeff*c(4,5) - r(5) = r(5) - coeff*r(4) - - - pivot = 1.00d0/lhs(5,5) - c(5,1) = c(5,1)*pivot - c(5,2) = c(5,2)*pivot - c(5,3) = c(5,3)*pivot - c(5,4) = c(5,4)*pivot - c(5,5) = c(5,5)*pivot - r(5) = r(5) *pivot - - coeff = lhs(1,5) - c(1,1) = c(1,1) - coeff*c(5,1) - c(1,2) = c(1,2) - coeff*c(5,2) - c(1,3) = c(1,3) - coeff*c(5,3) - c(1,4) = c(1,4) - coeff*c(5,4) - c(1,5) = c(1,5) - coeff*c(5,5) - r(1) = r(1) - coeff*r(5) - - coeff = lhs(2,5) - c(2,1) = c(2,1) - coeff*c(5,1) - c(2,2) = c(2,2) - coeff*c(5,2) - c(2,3) = c(2,3) - coeff*c(5,3) - c(2,4) = c(2,4) - coeff*c(5,4) - c(2,5) = c(2,5) - coeff*c(5,5) - r(2) = r(2) - coeff*r(5) - - coeff = lhs(3,5) - c(3,1) = c(3,1) - coeff*c(5,1) - c(3,2) = c(3,2) - coeff*c(5,2) - c(3,3) = c(3,3) - coeff*c(5,3) - c(3,4) = c(3,4) - coeff*c(5,4) - c(3,5) = c(3,5) - coeff*c(5,5) - r(3) = r(3) - coeff*r(5) - - coeff = lhs(4,5) - c(4,1) = c(4,1) - coeff*c(5,1) - c(4,2) = c(4,2) - coeff*c(5,2) - c(4,3) = c(4,3) - coeff*c(5,3) - c(4,4) = c(4,4) - coeff*c(5,4) - c(4,5) = c(4,5) - coeff*c(5,5) - r(4) = r(4) - coeff*r(5) - - - return - end - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine binvrhs( lhs,r ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c--------------------------------------------------------------------- - - implicit none - - double precision pivot, coeff, lhs - dimension lhs(5,5) - double precision r(5) - -c--------------------------------------------------------------------- -c -c--------------------------------------------------------------------- - - - pivot = 1.00d0/lhs(1,1) - lhs(1,2) = lhs(1,2)*pivot - lhs(1,3) = lhs(1,3)*pivot - lhs(1,4) = lhs(1,4)*pivot - lhs(1,5) = lhs(1,5)*pivot - r(1) = r(1) *pivot - - coeff = lhs(2,1) - lhs(2,2)= lhs(2,2) - coeff*lhs(1,2) - lhs(2,3)= lhs(2,3) - coeff*lhs(1,3) - lhs(2,4)= lhs(2,4) - coeff*lhs(1,4) - lhs(2,5)= lhs(2,5) - coeff*lhs(1,5) - r(2) = r(2) - coeff*r(1) - - coeff = lhs(3,1) - lhs(3,2)= lhs(3,2) - coeff*lhs(1,2) - lhs(3,3)= lhs(3,3) - coeff*lhs(1,3) - lhs(3,4)= lhs(3,4) - coeff*lhs(1,4) - lhs(3,5)= lhs(3,5) - coeff*lhs(1,5) - r(3) = r(3) - coeff*r(1) - - coeff = lhs(4,1) - lhs(4,2)= lhs(4,2) - coeff*lhs(1,2) - lhs(4,3)= lhs(4,3) - coeff*lhs(1,3) - lhs(4,4)= lhs(4,4) - coeff*lhs(1,4) - lhs(4,5)= lhs(4,5) - coeff*lhs(1,5) - r(4) = r(4) - coeff*r(1) - - coeff = lhs(5,1) - lhs(5,2)= lhs(5,2) - coeff*lhs(1,2) - lhs(5,3)= lhs(5,3) - coeff*lhs(1,3) - lhs(5,4)= lhs(5,4) - coeff*lhs(1,4) - lhs(5,5)= lhs(5,5) - coeff*lhs(1,5) - r(5) = r(5) - coeff*r(1) - - - pivot = 1.00d0/lhs(2,2) - lhs(2,3) = lhs(2,3)*pivot - lhs(2,4) = lhs(2,4)*pivot - lhs(2,5) = lhs(2,5)*pivot - r(2) = r(2) *pivot - - coeff = lhs(1,2) - lhs(1,3)= lhs(1,3) - coeff*lhs(2,3) - lhs(1,4)= lhs(1,4) - coeff*lhs(2,4) - lhs(1,5)= lhs(1,5) - coeff*lhs(2,5) - r(1) = r(1) - coeff*r(2) - - coeff = lhs(3,2) - lhs(3,3)= lhs(3,3) - coeff*lhs(2,3) - lhs(3,4)= lhs(3,4) - coeff*lhs(2,4) - lhs(3,5)= lhs(3,5) - coeff*lhs(2,5) - r(3) = r(3) - coeff*r(2) - - coeff = lhs(4,2) - lhs(4,3)= lhs(4,3) - coeff*lhs(2,3) - lhs(4,4)= lhs(4,4) - coeff*lhs(2,4) - lhs(4,5)= lhs(4,5) - coeff*lhs(2,5) - r(4) = r(4) - coeff*r(2) - - coeff = lhs(5,2) - lhs(5,3)= lhs(5,3) - coeff*lhs(2,3) - lhs(5,4)= lhs(5,4) - coeff*lhs(2,4) - lhs(5,5)= lhs(5,5) - coeff*lhs(2,5) - r(5) = r(5) - coeff*r(2) - - - pivot = 1.00d0/lhs(3,3) - lhs(3,4) = lhs(3,4)*pivot - lhs(3,5) = lhs(3,5)*pivot - r(3) = r(3) *pivot - - coeff = lhs(1,3) - lhs(1,4)= lhs(1,4) - coeff*lhs(3,4) - lhs(1,5)= lhs(1,5) - coeff*lhs(3,5) - r(1) = r(1) - coeff*r(3) - - coeff = lhs(2,3) - lhs(2,4)= lhs(2,4) - coeff*lhs(3,4) - lhs(2,5)= lhs(2,5) - coeff*lhs(3,5) - r(2) = r(2) - coeff*r(3) - - coeff = lhs(4,3) - lhs(4,4)= lhs(4,4) - coeff*lhs(3,4) - lhs(4,5)= lhs(4,5) - coeff*lhs(3,5) - r(4) = r(4) - coeff*r(3) - - coeff = lhs(5,3) - lhs(5,4)= lhs(5,4) - coeff*lhs(3,4) - lhs(5,5)= lhs(5,5) - coeff*lhs(3,5) - r(5) = r(5) - coeff*r(3) - - - pivot = 1.00d0/lhs(4,4) - lhs(4,5) = lhs(4,5)*pivot - r(4) = r(4) *pivot - - coeff = lhs(1,4) - lhs(1,5)= lhs(1,5) - coeff*lhs(4,5) - r(1) = r(1) - coeff*r(4) - - coeff = lhs(2,4) - lhs(2,5)= lhs(2,5) - coeff*lhs(4,5) - r(2) = r(2) - coeff*r(4) - - coeff = lhs(3,4) - lhs(3,5)= lhs(3,5) - coeff*lhs(4,5) - r(3) = r(3) - coeff*r(4) - - coeff = lhs(5,4) - lhs(5,5)= lhs(5,5) - coeff*lhs(4,5) - r(5) = r(5) - coeff*r(4) - - - pivot = 1.00d0/lhs(5,5) - r(5) = r(5) *pivot - - coeff = lhs(1,5) - r(1) = r(1) - coeff*r(5) - - coeff = lhs(2,5) - r(2) = r(2) - coeff*r(5) - - coeff = lhs(3,5) - r(3) = r(3) - coeff*r(5) - - coeff = lhs(4,5) - r(4) = r(4) - coeff*r(5) - - - return - end - - - diff --git a/examples/smpi/NAS/BT/verify.f b/examples/smpi/NAS/BT/verify.f deleted file mode 100644 index 7dbc8a96a2..0000000000 --- a/examples/smpi/NAS/BT/verify.f +++ /dev/null @@ -1,435 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine verify(no_time_steps, class, verified) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c verification routine -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5), - > epsilon, xce(5), xcr(5), dtref - integer m, no_time_steps - character class - logical verified - -c--------------------------------------------------------------------- -c tolerance level -c--------------------------------------------------------------------- - epsilon = 1.0d-08 - verified = .true. - -c--------------------------------------------------------------------- -c compute the error norm and the residual norm, and exit if not printing -c--------------------------------------------------------------------- - - if (iotype .ne. 0) then - call accumulate_norms(xce) - else - call error_norm(xce) - endif - - call copy_faces - - call rhs_norm(xcr) - - do m = 1, 5 - xcr(m) = xcr(m) / dt - enddo - - if (node .ne. 0) return - - class = 'U' - - do m = 1,5 - xcrref(m) = 1.0 - xceref(m) = 1.0 - end do - -c--------------------------------------------------------------------- -c reference data for 12X12X12 grids after 60 time steps, with DT = 1.0d-02 -c--------------------------------------------------------------------- - if ( (grid_points(1) .eq. 12 ) .and. - > (grid_points(2) .eq. 12 ) .and. - > (grid_points(3) .eq. 12 ) .and. - > (no_time_steps .eq. 60 )) then - - class = 'S' - dtref = 1.0d-2 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 1.7034283709541311d-01 - xcrref(2) = 1.2975252070034097d-02 - xcrref(3) = 3.2527926989486055d-02 - xcrref(4) = 2.6436421275166801d-02 - xcrref(5) = 1.9211784131744430d-01 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - - if (iotype .eq. 0) then - xceref(1) = 4.9976913345811579d-04 - xceref(2) = 4.5195666782961927d-05 - xceref(3) = 7.3973765172921357d-05 - xceref(4) = 7.3821238632439731d-05 - xceref(5) = 8.9269630987491446d-04 - else - xceref(1) = 0.1149036328945d+02 - xceref(2) = 0.9156788904727d+00 - xceref(3) = 0.2857899428614d+01 - xceref(4) = 0.2598273346734d+01 - xceref(5) = 0.2652795397547d+02 - endif - -c--------------------------------------------------------------------- -c reference data for 24X24X24 grids after 200 time steps, with DT = 0.8d-3 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 24) .and. - > (grid_points(2) .eq. 24) .and. - > (grid_points(3) .eq. 24) .and. - > (no_time_steps . eq. 200) ) then - - class = 'W' - dtref = 0.8d-3 -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 0.1125590409344d+03 - xcrref(2) = 0.1180007595731d+02 - xcrref(3) = 0.2710329767846d+02 - xcrref(4) = 0.2469174937669d+02 - xcrref(5) = 0.2638427874317d+03 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - - if (iotype .eq. 0) then - xceref(1) = 0.4419655736008d+01 - xceref(2) = 0.4638531260002d+00 - xceref(3) = 0.1011551749967d+01 - xceref(4) = 0.9235878729944d+00 - xceref(5) = 0.1018045837718d+02 - else - xceref(1) = 0.6729594398612d+02 - xceref(2) = 0.5264523081690d+01 - xceref(3) = 0.1677107142637d+02 - xceref(4) = 0.1508721463436d+02 - xceref(5) = 0.1477018363393d+03 - endif - - -c--------------------------------------------------------------------- -c reference data for 64X64X64 grids after 200 time steps, with DT = 0.8d-3 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 64) .and. - > (grid_points(2) .eq. 64) .and. - > (grid_points(3) .eq. 64) .and. - > (no_time_steps . eq. 200) ) then - - class = 'A' - dtref = 0.8d-3 -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 1.0806346714637264d+02 - xcrref(2) = 1.1319730901220813d+01 - xcrref(3) = 2.5974354511582465d+01 - xcrref(4) = 2.3665622544678910d+01 - xcrref(5) = 2.5278963211748344d+02 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - - if (iotype .eq. 0) then - xceref(1) = 4.2348416040525025d+00 - xceref(2) = 4.4390282496995698d-01 - xceref(3) = 9.6692480136345650d-01 - xceref(4) = 8.8302063039765474d-01 - xceref(5) = 9.7379901770829278d+00 - else - xceref(1) = 0.6482218724961d+02 - xceref(2) = 0.5066461714527d+01 - xceref(3) = 0.1613931961359d+02 - xceref(4) = 0.1452010201481d+02 - xceref(5) = 0.1420099377681d+03 - endif - -c--------------------------------------------------------------------- -c reference data for 102X102X102 grids after 200 time steps, -c with DT = 3.0d-04 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 102) .and. - > (grid_points(2) .eq. 102) .and. - > (grid_points(3) .eq. 102) .and. - > (no_time_steps . eq. 200) ) then - - class = 'B' - dtref = 3.0d-4 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 1.4233597229287254d+03 - xcrref(2) = 9.9330522590150238d+01 - xcrref(3) = 3.5646025644535285d+02 - xcrref(4) = 3.2485447959084092d+02 - xcrref(5) = 3.2707541254659363d+03 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - - if (iotype .eq. 0) then - xceref(1) = 5.2969847140936856d+01 - xceref(2) = 4.4632896115670668d+00 - xceref(3) = 1.3122573342210174d+01 - xceref(4) = 1.2006925323559144d+01 - xceref(5) = 1.2459576151035986d+02 - else - xceref(1) = 0.1477545106464d+03 - xceref(2) = 0.1108895555053d+02 - xceref(3) = 0.3698065590331d+02 - xceref(4) = 0.3310505581440d+02 - xceref(5) = 0.3157928282563d+03 - endif - -c--------------------------------------------------------------------- -c reference data for 162X162X162 grids after 200 time steps, -c with DT = 1.0d-04 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 162) .and. - > (grid_points(2) .eq. 162) .and. - > (grid_points(3) .eq. 162) .and. - > (no_time_steps . eq. 200) ) then - - class = 'C' - dtref = 1.0d-4 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 0.62398116551764615d+04 - xcrref(2) = 0.50793239190423964d+03 - xcrref(3) = 0.15423530093013596d+04 - xcrref(4) = 0.13302387929291190d+04 - xcrref(5) = 0.11604087428436455d+05 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - - if (iotype .eq. 0) then - xceref(1) = 0.16462008369091265d+03 - xceref(2) = 0.11497107903824313d+02 - xceref(3) = 0.41207446207461508d+02 - xceref(4) = 0.37087651059694167d+02 - xceref(5) = 0.36211053051841265d+03 - else - xceref(1) = 0.2597156483475d+03 - xceref(2) = 0.1985384289495d+02 - xceref(3) = 0.6517950485788d+02 - xceref(4) = 0.5757235541520d+02 - xceref(5) = 0.5215668188726d+03 - endif - - -c--------------------------------------------------------------------- -c reference data for 408x408x408 grids after 250 time steps, -c with DT = 0.2d-04 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 408) .and. - > (grid_points(2) .eq. 408) .and. - > (grid_points(3) .eq. 408) .and. - > (no_time_steps . eq. 250) ) then - - class = 'D' - dtref = 0.2d-4 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 0.2533188551738d+05 - xcrref(2) = 0.2346393716980d+04 - xcrref(3) = 0.6294554366904d+04 - xcrref(4) = 0.5352565376030d+04 - xcrref(5) = 0.3905864038618d+05 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - - if (iotype .eq. 0) then - xceref(1) = 0.3100009377557d+03 - xceref(2) = 0.2424086324913d+02 - xceref(3) = 0.7782212022645d+02 - xceref(4) = 0.6835623860116d+02 - xceref(5) = 0.6065737200368d+03 - else - xceref(1) = 0.3813781566713d+03 - xceref(2) = 0.3160872966198d+02 - xceref(3) = 0.9593576357290d+02 - xceref(4) = 0.8363391989815d+02 - xceref(5) = 0.7063466087423d+03 - endif - - -c--------------------------------------------------------------------- -c reference data for 1020x1020x1020 grids after 250 time steps, -c with DT = 0.4d-05 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 1020) .and. - > (grid_points(2) .eq. 1020) .and. - > (grid_points(3) .eq. 1020) .and. - > (no_time_steps . eq. 250) ) then - - class = 'E' - dtref = 0.4d-5 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 0.9795372484517d+05 - xcrref(2) = 0.9739814511521d+04 - xcrref(3) = 0.2467606342965d+05 - xcrref(4) = 0.2092419572860d+05 - xcrref(5) = 0.1392138856939d+06 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - - if (iotype .eq. 0) then - xceref(1) = 0.4327562208414d+03 - xceref(2) = 0.3699051964887d+02 - xceref(3) = 0.1089845040954d+03 - xceref(4) = 0.9462517622043d+02 - xceref(5) = 0.7765512765309d+03 - else -c wr_interval = 5 - xceref(1) = 0.4729898413058d+03 - xceref(2) = 0.4145899331704d+02 - xceref(3) = 0.1192850917138d+03 - xceref(4) = 0.1032746026932d+03 - xceref(5) = 0.8270322177634d+03 -c wr_interval = 10 -c xceref(1) = 0.4718135916251d+03 -c xceref(2) = 0.4132620259096d+02 -c xceref(3) = 0.1189831133503d+03 -c xceref(4) = 0.1030212798803d+03 -c xceref(5) = 0.8255924078458d+03 - endif - - else - verified = .false. - endif - -c--------------------------------------------------------------------- -c verification test for residuals if gridsize is one of -c the defined grid sizes above (class .ne. 'U') -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Compute the difference of solution values and the known reference -c values. -c--------------------------------------------------------------------- - do m = 1, 5 - - xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) - xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m)) - - enddo - -c--------------------------------------------------------------------- -c Output the comparison of computed results to known cases. -c--------------------------------------------------------------------- - - if (class .ne. 'U') then - write(*, 1990) class - 1990 format(' Verification being performed for class ', a) - write (*,2000) epsilon - 2000 format(' accuracy setting for epsilon = ', E20.13) - verified = (dabs(dt-dtref) .le. epsilon) - if (.not.verified) then - verified = .false. - class = 'U' - write (*,1000) dtref - 1000 format(' DT does not match the reference value of ', - > E15.8) - endif - else - write(*, 1995) - 1995 format(' Unknown class') - endif - - - if (class .ne. 'U') then - write (*,2001) - else - write (*, 2005) - endif - - 2001 format(' Comparison of RMS-norms of residual') - 2005 format(' RMS-norms of residual') - do m = 1, 5 - if (class .eq. 'U') then - write(*, 2015) m, xcr(m) - else if (xcrdif(m) .le. epsilon) then - write (*,2011) m,xcr(m),xcrref(m),xcrdif(m) - else - verified = .false. - write (*,2010) m,xcr(m),xcrref(m),xcrdif(m) - endif - enddo - - if (class .ne. 'U') then - write (*,2002) - else - write (*,2006) - endif - 2002 format(' Comparison of RMS-norms of solution error') - 2006 format(' RMS-norms of solution error') - - do m = 1, 5 - if (class .eq. 'U') then - write(*, 2015) m, xce(m) - else if (xcedif(m) .le. epsilon) then - write (*,2011) m,xce(m),xceref(m),xcedif(m) - else - verified = .false. - write (*,2010) m,xce(m),xceref(m),xcedif(m) - endif - enddo - - 2010 format(' FAILURE: ', i2, E20.13, E20.13, E20.13) - 2011 format(' ', i2, E20.13, E20.13, E20.13) - 2015 format(' ', i2, E20.13) - - if (class .eq. 'U') then - write(*, 2022) - write(*, 2023) - 2022 format(' No reference values provided') - 2023 format(' No verification performed') - else if (verified) then - write(*, 2020) - 2020 format(' Verification Successful') - else - write(*, 2021) - 2021 format(' Verification failed') - endif - - return - - - end diff --git a/examples/smpi/NAS/BT/work_lhs.h b/examples/smpi/NAS/BT/work_lhs.h deleted file mode 100644 index d9bc9e4d66..0000000000 --- a/examples/smpi/NAS/BT/work_lhs.h +++ /dev/null @@ -1,14 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- -c -c work_lhs.h -c -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - double precision fjac(5, 5, -2:MAX_CELL_DIM+1), - > njac(5, 5, -2:MAX_CELL_DIM+1), - > lhsa(5, 5, -1:MAX_CELL_DIM), - > lhsb(5, 5, -1:MAX_CELL_DIM), - > tmp1, tmp2, tmp3 - common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3 diff --git a/examples/smpi/NAS/BT/work_lhs_vec.h b/examples/smpi/NAS/BT/work_lhs_vec.h deleted file mode 100644 index a97054f419..0000000000 --- a/examples/smpi/NAS/BT/work_lhs_vec.h +++ /dev/null @@ -1,14 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- -c -c work_lhs_vec.h -c -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - double precision fjac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1), - > njac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1), - > lhsa(5, 5, -1:MAX_CELL_DIM, -1:MAX_CELL_DIM), - > lhsb(5, 5, -1:MAX_CELL_DIM, -1:MAX_CELL_DIM), - > tmp1, tmp2, tmp3 - common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3 diff --git a/examples/smpi/NAS/BT/x_solve.f b/examples/smpi/NAS/BT/x_solve.f deleted file mode 100644 index 5386732616..0000000000 --- a/examples/smpi/NAS/BT/x_solve.f +++ /dev/null @@ -1,761 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_solve - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c Performs line solves in X direction by first factoring -c the block-tridiagonal matrix into an upper triangular matrix, -c and then performing back substitution to solve for the unknow -c vectors of each line. -c -c Make sure we treat elements zero to cell_size in the direction -c of the sweep. -c -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - integer c, istart, stage, - > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), - > isize,jsize,ksize,send_id - - istart = 0 - -c--------------------------------------------------------------------- -c in our terminology stage is the number of the cell in the x-direction -c i.e. stage = 1 means the start of the line stage=ncells means end -c--------------------------------------------------------------------- - do stage = 1,ncells - c = slice(1,stage) - isize = cell_size(1,c) - 1 - jsize = cell_size(2,c) - 1 - ksize = cell_size(3,c) - 1 - -c--------------------------------------------------------------------- -c set last-cell flag -c--------------------------------------------------------------------- - if (stage .eq. ncells) then - last = 1 - else - last = 0 - endif - - if (stage .eq. 1) then -c--------------------------------------------------------------------- -c This is the first cell, so solve without receiving data -c--------------------------------------------------------------------- - first = 1 -c call lhsx(c) - call x_solve_cell(first,last,c) - else -c--------------------------------------------------------------------- -c Not the first cell of this line, so receive info from -c processor working on preceeding cell -c--------------------------------------------------------------------- - first = 0 - call x_receive_solve_info(recv_id,c) -c--------------------------------------------------------------------- -c overlap computations and communications -c--------------------------------------------------------------------- -c call lhsx(c) -c--------------------------------------------------------------------- -c wait for completion -c--------------------------------------------------------------------- - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) -c--------------------------------------------------------------------- -c install C'(istart) and rhs'(istart) to be used in this cell -c--------------------------------------------------------------------- - call x_unpack_solve_info(c) - call x_solve_cell(first,last,c) - endif - - if (last .eq. 0) call x_send_solve_info(send_id,c) - enddo - -c--------------------------------------------------------------------- -c now perform backsubstitution in reverse direction -c--------------------------------------------------------------------- - do stage = ncells, 1, -1 - c = slice(1,stage) - first = 0 - last = 0 - if (stage .eq. 1) first = 1 - if (stage .eq. ncells) then - last = 1 -c--------------------------------------------------------------------- -c last cell, so perform back substitute without waiting -c--------------------------------------------------------------------- - call x_backsubstitute(first, last,c) - else - call x_receive_backsub_info(recv_id,c) - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) - call x_unpack_backsub_info(c) - call x_backsubstitute(first,last,c) - endif - if (first .eq. 0) call x_send_backsub_info(send_id,c) - enddo - - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_unpack_solve_info(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack C'(-1) and rhs'(-1) for -c all j and k -c--------------------------------------------------------------------- - - include 'header.h' - integer j,k,m,n,ptr,c,istart - - istart = 0 - ptr = 0 - do k=0,KMAX-1 - do j=0,JMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - rhs(n,istart-1,j,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_send_solve_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send C'(iend) and rhs'(iend) for -c all j and k -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer j,k,m,n,isize,ptr,c,jp,kp - integer error,send_id,buffer_size - - isize = cell_size(1,c)-1 - jp = cell_coord(2,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - -c--------------------------------------------------------------------- -c pack up buffer -c--------------------------------------------------------------------- - ptr = 0 - do k=0,KMAX-1 - do j=0,JMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,isize,j,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - -c--------------------------------------------------------------------- -c send buffer -c--------------------------------------------------------------------- - call mpi_isend(in_buffer, buffer_size, - > dp_type, successor(1), - > WEST+jp+kp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_send_backsub_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send U(istart) for all j and k -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer j,k,n,ptr,c,istart,jp,kp - integer error,send_id,buffer_size - -c--------------------------------------------------------------------- -c Send element 0 to previous processor -c--------------------------------------------------------------------- - istart = 0 - jp = cell_coord(2,c)-1 - kp = cell_coord(3,c)-1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - ptr = 0 - do k=0,KMAX-1 - do j=0,JMAX-1 - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,istart,j,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - call mpi_isend(in_buffer, buffer_size, - > dp_type, predecessor(1), - > EAST+jp+kp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_unpack_backsub_info(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack U(isize) for all j and k -c--------------------------------------------------------------------- - - include 'header.h' - integer j,k,n,ptr,c - - ptr = 0 - do k=0,KMAX-1 - do j=0,JMAX-1 - do n=1,BLOCK_SIZE - backsub_info(n,j,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_receive_backsub_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer error,recv_id,jp,kp,c,buffer_size - jp = cell_coord(2,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - call mpi_irecv(out_buffer, buffer_size, - > dp_type, successor(1), - > EAST+jp+kp*NCELLS, comm_solve, - > recv_id, error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_receive_solve_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer jp,kp,recv_id,error,c,buffer_size - jp = cell_coord(2,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - call mpi_irecv(out_buffer, buffer_size, - > dp_type, predecessor(1), - > WEST+jp+kp*NCELLS, comm_solve, - > recv_id, error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_backsubstitute(first, last, c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c back solve: if last cell, then generate U(isize)=rhs(isize) -c else assume U(isize) is loaded in un pack backsub_info -c so just use it -c after call u(istart) will be sent to next cell -c--------------------------------------------------------------------- - - include 'header.h' - - integer first, last, c, i, j, k - integer m,n,isize,jsize,ksize,istart - - istart = 0 - isize = cell_size(1,c)-1 - jsize = cell_size(2,c)-end(2,c)-1 - ksize = cell_size(3,c)-end(3,c)-1 - if (last .eq. 0) then - do k=start(3,c),ksize - do j=start(2,c),jsize -c--------------------------------------------------------------------- -c U(isize) uses info from previous cell if not last cell -c--------------------------------------------------------------------- - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) - > - lhsc(m,n,isize,j,k,c)* - > backsub_info(n,j,k,c) -c--------------------------------------------------------------------- -c rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) -c $ - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c) -c--------------------------------------------------------------------- - enddo - enddo - enddo - enddo - endif - do k=start(3,c),ksize - do j=start(2,c),jsize - do i=isize-1,istart,-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - > - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c) - enddo - enddo - enddo - enddo - enddo - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_solve_cell(first,last,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c performs guaussian elimination on this cell. -c -c assumes that unpacking routines for non-first cells -c preload C' and rhs' from previous cell. -c -c assumed send happens outside this routine, but that -c c'(IMAX) and rhs'(IMAX) will be sent to next cell -c--------------------------------------------------------------------- - - include 'header.h' - include 'work_lhs.h' - - integer first,last,c - integer i,j,k,isize,ksize,jsize,istart - - istart = 0 - isize = cell_size(1,c)-1 - jsize = cell_size(2,c)-end(2,c)-1 - ksize = cell_size(3,c)-end(3,c)-1 - - call lhsabinit(lhsa, lhsb, isize) - - do k=start(3,c),ksize - do j=start(2,c),jsize - -c--------------------------------------------------------------------- -c This function computes the left hand side in the xi-direction -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c determine a (labeled f) and n jacobians for cell c -c--------------------------------------------------------------------- - do i = start(1,c)-1, cell_size(1,c) - end(1,c) - - tmp1 = rho_i(i,j,k,c) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 -c--------------------------------------------------------------------- -c -c--------------------------------------------------------------------- - fjac(1,1,i) = 0.0d+00 - fjac(1,2,i) = 1.0d+00 - fjac(1,3,i) = 0.0d+00 - fjac(1,4,i) = 0.0d+00 - fjac(1,5,i) = 0.0d+00 - - fjac(2,1,i) = -(u(2,i,j,k,c) * tmp2 * - > u(2,i,j,k,c)) - > + c2 * qs(i,j,k,c) - fjac(2,2,i) = ( 2.0d+00 - c2 ) - > * ( u(2,i,j,k,c) * tmp1 ) - fjac(2,3,i) = - c2 * ( u(3,i,j,k,c) * tmp1 ) - fjac(2,4,i) = - c2 * ( u(4,i,j,k,c) * tmp1 ) - fjac(2,5,i) = c2 - - fjac(3,1,i) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2 - fjac(3,2,i) = u(3,i,j,k,c) * tmp1 - fjac(3,3,i) = u(2,i,j,k,c) * tmp1 - fjac(3,4,i) = 0.0d+00 - fjac(3,5,i) = 0.0d+00 - - fjac(4,1,i) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2 - fjac(4,2,i) = u(4,i,j,k,c) * tmp1 - fjac(4,3,i) = 0.0d+00 - fjac(4,4,i) = u(2,i,j,k,c) * tmp1 - fjac(4,5,i) = 0.0d+00 - - fjac(5,1,i) = ( c2 * 2.0d0 * qs(i,j,k,c) - > - c1 * ( u(5,i,j,k,c) * tmp1 ) ) - > * ( u(2,i,j,k,c) * tmp1 ) - fjac(5,2,i) = c1 * u(5,i,j,k,c) * tmp1 - > - c2 - > * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2 - > + qs(i,j,k,c) ) - fjac(5,3,i) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) ) - > * tmp2 - fjac(5,4,i) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) ) - > * tmp2 - fjac(5,5,i) = c1 * ( u(2,i,j,k,c) * tmp1 ) - - njac(1,1,i) = 0.0d+00 - njac(1,2,i) = 0.0d+00 - njac(1,3,i) = 0.0d+00 - njac(1,4,i) = 0.0d+00 - njac(1,5,i) = 0.0d+00 - - njac(2,1,i) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c) - njac(2,2,i) = con43 * c3c4 * tmp1 - njac(2,3,i) = 0.0d+00 - njac(2,4,i) = 0.0d+00 - njac(2,5,i) = 0.0d+00 - - njac(3,1,i) = - c3c4 * tmp2 * u(3,i,j,k,c) - njac(3,2,i) = 0.0d+00 - njac(3,3,i) = c3c4 * tmp1 - njac(3,4,i) = 0.0d+00 - njac(3,5,i) = 0.0d+00 - - njac(4,1,i) = - c3c4 * tmp2 * u(4,i,j,k,c) - njac(4,2,i) = 0.0d+00 - njac(4,3,i) = 0.0d+00 - njac(4,4,i) = c3c4 * tmp1 - njac(4,5,i) = 0.0d+00 - - njac(5,1,i) = - ( con43 * c3c4 - > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2) - > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2) - > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2) - > - c1345 * tmp2 * u(5,i,j,k,c) - - njac(5,2,i) = ( con43 * c3c4 - > - c1345 ) * tmp2 * u(2,i,j,k,c) - njac(5,3,i) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c) - njac(5,4,i) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c) - njac(5,5,i) = ( c1345 ) * tmp1 - - enddo -c--------------------------------------------------------------------- -c now jacobians set, so form left hand side in x direction -c--------------------------------------------------------------------- - do i = start(1,c), isize - end(1,c) - - tmp1 = dt * tx1 - tmp2 = dt * tx2 - - lhsa(1,1,i) = - tmp2 * fjac(1,1,i-1) - > - tmp1 * njac(1,1,i-1) - > - tmp1 * dx1 - lhsa(1,2,i) = - tmp2 * fjac(1,2,i-1) - > - tmp1 * njac(1,2,i-1) - lhsa(1,3,i) = - tmp2 * fjac(1,3,i-1) - > - tmp1 * njac(1,3,i-1) - lhsa(1,4,i) = - tmp2 * fjac(1,4,i-1) - > - tmp1 * njac(1,4,i-1) - lhsa(1,5,i) = - tmp2 * fjac(1,5,i-1) - > - tmp1 * njac(1,5,i-1) - - lhsa(2,1,i) = - tmp2 * fjac(2,1,i-1) - > - tmp1 * njac(2,1,i-1) - lhsa(2,2,i) = - tmp2 * fjac(2,2,i-1) - > - tmp1 * njac(2,2,i-1) - > - tmp1 * dx2 - lhsa(2,3,i) = - tmp2 * fjac(2,3,i-1) - > - tmp1 * njac(2,3,i-1) - lhsa(2,4,i) = - tmp2 * fjac(2,4,i-1) - > - tmp1 * njac(2,4,i-1) - lhsa(2,5,i) = - tmp2 * fjac(2,5,i-1) - > - tmp1 * njac(2,5,i-1) - - lhsa(3,1,i) = - tmp2 * fjac(3,1,i-1) - > - tmp1 * njac(3,1,i-1) - lhsa(3,2,i) = - tmp2 * fjac(3,2,i-1) - > - tmp1 * njac(3,2,i-1) - lhsa(3,3,i) = - tmp2 * fjac(3,3,i-1) - > - tmp1 * njac(3,3,i-1) - > - tmp1 * dx3 - lhsa(3,4,i) = - tmp2 * fjac(3,4,i-1) - > - tmp1 * njac(3,4,i-1) - lhsa(3,5,i) = - tmp2 * fjac(3,5,i-1) - > - tmp1 * njac(3,5,i-1) - - lhsa(4,1,i) = - tmp2 * fjac(4,1,i-1) - > - tmp1 * njac(4,1,i-1) - lhsa(4,2,i) = - tmp2 * fjac(4,2,i-1) - > - tmp1 * njac(4,2,i-1) - lhsa(4,3,i) = - tmp2 * fjac(4,3,i-1) - > - tmp1 * njac(4,3,i-1) - lhsa(4,4,i) = - tmp2 * fjac(4,4,i-1) - > - tmp1 * njac(4,4,i-1) - > - tmp1 * dx4 - lhsa(4,5,i) = - tmp2 * fjac(4,5,i-1) - > - tmp1 * njac(4,5,i-1) - - lhsa(5,1,i) = - tmp2 * fjac(5,1,i-1) - > - tmp1 * njac(5,1,i-1) - lhsa(5,2,i) = - tmp2 * fjac(5,2,i-1) - > - tmp1 * njac(5,2,i-1) - lhsa(5,3,i) = - tmp2 * fjac(5,3,i-1) - > - tmp1 * njac(5,3,i-1) - lhsa(5,4,i) = - tmp2 * fjac(5,4,i-1) - > - tmp1 * njac(5,4,i-1) - lhsa(5,5,i) = - tmp2 * fjac(5,5,i-1) - > - tmp1 * njac(5,5,i-1) - > - tmp1 * dx5 - - lhsb(1,1,i) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(1,1,i) - > + tmp1 * 2.0d+00 * dx1 - lhsb(1,2,i) = tmp1 * 2.0d+00 * njac(1,2,i) - lhsb(1,3,i) = tmp1 * 2.0d+00 * njac(1,3,i) - lhsb(1,4,i) = tmp1 * 2.0d+00 * njac(1,4,i) - lhsb(1,5,i) = tmp1 * 2.0d+00 * njac(1,5,i) - - lhsb(2,1,i) = tmp1 * 2.0d+00 * njac(2,1,i) - lhsb(2,2,i) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(2,2,i) - > + tmp1 * 2.0d+00 * dx2 - lhsb(2,3,i) = tmp1 * 2.0d+00 * njac(2,3,i) - lhsb(2,4,i) = tmp1 * 2.0d+00 * njac(2,4,i) - lhsb(2,5,i) = tmp1 * 2.0d+00 * njac(2,5,i) - - lhsb(3,1,i) = tmp1 * 2.0d+00 * njac(3,1,i) - lhsb(3,2,i) = tmp1 * 2.0d+00 * njac(3,2,i) - lhsb(3,3,i) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(3,3,i) - > + tmp1 * 2.0d+00 * dx3 - lhsb(3,4,i) = tmp1 * 2.0d+00 * njac(3,4,i) - lhsb(3,5,i) = tmp1 * 2.0d+00 * njac(3,5,i) - - lhsb(4,1,i) = tmp1 * 2.0d+00 * njac(4,1,i) - lhsb(4,2,i) = tmp1 * 2.0d+00 * njac(4,2,i) - lhsb(4,3,i) = tmp1 * 2.0d+00 * njac(4,3,i) - lhsb(4,4,i) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(4,4,i) - > + tmp1 * 2.0d+00 * dx4 - lhsb(4,5,i) = tmp1 * 2.0d+00 * njac(4,5,i) - - lhsb(5,1,i) = tmp1 * 2.0d+00 * njac(5,1,i) - lhsb(5,2,i) = tmp1 * 2.0d+00 * njac(5,2,i) - lhsb(5,3,i) = tmp1 * 2.0d+00 * njac(5,3,i) - lhsb(5,4,i) = tmp1 * 2.0d+00 * njac(5,4,i) - lhsb(5,5,i) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(5,5,i) - > + tmp1 * 2.0d+00 * dx5 - - lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i+1) - > - tmp1 * njac(1,1,i+1) - > - tmp1 * dx1 - lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i+1) - > - tmp1 * njac(1,2,i+1) - lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i+1) - > - tmp1 * njac(1,3,i+1) - lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i+1) - > - tmp1 * njac(1,4,i+1) - lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i+1) - > - tmp1 * njac(1,5,i+1) - - lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i+1) - > - tmp1 * njac(2,1,i+1) - lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i+1) - > - tmp1 * njac(2,2,i+1) - > - tmp1 * dx2 - lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i+1) - > - tmp1 * njac(2,3,i+1) - lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i+1) - > - tmp1 * njac(2,4,i+1) - lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i+1) - > - tmp1 * njac(2,5,i+1) - - lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i+1) - > - tmp1 * njac(3,1,i+1) - lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i+1) - > - tmp1 * njac(3,2,i+1) - lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i+1) - > - tmp1 * njac(3,3,i+1) - > - tmp1 * dx3 - lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i+1) - > - tmp1 * njac(3,4,i+1) - lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i+1) - > - tmp1 * njac(3,5,i+1) - - lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i+1) - > - tmp1 * njac(4,1,i+1) - lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i+1) - > - tmp1 * njac(4,2,i+1) - lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i+1) - > - tmp1 * njac(4,3,i+1) - lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i+1) - > - tmp1 * njac(4,4,i+1) - > - tmp1 * dx4 - lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i+1) - > - tmp1 * njac(4,5,i+1) - - lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i+1) - > - tmp1 * njac(5,1,i+1) - lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i+1) - > - tmp1 * njac(5,2,i+1) - lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i+1) - > - tmp1 * njac(5,3,i+1) - lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i+1) - > - tmp1 * njac(5,4,i+1) - lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i+1) - > - tmp1 * njac(5,5,i+1) - > - tmp1 * dx5 - - enddo - - -c--------------------------------------------------------------------- -c outer most do loops - sweeping in i direction -c--------------------------------------------------------------------- - if (first .eq. 1) then - -c--------------------------------------------------------------------- -c multiply c(istart,j,k) by b_inverse and copy back to c -c multiply rhs(istart) by b_inverse(istart) and copy to rhs -c--------------------------------------------------------------------- - call binvcrhs( lhsb(1,1,istart), - > lhsc(1,1,istart,j,k,c), - > rhs(1,istart,j,k,c) ) - - endif - -c--------------------------------------------------------------------- -c begin inner most do loop -c do all the elements of the cell unless last -c--------------------------------------------------------------------- - do i=istart+first,isize-last - -c--------------------------------------------------------------------- -c rhs(i) = rhs(i) - A*rhs(i-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,i), - > rhs(1,i-1,j,k,c),rhs(1,i,j,k,c)) - -c--------------------------------------------------------------------- -c B(i) = B(i) - C(i-1)*A(i) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,i), - > lhsc(1,1,i-1,j,k,c), - > lhsb(1,1,i)) - - -c--------------------------------------------------------------------- -c multiply c(i,j,k) by b_inverse and copy back to c -c multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs -c--------------------------------------------------------------------- - call binvcrhs( lhsb(1,1,i), - > lhsc(1,1,i,j,k,c), - > rhs(1,i,j,k,c) ) - - enddo - -c--------------------------------------------------------------------- -c Now finish up special cases for last cell -c--------------------------------------------------------------------- - if (last .eq. 1) then - -c--------------------------------------------------------------------- -c rhs(isize) = rhs(isize) - A*rhs(isize-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,isize), - > rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c)) - -c--------------------------------------------------------------------- -c B(isize) = B(isize) - C(isize-1)*A(isize) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,isize), - > lhsc(1,1,isize-1,j,k,c), - > lhsb(1,1,isize)) - -c--------------------------------------------------------------------- -c multiply rhs() by b_inverse() and copy to rhs -c--------------------------------------------------------------------- - call binvrhs( lhsb(1,1,isize), - > rhs(1,isize,j,k,c) ) - - endif - enddo - enddo - - - return - end - diff --git a/examples/smpi/NAS/BT/x_solve_vec.f b/examples/smpi/NAS/BT/x_solve_vec.f deleted file mode 100644 index 8f1c1371db..0000000000 --- a/examples/smpi/NAS/BT/x_solve_vec.f +++ /dev/null @@ -1,789 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_solve - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c Performs line solves in X direction by first factoring -c the block-tridiagonal matrix into an upper triangular matrix, -c and then performing back substitution to solve for the unknow -c vectors of each line. -c -c Make sure we treat elements zero to cell_size in the direction -c of the sweep. -c -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - integer c, istart, stage, - > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), - > isize,jsize,ksize,send_id - - istart = 0 - -c--------------------------------------------------------------------- -c in our terminology stage is the number of the cell in the x-direct -c i.e. stage = 1 means the start of the line stage=ncells means end -c--------------------------------------------------------------------- - do stage = 1,ncells - c = slice(1,stage) - isize = cell_size(1,c) - 1 - jsize = cell_size(2,c) - 1 - ksize = cell_size(3,c) - 1 - -c--------------------------------------------------------------------- -c set last-cell flag -c--------------------------------------------------------------------- - if (stage .eq. ncells) then - last = 1 - else - last = 0 - endif - - if (stage .eq. 1) then -c--------------------------------------------------------------------- -c This is the first cell, so solve without receiving data -c--------------------------------------------------------------------- - first = 1 -c call lhsx(c) - call x_solve_cell(first,last,c) - else -c--------------------------------------------------------------------- -c Not the first cell of this line, so receive info from -c processor working on preceeding cell -c--------------------------------------------------------------------- - first = 0 - call x_receive_solve_info(recv_id,c) -c--------------------------------------------------------------------- -c overlap computations and communications -c--------------------------------------------------------------------- -c call lhsx(c) -c--------------------------------------------------------------------- -c wait for completion -c--------------------------------------------------------------------- - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) -c--------------------------------------------------------------------- -c install C'(istart) and rhs'(istart) to be used in this cell -c--------------------------------------------------------------------- - call x_unpack_solve_info(c) - call x_solve_cell(first,last,c) - endif - - if (last .eq. 0) call x_send_solve_info(send_id,c) - enddo - -c--------------------------------------------------------------------- -c now perform backsubstitution in reverse direction -c--------------------------------------------------------------------- - do stage = ncells, 1, -1 - c = slice(1,stage) - first = 0 - last = 0 - if (stage .eq. 1) first = 1 - if (stage .eq. ncells) then - last = 1 -c--------------------------------------------------------------------- -c last cell, so perform back substitute without waiting -c--------------------------------------------------------------------- - call x_backsubstitute(first, last,c) - else - call x_receive_backsub_info(recv_id,c) - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) - call x_unpack_backsub_info(c) - call x_backsubstitute(first,last,c) - endif - if (first .eq. 0) call x_send_backsub_info(send_id,c) - enddo - - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_unpack_solve_info(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack C'(-1) and rhs'(-1) for -c all j and k -c--------------------------------------------------------------------- - - include 'header.h' - integer j,k,m,n,ptr,c,istart - - istart = 0 - ptr = 0 - do k=0,KMAX-1 - do j=0,JMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - rhs(n,istart-1,j,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_send_solve_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send C'(iend) and rhs'(iend) for -c all j and k -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer j,k,m,n,isize,ptr,c,jp,kp - integer error,send_id,buffer_size - - isize = cell_size(1,c)-1 - jp = cell_coord(2,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - -c--------------------------------------------------------------------- -c pack up buffer -c--------------------------------------------------------------------- - ptr = 0 - do k=0,KMAX-1 - do j=0,JMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,isize,j,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - -c--------------------------------------------------------------------- -c send buffer -c--------------------------------------------------------------------- - call mpi_isend(in_buffer, buffer_size, - > dp_type, successor(1), - > WEST+jp+kp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_send_backsub_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send U(istart) for all j and k -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer j,k,n,ptr,c,istart,jp,kp - integer error,send_id,buffer_size - -c--------------------------------------------------------------------- -c Send element 0 to previous processor -c--------------------------------------------------------------------- - istart = 0 - jp = cell_coord(2,c)-1 - kp = cell_coord(3,c)-1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - ptr = 0 - do k=0,KMAX-1 - do j=0,JMAX-1 - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,istart,j,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - call mpi_isend(in_buffer, buffer_size, - > dp_type, predecessor(1), - > EAST+jp+kp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_unpack_backsub_info(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack U(isize) for all j and k -c--------------------------------------------------------------------- - - include 'header.h' - integer j,k,n,ptr,c - - ptr = 0 - do k=0,KMAX-1 - do j=0,JMAX-1 - do n=1,BLOCK_SIZE - backsub_info(n,j,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_receive_backsub_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer error,recv_id,jp,kp,c,buffer_size - jp = cell_coord(2,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - call mpi_irecv(out_buffer, buffer_size, - > dp_type, successor(1), - > EAST+jp+kp*NCELLS, comm_solve, - > recv_id, error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_receive_solve_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer jp,kp,recv_id,error,c,buffer_size - jp = cell_coord(2,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - call mpi_irecv(out_buffer, buffer_size, - > dp_type, predecessor(1), - > WEST+jp+kp*NCELLS, comm_solve, - > recv_id, error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_backsubstitute(first, last, c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c back solve: if last cell, then generate U(isize)=rhs(isize) -c else assume U(isize) is loaded in un pack backsub_info -c so just use it -c after call u(istart) will be sent to next cell -c--------------------------------------------------------------------- - - include 'header.h' - - integer first, last, c, i, j, k - integer m,n,isize,jsize,ksize,istart - - istart = 0 - isize = cell_size(1,c)-1 - jsize = cell_size(2,c)-end(2,c)-1 - ksize = cell_size(3,c)-end(3,c)-1 - if (last .eq. 0) then - do k=start(3,c),ksize - do j=start(2,c),jsize -c--------------------------------------------------------------------- -c U(isize) uses info from previous cell if not last cell -c--------------------------------------------------------------------- - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) - > - lhsc(m,n,isize,j,k,c)* - > backsub_info(n,j,k,c) -c--------------------------------------------------------------------- -c rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) -c $ - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c) -c--------------------------------------------------------------------- - enddo - enddo - enddo - enddo - endif - do k=start(3,c),ksize - do j=start(2,c),jsize - do i=isize-1,istart,-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - > - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c) - enddo - enddo - enddo - enddo - enddo - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_solve_cell(first,last,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c performs guaussian elimination on this cell. -c -c assumes that unpacking routines for non-first cells -c preload C' and rhs' from previous cell. -c -c assumed send happens outside this routine, but that -c c'(IMAX) and rhs'(IMAX) will be sent to next cell -c--------------------------------------------------------------------- - - include 'header.h' - include 'work_lhs_vec.h' - - integer first,last,c - integer i,j,k,m,n,isize,ksize,jsize,istart - - istart = 0 - isize = cell_size(1,c)-1 - jsize = cell_size(2,c)-end(2,c)-1 - ksize = cell_size(3,c)-end(3,c)-1 - -c--------------------------------------------------------------------- -c zero the left hand side for starters -c set diagonal values to 1. This is overkill, but convenient -c--------------------------------------------------------------------- - do j = 0, jsize - do m = 1, 5 - do n = 1, 5 - lhsa(m,n,0,j) = 0.0d0 - lhsb(m,n,0,j) = 0.0d0 - lhsa(m,n,isize,j) = 0.0d0 - lhsb(m,n,isize,j) = 0.0d0 - enddo - lhsb(m,m,0,j) = 1.0d0 - lhsb(m,m,isize,j) = 1.0d0 - enddo - enddo - - do k=start(3,c),ksize - -c--------------------------------------------------------------------- -c This function computes the left hand side in the xi-direction -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c determine a (labeled f) and n jacobians for cell c -c--------------------------------------------------------------------- - do j=start(2,c),jsize - do i = start(1,c)-1, cell_size(1,c) - end(1,c) - - tmp1 = rho_i(i,j,k,c) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 -c--------------------------------------------------------------------- -c -c--------------------------------------------------------------------- - fjac(1,1,i,j) = 0.0d+00 - fjac(1,2,i,j) = 1.0d+00 - fjac(1,3,i,j) = 0.0d+00 - fjac(1,4,i,j) = 0.0d+00 - fjac(1,5,i,j) = 0.0d+00 - - fjac(2,1,i,j) = -(u(2,i,j,k,c) * tmp2 * - > u(2,i,j,k,c)) - > + c2 * qs(i,j,k,c) - fjac(2,2,i,j) = ( 2.0d+00 - c2 ) - > * ( u(2,i,j,k,c) * tmp1 ) - fjac(2,3,i,j) = - c2 * ( u(3,i,j,k,c) * tmp1 ) - fjac(2,4,i,j) = - c2 * ( u(4,i,j,k,c) * tmp1 ) - fjac(2,5,i,j) = c2 - - fjac(3,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2 - fjac(3,2,i,j) = u(3,i,j,k,c) * tmp1 - fjac(3,3,i,j) = u(2,i,j,k,c) * tmp1 - fjac(3,4,i,j) = 0.0d+00 - fjac(3,5,i,j) = 0.0d+00 - - fjac(4,1,i,j) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2 - fjac(4,2,i,j) = u(4,i,j,k,c) * tmp1 - fjac(4,3,i,j) = 0.0d+00 - fjac(4,4,i,j) = u(2,i,j,k,c) * tmp1 - fjac(4,5,i,j) = 0.0d+00 - - fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c) - > - c1 * ( u(5,i,j,k,c) * tmp1 ) ) - > * ( u(2,i,j,k,c) * tmp1 ) - fjac(5,2,i,j) = c1 * u(5,i,j,k,c) * tmp1 - > - c2 - > * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2 - > + qs(i,j,k,c) ) - fjac(5,3,i,j) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) ) - > * tmp2 - fjac(5,4,i,j) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) ) - > * tmp2 - fjac(5,5,i,j) = c1 * ( u(2,i,j,k,c) * tmp1 ) - - njac(1,1,i,j) = 0.0d+00 - njac(1,2,i,j) = 0.0d+00 - njac(1,3,i,j) = 0.0d+00 - njac(1,4,i,j) = 0.0d+00 - njac(1,5,i,j) = 0.0d+00 - - njac(2,1,i,j) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c) - njac(2,2,i,j) = con43 * c3c4 * tmp1 - njac(2,3,i,j) = 0.0d+00 - njac(2,4,i,j) = 0.0d+00 - njac(2,5,i,j) = 0.0d+00 - - njac(3,1,i,j) = - c3c4 * tmp2 * u(3,i,j,k,c) - njac(3,2,i,j) = 0.0d+00 - njac(3,3,i,j) = c3c4 * tmp1 - njac(3,4,i,j) = 0.0d+00 - njac(3,5,i,j) = 0.0d+00 - - njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c) - njac(4,2,i,j) = 0.0d+00 - njac(4,3,i,j) = 0.0d+00 - njac(4,4,i,j) = c3c4 * tmp1 - njac(4,5,i,j) = 0.0d+00 - - njac(5,1,i,j) = - ( con43 * c3c4 - > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2) - > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2) - > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2) - > - c1345 * tmp2 * u(5,i,j,k,c) - - njac(5,2,i,j) = ( con43 * c3c4 - > - c1345 ) * tmp2 * u(2,i,j,k,c) - njac(5,3,i,j) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c) - njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c) - njac(5,5,i,j) = ( c1345 ) * tmp1 - - enddo - enddo - -c--------------------------------------------------------------------- -c now jacobians set, so form left hand side in x direction -c--------------------------------------------------------------------- - do j=start(2,c),jsize - do i = start(1,c), isize - end(1,c) - - tmp1 = dt * tx1 - tmp2 = dt * tx2 - - lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i-1,j) - > - tmp1 * njac(1,1,i-1,j) - > - tmp1 * dx1 - lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i-1,j) - > - tmp1 * njac(1,2,i-1,j) - lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i-1,j) - > - tmp1 * njac(1,3,i-1,j) - lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i-1,j) - > - tmp1 * njac(1,4,i-1,j) - lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i-1,j) - > - tmp1 * njac(1,5,i-1,j) - - lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i-1,j) - > - tmp1 * njac(2,1,i-1,j) - lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i-1,j) - > - tmp1 * njac(2,2,i-1,j) - > - tmp1 * dx2 - lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i-1,j) - > - tmp1 * njac(2,3,i-1,j) - lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i-1,j) - > - tmp1 * njac(2,4,i-1,j) - lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i-1,j) - > - tmp1 * njac(2,5,i-1,j) - - lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i-1,j) - > - tmp1 * njac(3,1,i-1,j) - lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i-1,j) - > - tmp1 * njac(3,2,i-1,j) - lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i-1,j) - > - tmp1 * njac(3,3,i-1,j) - > - tmp1 * dx3 - lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i-1,j) - > - tmp1 * njac(3,4,i-1,j) - lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i-1,j) - > - tmp1 * njac(3,5,i-1,j) - - lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i-1,j) - > - tmp1 * njac(4,1,i-1,j) - lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i-1,j) - > - tmp1 * njac(4,2,i-1,j) - lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i-1,j) - > - tmp1 * njac(4,3,i-1,j) - lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i-1,j) - > - tmp1 * njac(4,4,i-1,j) - > - tmp1 * dx4 - lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i-1,j) - > - tmp1 * njac(4,5,i-1,j) - - lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i-1,j) - > - tmp1 * njac(5,1,i-1,j) - lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i-1,j) - > - tmp1 * njac(5,2,i-1,j) - lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i-1,j) - > - tmp1 * njac(5,3,i-1,j) - lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i-1,j) - > - tmp1 * njac(5,4,i-1,j) - lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i-1,j) - > - tmp1 * njac(5,5,i-1,j) - > - tmp1 * dx5 - - lhsb(1,1,i,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(1,1,i,j) - > + tmp1 * 2.0d+00 * dx1 - lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j) - lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j) - lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j) - lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j) - - lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j) - lhsb(2,2,i,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(2,2,i,j) - > + tmp1 * 2.0d+00 * dx2 - lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j) - lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j) - lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j) - - lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j) - lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j) - lhsb(3,3,i,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(3,3,i,j) - > + tmp1 * 2.0d+00 * dx3 - lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j) - lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j) - - lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j) - lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j) - lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j) - lhsb(4,4,i,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(4,4,i,j) - > + tmp1 * 2.0d+00 * dx4 - lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j) - - lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j) - lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j) - lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j) - lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j) - lhsb(5,5,i,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(5,5,i,j) - > + tmp1 * 2.0d+00 * dx5 - - lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i+1,j) - > - tmp1 * njac(1,1,i+1,j) - > - tmp1 * dx1 - lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i+1,j) - > - tmp1 * njac(1,2,i+1,j) - lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i+1,j) - > - tmp1 * njac(1,3,i+1,j) - lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i+1,j) - > - tmp1 * njac(1,4,i+1,j) - lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i+1,j) - > - tmp1 * njac(1,5,i+1,j) - - lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i+1,j) - > - tmp1 * njac(2,1,i+1,j) - lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i+1,j) - > - tmp1 * njac(2,2,i+1,j) - > - tmp1 * dx2 - lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i+1,j) - > - tmp1 * njac(2,3,i+1,j) - lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i+1,j) - > - tmp1 * njac(2,4,i+1,j) - lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i+1,j) - > - tmp1 * njac(2,5,i+1,j) - - lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i+1,j) - > - tmp1 * njac(3,1,i+1,j) - lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i+1,j) - > - tmp1 * njac(3,2,i+1,j) - lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i+1,j) - > - tmp1 * njac(3,3,i+1,j) - > - tmp1 * dx3 - lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i+1,j) - > - tmp1 * njac(3,4,i+1,j) - lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i+1,j) - > - tmp1 * njac(3,5,i+1,j) - - lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i+1,j) - > - tmp1 * njac(4,1,i+1,j) - lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i+1,j) - > - tmp1 * njac(4,2,i+1,j) - lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i+1,j) - > - tmp1 * njac(4,3,i+1,j) - lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i+1,j) - > - tmp1 * njac(4,4,i+1,j) - > - tmp1 * dx4 - lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i+1,j) - > - tmp1 * njac(4,5,i+1,j) - - lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i+1,j) - > - tmp1 * njac(5,1,i+1,j) - lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i+1,j) - > - tmp1 * njac(5,2,i+1,j) - lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i+1,j) - > - tmp1 * njac(5,3,i+1,j) - lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i+1,j) - > - tmp1 * njac(5,4,i+1,j) - lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i+1,j) - > - tmp1 * njac(5,5,i+1,j) - > - tmp1 * dx5 - - enddo - enddo - - -c--------------------------------------------------------------------- -c outer most do loops - sweeping in i direction -c--------------------------------------------------------------------- - if (first .eq. 1) then - -c--------------------------------------------------------------------- -c multiply c(istart,j,k) by b_inverse and copy back to c -c multiply rhs(istart) by b_inverse(istart) and copy to rhs -c--------------------------------------------------------------------- -!dir$ ivdep - do j=start(2,c),jsize - call binvcrhs( lhsb(1,1,istart,j), - > lhsc(1,1,istart,j,k,c), - > rhs(1,istart,j,k,c) ) - enddo - - endif - -c--------------------------------------------------------------------- -c begin inner most do loop -c do all the elements of the cell unless last -c--------------------------------------------------------------------- -!dir$ ivdep -!dir$ interchange(i,j) - do j=start(2,c),jsize - do i=istart+first,isize-last - -c--------------------------------------------------------------------- -c rhs(i) = rhs(i) - A*rhs(i-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,i,j), - > rhs(1,i-1,j,k,c),rhs(1,i,j,k,c)) - -c--------------------------------------------------------------------- -c B(i) = B(i) - C(i-1)*A(i) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,i,j), - > lhsc(1,1,i-1,j,k,c), - > lhsb(1,1,i,j)) - - -c--------------------------------------------------------------------- -c multiply c(i,j,k) by b_inverse and copy back to c -c multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs -c--------------------------------------------------------------------- - call binvcrhs( lhsb(1,1,i,j), - > lhsc(1,1,i,j,k,c), - > rhs(1,i,j,k,c) ) - - enddo - enddo - -c--------------------------------------------------------------------- -c Now finish up special cases for last cell -c--------------------------------------------------------------------- - if (last .eq. 1) then - -!dir$ ivdep - do j=start(2,c),jsize -c--------------------------------------------------------------------- -c rhs(isize) = rhs(isize) - A*rhs(isize-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,isize,j), - > rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c)) - -c--------------------------------------------------------------------- -c B(isize) = B(isize) - C(isize-1)*A(isize) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,isize,j), - > lhsc(1,1,isize-1,j,k,c), - > lhsb(1,1,isize,j)) - -c--------------------------------------------------------------------- -c multiply rhs() by b_inverse() and copy to rhs -c--------------------------------------------------------------------- - call binvrhs( lhsb(1,1,isize,j), - > rhs(1,isize,j,k,c) ) - enddo - - endif - enddo - - - return - end - diff --git a/examples/smpi/NAS/BT/y_solve.f b/examples/smpi/NAS/BT/y_solve.f deleted file mode 100644 index 33e2ebc018..0000000000 --- a/examples/smpi/NAS/BT/y_solve.f +++ /dev/null @@ -1,771 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_solve - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Performs line solves in Y direction by first factoring -c the block-tridiagonal matrix into an upper triangular matrix, -c and then performing back substitution to solve for the unknow -c vectors of each line. -c -c Make sure we treat elements zero to cell_size in the direction -c of the sweep. -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer - > c, jstart, stage, - > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), - > isize,jsize,ksize,send_id - - jstart = 0 - -c--------------------------------------------------------------------- -c in our terminology stage is the number of the cell in the y-direction -c i.e. stage = 1 means the start of the line stage=ncells means end -c--------------------------------------------------------------------- - do stage = 1,ncells - c = slice(2,stage) - isize = cell_size(1,c) - 1 - jsize = cell_size(2,c) - 1 - ksize = cell_size(3,c) - 1 - -c--------------------------------------------------------------------- -c set last-cell flag -c--------------------------------------------------------------------- - if (stage .eq. ncells) then - last = 1 - else - last = 0 - endif - - if (stage .eq. 1) then -c--------------------------------------------------------------------- -c This is the first cell, so solve without receiving data -c--------------------------------------------------------------------- - first = 1 -c call lhsy(c) - call y_solve_cell(first,last,c) - else -c--------------------------------------------------------------------- -c Not the first cell of this line, so receive info from -c processor working on preceeding cell -c--------------------------------------------------------------------- - first = 0 - call y_receive_solve_info(recv_id,c) -c--------------------------------------------------------------------- -c overlap computations and communications -c--------------------------------------------------------------------- -c call lhsy(c) -c--------------------------------------------------------------------- -c wait for completion -c--------------------------------------------------------------------- - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) -c--------------------------------------------------------------------- -c install C'(jstart+1) and rhs'(jstart+1) to be used in this cell -c--------------------------------------------------------------------- - call y_unpack_solve_info(c) - call y_solve_cell(first,last,c) - endif - - if (last .eq. 0) call y_send_solve_info(send_id,c) - enddo - -c--------------------------------------------------------------------- -c now perform backsubstitution in reverse direction -c--------------------------------------------------------------------- - do stage = ncells, 1, -1 - c = slice(2,stage) - first = 0 - last = 0 - if (stage .eq. 1) first = 1 - if (stage .eq. ncells) then - last = 1 -c--------------------------------------------------------------------- -c last cell, so perform back substitute without waiting -c--------------------------------------------------------------------- - call y_backsubstitute(first, last,c) - else - call y_receive_backsub_info(recv_id,c) - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) - call y_unpack_backsub_info(c) - call y_backsubstitute(first,last,c) - endif - if (first .eq. 0) call y_send_backsub_info(send_id,c) - enddo - - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_unpack_solve_info(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack C'(-1) and rhs'(-1) for -c all i and k -c--------------------------------------------------------------------- - - include 'header.h' - - integer i,k,m,n,ptr,c,jstart - - jstart = 0 - ptr = 0 - do k=0,KMAX-1 - do i=0,IMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_send_solve_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send C'(jend) and rhs'(jend) for -c all i and k -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i,k,m,n,jsize,ptr,c,ip,kp - integer error,send_id,buffer_size - - jsize = cell_size(2,c)-1 - ip = cell_coord(1,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - -c--------------------------------------------------------------------- -c pack up buffer -c--------------------------------------------------------------------- - ptr = 0 - do k=0,KMAX-1 - do i=0,IMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,i,jsize,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - -c--------------------------------------------------------------------- -c send buffer -c--------------------------------------------------------------------- - call mpi_isend(in_buffer, buffer_size, - > dp_type, successor(2), - > SOUTH+ip+kp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_send_backsub_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send U(jstart) for all i and k -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i,k,n,ptr,c,jstart,ip,kp - integer error,send_id,buffer_size - -c--------------------------------------------------------------------- -c Send element 0 to previous processor -c--------------------------------------------------------------------- - jstart = 0 - ip = cell_coord(1,c)-1 - kp = cell_coord(3,c)-1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - ptr = 0 - do k=0,KMAX-1 - do i=0,IMAX-1 - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,i,jstart,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - call mpi_isend(in_buffer, buffer_size, - > dp_type, predecessor(2), - > NORTH+ip+kp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_unpack_backsub_info(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack U(jsize) for all i and k -c--------------------------------------------------------------------- - - include 'header.h' - - integer i,k,n,ptr,c - - ptr = 0 - do k=0,KMAX-1 - do i=0,IMAX-1 - do n=1,BLOCK_SIZE - backsub_info(n,i,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_receive_backsub_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer error,recv_id,ip,kp,c,buffer_size - ip = cell_coord(1,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - call mpi_irecv(out_buffer, buffer_size, - > dp_type, successor(2), - > NORTH+ip+kp*NCELLS, comm_solve, - > recv_id, error) - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_receive_solve_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer ip,kp,recv_id,error,c,buffer_size - ip = cell_coord(1,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - call mpi_irecv(out_buffer, buffer_size, - > dp_type, predecessor(2), - > SOUTH+ip+kp*NCELLS, comm_solve, - > recv_id, error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_backsubstitute(first, last, c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c back solve: if last cell, then generate U(jsize)=rhs(jsize) -c else assume U(jsize) is loaded in un pack backsub_info -c so just use it -c after call u(jstart) will be sent to next cell -c--------------------------------------------------------------------- - - include 'header.h' - - integer first, last, c, i, k - integer m,n,j,jsize,isize,ksize,jstart - - jstart = 0 - isize = cell_size(1,c)-end(1,c)-1 - jsize = cell_size(2,c)-1 - ksize = cell_size(3,c)-end(3,c)-1 - if (last .eq. 0) then - do k=start(3,c),ksize - do i=start(1,c),isize -c--------------------------------------------------------------------- -c U(jsize) uses info from previous cell if not last cell -c--------------------------------------------------------------------- - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c) - > - lhsc(m,n,i,jsize,k,c)* - > backsub_info(n,i,k,c) - enddo - enddo - enddo - enddo - endif - do k=start(3,c),ksize - do j=jsize-1,jstart,-1 - do i=start(1,c),isize - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - > - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c) - enddo - enddo - enddo - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_solve_cell(first,last,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c performs guaussian elimination on this cell. -c -c assumes that unpacking routines for non-first cells -c preload C' and rhs' from previous cell. -c -c assumed send happens outside this routine, but that -c c'(JMAX) and rhs'(JMAX) will be sent to next cell -c--------------------------------------------------------------------- - - include 'header.h' - include 'work_lhs.h' - - integer first,last,c - integer i,j,k,isize,ksize,jsize,jstart - double precision utmp(6,-2:JMAX+1) - - jstart = 0 - isize = cell_size(1,c)-end(1,c)-1 - jsize = cell_size(2,c)-1 - ksize = cell_size(3,c)-end(3,c)-1 - - call lhsabinit(lhsa, lhsb, jsize) - - do k=start(3,c),ksize - do i=start(1,c),isize - -c--------------------------------------------------------------------- -c This function computes the left hand side for the three y-factors -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Compute the indices for storing the tri-diagonal matrix; -c determine a (labeled f) and n jacobians for cell c -c--------------------------------------------------------------------- - do j = start(2,c)-1, cell_size(2,c)-end(2,c) - utmp(1,j) = 1.0d0 / u(1,i,j,k,c) - utmp(2,j) = u(2,i,j,k,c) - utmp(3,j) = u(3,i,j,k,c) - utmp(4,j) = u(4,i,j,k,c) - utmp(5,j) = u(5,i,j,k,c) - utmp(6,j) = qs(i,j,k,c) - end do - - do j = start(2,c)-1, cell_size(2,c)-end(2,c) - - tmp1 = utmp(1,j) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - fjac(1,1,j) = 0.0d+00 - fjac(1,2,j) = 0.0d+00 - fjac(1,3,j) = 1.0d+00 - fjac(1,4,j) = 0.0d+00 - fjac(1,5,j) = 0.0d+00 - - fjac(2,1,j) = - ( utmp(2,j)*utmp(3,j) ) - > * tmp2 - fjac(2,2,j) = utmp(3,j) * tmp1 - fjac(2,3,j) = utmp(2,j) * tmp1 - fjac(2,4,j) = 0.0d+00 - fjac(2,5,j) = 0.0d+00 - - fjac(3,1,j) = - ( utmp(3,j)*utmp(3,j)*tmp2) - > + c2 * utmp(6,j) - fjac(3,2,j) = - c2 * utmp(2,j) * tmp1 - fjac(3,3,j) = ( 2.0d+00 - c2 ) - > * utmp(3,j) * tmp1 - fjac(3,4,j) = - c2 * utmp(4,j) * tmp1 - fjac(3,5,j) = c2 - - fjac(4,1,j) = - ( utmp(3,j)*utmp(4,j) ) - > * tmp2 - fjac(4,2,j) = 0.0d+00 - fjac(4,3,j) = utmp(4,j) * tmp1 - fjac(4,4,j) = utmp(3,j) * tmp1 - fjac(4,5,j) = 0.0d+00 - - fjac(5,1,j) = ( c2 * 2.0d0 * utmp(6,j) - > - c1 * utmp(5,j) * tmp1 ) - > * utmp(3,j) * tmp1 - fjac(5,2,j) = - c2 * utmp(2,j)*utmp(3,j) - > * tmp2 - fjac(5,3,j) = c1 * utmp(5,j) * tmp1 - > - c2 * ( utmp(6,j) - > + utmp(3,j)*utmp(3,j) * tmp2 ) - fjac(5,4,j) = - c2 * ( utmp(3,j)*utmp(4,j) ) - > * tmp2 - fjac(5,5,j) = c1 * utmp(3,j) * tmp1 - - njac(1,1,j) = 0.0d+00 - njac(1,2,j) = 0.0d+00 - njac(1,3,j) = 0.0d+00 - njac(1,4,j) = 0.0d+00 - njac(1,5,j) = 0.0d+00 - - njac(2,1,j) = - c3c4 * tmp2 * utmp(2,j) - njac(2,2,j) = c3c4 * tmp1 - njac(2,3,j) = 0.0d+00 - njac(2,4,j) = 0.0d+00 - njac(2,5,j) = 0.0d+00 - - njac(3,1,j) = - con43 * c3c4 * tmp2 * utmp(3,j) - njac(3,2,j) = 0.0d+00 - njac(3,3,j) = con43 * c3c4 * tmp1 - njac(3,4,j) = 0.0d+00 - njac(3,5,j) = 0.0d+00 - - njac(4,1,j) = - c3c4 * tmp2 * utmp(4,j) - njac(4,2,j) = 0.0d+00 - njac(4,3,j) = 0.0d+00 - njac(4,4,j) = c3c4 * tmp1 - njac(4,5,j) = 0.0d+00 - - njac(5,1,j) = - ( c3c4 - > - c1345 ) * tmp3 * (utmp(2,j)**2) - > - ( con43 * c3c4 - > - c1345 ) * tmp3 * (utmp(3,j)**2) - > - ( c3c4 - c1345 ) * tmp3 * (utmp(4,j)**2) - > - c1345 * tmp2 * utmp(5,j) - - njac(5,2,j) = ( c3c4 - c1345 ) * tmp2 * utmp(2,j) - njac(5,3,j) = ( con43 * c3c4 - > - c1345 ) * tmp2 * utmp(3,j) - njac(5,4,j) = ( c3c4 - c1345 ) * tmp2 * utmp(4,j) - njac(5,5,j) = ( c1345 ) * tmp1 - - enddo - -c--------------------------------------------------------------------- -c now joacobians set, so form left hand side in y direction -c--------------------------------------------------------------------- - do j = start(2,c), jsize-end(2,c) - - tmp1 = dt * ty1 - tmp2 = dt * ty2 - - lhsa(1,1,j) = - tmp2 * fjac(1,1,j-1) - > - tmp1 * njac(1,1,j-1) - > - tmp1 * dy1 - lhsa(1,2,j) = - tmp2 * fjac(1,2,j-1) - > - tmp1 * njac(1,2,j-1) - lhsa(1,3,j) = - tmp2 * fjac(1,3,j-1) - > - tmp1 * njac(1,3,j-1) - lhsa(1,4,j) = - tmp2 * fjac(1,4,j-1) - > - tmp1 * njac(1,4,j-1) - lhsa(1,5,j) = - tmp2 * fjac(1,5,j-1) - > - tmp1 * njac(1,5,j-1) - - lhsa(2,1,j) = - tmp2 * fjac(2,1,j-1) - > - tmp1 * njac(2,1,j-1) - lhsa(2,2,j) = - tmp2 * fjac(2,2,j-1) - > - tmp1 * njac(2,2,j-1) - > - tmp1 * dy2 - lhsa(2,3,j) = - tmp2 * fjac(2,3,j-1) - > - tmp1 * njac(2,3,j-1) - lhsa(2,4,j) = - tmp2 * fjac(2,4,j-1) - > - tmp1 * njac(2,4,j-1) - lhsa(2,5,j) = - tmp2 * fjac(2,5,j-1) - > - tmp1 * njac(2,5,j-1) - - lhsa(3,1,j) = - tmp2 * fjac(3,1,j-1) - > - tmp1 * njac(3,1,j-1) - lhsa(3,2,j) = - tmp2 * fjac(3,2,j-1) - > - tmp1 * njac(3,2,j-1) - lhsa(3,3,j) = - tmp2 * fjac(3,3,j-1) - > - tmp1 * njac(3,3,j-1) - > - tmp1 * dy3 - lhsa(3,4,j) = - tmp2 * fjac(3,4,j-1) - > - tmp1 * njac(3,4,j-1) - lhsa(3,5,j) = - tmp2 * fjac(3,5,j-1) - > - tmp1 * njac(3,5,j-1) - - lhsa(4,1,j) = - tmp2 * fjac(4,1,j-1) - > - tmp1 * njac(4,1,j-1) - lhsa(4,2,j) = - tmp2 * fjac(4,2,j-1) - > - tmp1 * njac(4,2,j-1) - lhsa(4,3,j) = - tmp2 * fjac(4,3,j-1) - > - tmp1 * njac(4,3,j-1) - lhsa(4,4,j) = - tmp2 * fjac(4,4,j-1) - > - tmp1 * njac(4,4,j-1) - > - tmp1 * dy4 - lhsa(4,5,j) = - tmp2 * fjac(4,5,j-1) - > - tmp1 * njac(4,5,j-1) - - lhsa(5,1,j) = - tmp2 * fjac(5,1,j-1) - > - tmp1 * njac(5,1,j-1) - lhsa(5,2,j) = - tmp2 * fjac(5,2,j-1) - > - tmp1 * njac(5,2,j-1) - lhsa(5,3,j) = - tmp2 * fjac(5,3,j-1) - > - tmp1 * njac(5,3,j-1) - lhsa(5,4,j) = - tmp2 * fjac(5,4,j-1) - > - tmp1 * njac(5,4,j-1) - lhsa(5,5,j) = - tmp2 * fjac(5,5,j-1) - > - tmp1 * njac(5,5,j-1) - > - tmp1 * dy5 - - lhsb(1,1,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(1,1,j) - > + tmp1 * 2.0d+00 * dy1 - lhsb(1,2,j) = tmp1 * 2.0d+00 * njac(1,2,j) - lhsb(1,3,j) = tmp1 * 2.0d+00 * njac(1,3,j) - lhsb(1,4,j) = tmp1 * 2.0d+00 * njac(1,4,j) - lhsb(1,5,j) = tmp1 * 2.0d+00 * njac(1,5,j) - - lhsb(2,1,j) = tmp1 * 2.0d+00 * njac(2,1,j) - lhsb(2,2,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(2,2,j) - > + tmp1 * 2.0d+00 * dy2 - lhsb(2,3,j) = tmp1 * 2.0d+00 * njac(2,3,j) - lhsb(2,4,j) = tmp1 * 2.0d+00 * njac(2,4,j) - lhsb(2,5,j) = tmp1 * 2.0d+00 * njac(2,5,j) - - lhsb(3,1,j) = tmp1 * 2.0d+00 * njac(3,1,j) - lhsb(3,2,j) = tmp1 * 2.0d+00 * njac(3,2,j) - lhsb(3,3,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(3,3,j) - > + tmp1 * 2.0d+00 * dy3 - lhsb(3,4,j) = tmp1 * 2.0d+00 * njac(3,4,j) - lhsb(3,5,j) = tmp1 * 2.0d+00 * njac(3,5,j) - - lhsb(4,1,j) = tmp1 * 2.0d+00 * njac(4,1,j) - lhsb(4,2,j) = tmp1 * 2.0d+00 * njac(4,2,j) - lhsb(4,3,j) = tmp1 * 2.0d+00 * njac(4,3,j) - lhsb(4,4,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(4,4,j) - > + tmp1 * 2.0d+00 * dy4 - lhsb(4,5,j) = tmp1 * 2.0d+00 * njac(4,5,j) - - lhsb(5,1,j) = tmp1 * 2.0d+00 * njac(5,1,j) - lhsb(5,2,j) = tmp1 * 2.0d+00 * njac(5,2,j) - lhsb(5,3,j) = tmp1 * 2.0d+00 * njac(5,3,j) - lhsb(5,4,j) = tmp1 * 2.0d+00 * njac(5,4,j) - lhsb(5,5,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(5,5,j) - > + tmp1 * 2.0d+00 * dy5 - - lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,j+1) - > - tmp1 * njac(1,1,j+1) - > - tmp1 * dy1 - lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,j+1) - > - tmp1 * njac(1,2,j+1) - lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,j+1) - > - tmp1 * njac(1,3,j+1) - lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,j+1) - > - tmp1 * njac(1,4,j+1) - lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,j+1) - > - tmp1 * njac(1,5,j+1) - - lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,j+1) - > - tmp1 * njac(2,1,j+1) - lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,j+1) - > - tmp1 * njac(2,2,j+1) - > - tmp1 * dy2 - lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,j+1) - > - tmp1 * njac(2,3,j+1) - lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,j+1) - > - tmp1 * njac(2,4,j+1) - lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,j+1) - > - tmp1 * njac(2,5,j+1) - - lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,j+1) - > - tmp1 * njac(3,1,j+1) - lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,j+1) - > - tmp1 * njac(3,2,j+1) - lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,j+1) - > - tmp1 * njac(3,3,j+1) - > - tmp1 * dy3 - lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,j+1) - > - tmp1 * njac(3,4,j+1) - lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,j+1) - > - tmp1 * njac(3,5,j+1) - - lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,j+1) - > - tmp1 * njac(4,1,j+1) - lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,j+1) - > - tmp1 * njac(4,2,j+1) - lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,j+1) - > - tmp1 * njac(4,3,j+1) - lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,j+1) - > - tmp1 * njac(4,4,j+1) - > - tmp1 * dy4 - lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,j+1) - > - tmp1 * njac(4,5,j+1) - - lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,j+1) - > - tmp1 * njac(5,1,j+1) - lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,j+1) - > - tmp1 * njac(5,2,j+1) - lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,j+1) - > - tmp1 * njac(5,3,j+1) - lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,j+1) - > - tmp1 * njac(5,4,j+1) - lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,j+1) - > - tmp1 * njac(5,5,j+1) - > - tmp1 * dy5 - - enddo - - -c--------------------------------------------------------------------- -c outer most do loops - sweeping in i direction -c--------------------------------------------------------------------- - if (first .eq. 1) then - -c--------------------------------------------------------------------- -c multiply c(i,jstart,k) by b_inverse and copy back to c -c multiply rhs(jstart) by b_inverse(jstart) and copy to rhs -c--------------------------------------------------------------------- - call binvcrhs( lhsb(1,1,jstart), - > lhsc(1,1,i,jstart,k,c), - > rhs(1,i,jstart,k,c) ) - - endif - -c--------------------------------------------------------------------- -c begin inner most do loop -c do all the elements of the cell unless last -c--------------------------------------------------------------------- - do j=jstart+first,jsize-last - -c--------------------------------------------------------------------- -c subtract A*lhs_vector(j-1) from lhs_vector(j) -c -c rhs(j) = rhs(j) - A*rhs(j-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,j), - > rhs(1,i,j-1,k,c),rhs(1,i,j,k,c)) - -c--------------------------------------------------------------------- -c B(j) = B(j) - C(j-1)*A(j) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,j), - > lhsc(1,1,i,j-1,k,c), - > lhsb(1,1,j)) - -c--------------------------------------------------------------------- -c multiply c(i,j,k) by b_inverse and copy back to c -c multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs -c--------------------------------------------------------------------- - call binvcrhs( lhsb(1,1,j), - > lhsc(1,1,i,j,k,c), - > rhs(1,i,j,k,c) ) - - enddo - -c--------------------------------------------------------------------- -c Now finish up special cases for last cell -c--------------------------------------------------------------------- - if (last .eq. 1) then - -c--------------------------------------------------------------------- -c rhs(jsize) = rhs(jsize) - A*rhs(jsize-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,jsize), - > rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c)) - -c--------------------------------------------------------------------- -c B(jsize) = B(jsize) - C(jsize-1)*A(jsize) -c call matmul_sub(aa,i,jsize,k,c, -c $ cc,i,jsize-1,k,c,bb,i,jsize,k,c) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,jsize), - > lhsc(1,1,i,jsize-1,k,c), - > lhsb(1,1,jsize)) - -c--------------------------------------------------------------------- -c multiply rhs(jsize) by b_inverse(jsize) and copy to rhs -c--------------------------------------------------------------------- - call binvrhs( lhsb(1,1,jsize), - > rhs(1,i,jsize,k,c) ) - - endif - enddo - enddo - - - return - end - - - diff --git a/examples/smpi/NAS/BT/y_solve_vec.f b/examples/smpi/NAS/BT/y_solve_vec.f deleted file mode 100644 index e21cfa36ce..0000000000 --- a/examples/smpi/NAS/BT/y_solve_vec.f +++ /dev/null @@ -1,788 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_solve - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Performs line solves in Y direction by first factoring -c the block-tridiagonal matrix into an upper triangular matrix, -c and then performing back substitution to solve for the unknow -c vectors of each line. -c -c Make sure we treat elements zero to cell_size in the direction -c of the sweep. -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer - > c, jstart, stage, - > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), - > isize,jsize,ksize,send_id - - jstart = 0 - -c--------------------------------------------------------------------- -c in our terminology stage is the number of the cell in the y-direct -c i.e. stage = 1 means the start of the line stage=ncells means end -c--------------------------------------------------------------------- - do stage = 1,ncells - c = slice(2,stage) - isize = cell_size(1,c) - 1 - jsize = cell_size(2,c) - 1 - ksize = cell_size(3,c) - 1 - -c--------------------------------------------------------------------- -c set last-cell flag -c--------------------------------------------------------------------- - if (stage .eq. ncells) then - last = 1 - else - last = 0 - endif - - if (stage .eq. 1) then -c--------------------------------------------------------------------- -c This is the first cell, so solve without receiving data -c--------------------------------------------------------------------- - first = 1 -c call lhsy(c) - call y_solve_cell(first,last,c) - else -c--------------------------------------------------------------------- -c Not the first cell of this line, so receive info from -c processor working on preceeding cell -c--------------------------------------------------------------------- - first = 0 - call y_receive_solve_info(recv_id,c) -c--------------------------------------------------------------------- -c overlap computations and communications -c--------------------------------------------------------------------- -c call lhsy(c) -c--------------------------------------------------------------------- -c wait for completion -c--------------------------------------------------------------------- - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) -c--------------------------------------------------------------------- -c install C'(jstart+1) and rhs'(jstart+1) to be used in this cell -c--------------------------------------------------------------------- - call y_unpack_solve_info(c) - call y_solve_cell(first,last,c) - endif - - if (last .eq. 0) call y_send_solve_info(send_id,c) - enddo - -c--------------------------------------------------------------------- -c now perform backsubstitution in reverse direction -c--------------------------------------------------------------------- - do stage = ncells, 1, -1 - c = slice(2,stage) - first = 0 - last = 0 - if (stage .eq. 1) first = 1 - if (stage .eq. ncells) then - last = 1 -c--------------------------------------------------------------------- -c last cell, so perform back substitute without waiting -c--------------------------------------------------------------------- - call y_backsubstitute(first, last,c) - else - call y_receive_backsub_info(recv_id,c) - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) - call y_unpack_backsub_info(c) - call y_backsubstitute(first,last,c) - endif - if (first .eq. 0) call y_send_backsub_info(send_id,c) - enddo - - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_unpack_solve_info(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack C'(-1) and rhs'(-1) for -c all i and k -c--------------------------------------------------------------------- - - include 'header.h' - - integer i,k,m,n,ptr,c,jstart - - jstart = 0 - ptr = 0 - do k=0,KMAX-1 - do i=0,IMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_send_solve_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send C'(jend) and rhs'(jend) for -c all i and k -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i,k,m,n,jsize,ptr,c,ip,kp - integer error,send_id,buffer_size - - jsize = cell_size(2,c)-1 - ip = cell_coord(1,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - -c--------------------------------------------------------------------- -c pack up buffer -c--------------------------------------------------------------------- - ptr = 0 - do k=0,KMAX-1 - do i=0,IMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,i,jsize,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - -c--------------------------------------------------------------------- -c send buffer -c--------------------------------------------------------------------- - call mpi_isend(in_buffer, buffer_size, - > dp_type, successor(2), - > SOUTH+ip+kp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_send_backsub_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send U(jstart) for all i and k -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i,k,n,ptr,c,jstart,ip,kp - integer error,send_id,buffer_size - -c--------------------------------------------------------------------- -c Send element 0 to previous processor -c--------------------------------------------------------------------- - jstart = 0 - ip = cell_coord(1,c)-1 - kp = cell_coord(3,c)-1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - ptr = 0 - do k=0,KMAX-1 - do i=0,IMAX-1 - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,i,jstart,k,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - call mpi_isend(in_buffer, buffer_size, - > dp_type, predecessor(2), - > NORTH+ip+kp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_unpack_backsub_info(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack U(jsize) for all i and k -c--------------------------------------------------------------------- - - include 'header.h' - - integer i,k,n,ptr,c - - ptr = 0 - do k=0,KMAX-1 - do i=0,IMAX-1 - do n=1,BLOCK_SIZE - backsub_info(n,i,k,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_receive_backsub_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer error,recv_id,ip,kp,c,buffer_size - ip = cell_coord(1,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - call mpi_irecv(out_buffer, buffer_size, - > dp_type, successor(2), - > NORTH+ip+kp*NCELLS, comm_solve, - > recv_id, error) - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_receive_solve_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer ip,kp,recv_id,error,c,buffer_size - ip = cell_coord(1,c) - 1 - kp = cell_coord(3,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - call mpi_irecv(out_buffer, buffer_size, - > dp_type, predecessor(2), - > SOUTH+ip+kp*NCELLS, comm_solve, - > recv_id, error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_backsubstitute(first, last, c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c back solve: if last cell, then generate U(jsize)=rhs(jsize) -c else assume U(jsize) is loaded in un pack backsub_info -c so just use it -c after call u(jstart) will be sent to next cell -c--------------------------------------------------------------------- - - include 'header.h' - - integer first, last, c, i, k - integer m,n,j,jsize,isize,ksize,jstart - - jstart = 0 - isize = cell_size(1,c)-end(1,c)-1 - jsize = cell_size(2,c)-1 - ksize = cell_size(3,c)-end(3,c)-1 - if (last .eq. 0) then - do k=start(3,c),ksize - do i=start(1,c),isize -c--------------------------------------------------------------------- -c U(jsize) uses info from previous cell if not last cell -c--------------------------------------------------------------------- - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c) - > - lhsc(m,n,i,jsize,k,c)* - > backsub_info(n,i,k,c) - enddo - enddo - enddo - enddo - endif - do k=start(3,c),ksize - do j=jsize-1,jstart,-1 - do i=start(1,c),isize - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - > - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c) - enddo - enddo - enddo - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_solve_cell(first,last,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c performs guaussian elimination on this cell. -c -c assumes that unpacking routines for non-first cells -c preload C' and rhs' from previous cell. -c -c assumed send happens outside this routine, but that -c c'(JMAX) and rhs'(JMAX) will be sent to next cell -c--------------------------------------------------------------------- - - include 'header.h' - include 'work_lhs_vec.h' - - integer first,last,c - integer i,j,k,m,n,isize,ksize,jsize,jstart - - jstart = 0 - isize = cell_size(1,c)-end(1,c)-1 - jsize = cell_size(2,c)-1 - ksize = cell_size(3,c)-end(3,c)-1 - -c--------------------------------------------------------------------- -c zero the left hand side for starters -c set diagonal values to 1. This is overkill, but convenient -c--------------------------------------------------------------------- - do i = 0, isize - do m = 1, 5 - do n = 1, 5 - lhsa(m,n,i,0) = 0.0d0 - lhsb(m,n,i,0) = 0.0d0 - lhsa(m,n,i,jsize) = 0.0d0 - lhsb(m,n,i,jsize) = 0.0d0 - enddo - lhsb(m,m,i,0) = 1.0d0 - lhsb(m,m,i,jsize) = 1.0d0 - enddo - enddo - - do k=start(3,c),ksize - -c--------------------------------------------------------------------- -c This function computes the left hand side for the three y-factors -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Compute the indices for storing the tri-diagonal matrix; -c determine a (labeled f) and n jacobians for cell c -c--------------------------------------------------------------------- - - do j = start(2,c)-1, cell_size(2,c)-end(2,c) - do i=start(1,c),isize - - tmp1 = 1.0d0 / u(1,i,j,k,c) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - fjac(1,1,i,j) = 0.0d+00 - fjac(1,2,i,j) = 0.0d+00 - fjac(1,3,i,j) = 1.0d+00 - fjac(1,4,i,j) = 0.0d+00 - fjac(1,5,i,j) = 0.0d+00 - - fjac(2,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) - > * tmp2 - fjac(2,2,i,j) = u(3,i,j,k,c) * tmp1 - fjac(2,3,i,j) = u(2,i,j,k,c) * tmp1 - fjac(2,4,i,j) = 0.0d+00 - fjac(2,5,i,j) = 0.0d+00 - - fjac(3,1,i,j) = - ( u(3,i,j,k,c)*u(3,i,j,k,c)*tmp2) - > + c2 * qs(i,j,k,c) - fjac(3,2,i,j) = - c2 * u(2,i,j,k,c) * tmp1 - fjac(3,3,i,j) = ( 2.0d+00 - c2 ) - > * u(3,i,j,k,c) * tmp1 - fjac(3,4,i,j) = - c2 * u(4,i,j,k,c) * tmp1 - fjac(3,5,i,j) = c2 - - fjac(4,1,i,j) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) ) - > * tmp2 - fjac(4,2,i,j) = 0.0d+00 - fjac(4,3,i,j) = u(4,i,j,k,c) * tmp1 - fjac(4,4,i,j) = u(3,i,j,k,c) * tmp1 - fjac(4,5,i,j) = 0.0d+00 - - fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c) - > - c1 * u(5,i,j,k,c) * tmp1 ) - > * u(3,i,j,k,c) * tmp1 - fjac(5,2,i,j) = - c2 * u(2,i,j,k,c)*u(3,i,j,k,c) - > * tmp2 - fjac(5,3,i,j) = c1 * u(5,i,j,k,c) * tmp1 - > - c2 * ( qs(i,j,k,c) - > + u(3,i,j,k,c)*u(3,i,j,k,c) * tmp2 ) - fjac(5,4,i,j) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) ) - > * tmp2 - fjac(5,5,i,j) = c1 * u(3,i,j,k,c) * tmp1 - - njac(1,1,i,j) = 0.0d+00 - njac(1,2,i,j) = 0.0d+00 - njac(1,3,i,j) = 0.0d+00 - njac(1,4,i,j) = 0.0d+00 - njac(1,5,i,j) = 0.0d+00 - - njac(2,1,i,j) = - c3c4 * tmp2 * u(2,i,j,k,c) - njac(2,2,i,j) = c3c4 * tmp1 - njac(2,3,i,j) = 0.0d+00 - njac(2,4,i,j) = 0.0d+00 - njac(2,5,i,j) = 0.0d+00 - - njac(3,1,i,j) = - con43 * c3c4 * tmp2 * u(3,i,j,k,c) - njac(3,2,i,j) = 0.0d+00 - njac(3,3,i,j) = con43 * c3c4 * tmp1 - njac(3,4,i,j) = 0.0d+00 - njac(3,5,i,j) = 0.0d+00 - - njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c) - njac(4,2,i,j) = 0.0d+00 - njac(4,3,i,j) = 0.0d+00 - njac(4,4,i,j) = c3c4 * tmp1 - njac(4,5,i,j) = 0.0d+00 - - njac(5,1,i,j) = - ( c3c4 - > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2) - > - ( con43 * c3c4 - > - c1345 ) * tmp3 * (u(3,i,j,k,c)**2) - > - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2) - > - c1345 * tmp2 * u(5,i,j,k,c) - - njac(5,2,i,j) = ( c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c) - njac(5,3,i,j) = ( con43 * c3c4 - > - c1345 ) * tmp2 * u(3,i,j,k,c) - njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c) - njac(5,5,i,j) = ( c1345 ) * tmp1 - - enddo - enddo - -c--------------------------------------------------------------------- -c now joacobians set, so form left hand side in y direction -c--------------------------------------------------------------------- - do j = start(2,c), jsize-end(2,c) - do i=start(1,c),isize - - tmp1 = dt * ty1 - tmp2 = dt * ty2 - - lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i,j-1) - > - tmp1 * njac(1,1,i,j-1) - > - tmp1 * dy1 - lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i,j-1) - > - tmp1 * njac(1,2,i,j-1) - lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i,j-1) - > - tmp1 * njac(1,3,i,j-1) - lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i,j-1) - > - tmp1 * njac(1,4,i,j-1) - lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i,j-1) - > - tmp1 * njac(1,5,i,j-1) - - lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i,j-1) - > - tmp1 * njac(2,1,i,j-1) - lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i,j-1) - > - tmp1 * njac(2,2,i,j-1) - > - tmp1 * dy2 - lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i,j-1) - > - tmp1 * njac(2,3,i,j-1) - lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i,j-1) - > - tmp1 * njac(2,4,i,j-1) - lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i,j-1) - > - tmp1 * njac(2,5,i,j-1) - - lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i,j-1) - > - tmp1 * njac(3,1,i,j-1) - lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i,j-1) - > - tmp1 * njac(3,2,i,j-1) - lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i,j-1) - > - tmp1 * njac(3,3,i,j-1) - > - tmp1 * dy3 - lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i,j-1) - > - tmp1 * njac(3,4,i,j-1) - lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i,j-1) - > - tmp1 * njac(3,5,i,j-1) - - lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i,j-1) - > - tmp1 * njac(4,1,i,j-1) - lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i,j-1) - > - tmp1 * njac(4,2,i,j-1) - lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i,j-1) - > - tmp1 * njac(4,3,i,j-1) - lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i,j-1) - > - tmp1 * njac(4,4,i,j-1) - > - tmp1 * dy4 - lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i,j-1) - > - tmp1 * njac(4,5,i,j-1) - - lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i,j-1) - > - tmp1 * njac(5,1,i,j-1) - lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i,j-1) - > - tmp1 * njac(5,2,i,j-1) - lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i,j-1) - > - tmp1 * njac(5,3,i,j-1) - lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i,j-1) - > - tmp1 * njac(5,4,i,j-1) - lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i,j-1) - > - tmp1 * njac(5,5,i,j-1) - > - tmp1 * dy5 - - lhsb(1,1,i,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(1,1,i,j) - > + tmp1 * 2.0d+00 * dy1 - lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j) - lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j) - lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j) - lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j) - - lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j) - lhsb(2,2,i,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(2,2,i,j) - > + tmp1 * 2.0d+00 * dy2 - lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j) - lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j) - lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j) - - lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j) - lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j) - lhsb(3,3,i,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(3,3,i,j) - > + tmp1 * 2.0d+00 * dy3 - lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j) - lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j) - - lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j) - lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j) - lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j) - lhsb(4,4,i,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(4,4,i,j) - > + tmp1 * 2.0d+00 * dy4 - lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j) - - lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j) - lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j) - lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j) - lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j) - lhsb(5,5,i,j) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(5,5,i,j) - > + tmp1 * 2.0d+00 * dy5 - - lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i,j+1) - > - tmp1 * njac(1,1,i,j+1) - > - tmp1 * dy1 - lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i,j+1) - > - tmp1 * njac(1,2,i,j+1) - lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i,j+1) - > - tmp1 * njac(1,3,i,j+1) - lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i,j+1) - > - tmp1 * njac(1,4,i,j+1) - lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i,j+1) - > - tmp1 * njac(1,5,i,j+1) - - lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i,j+1) - > - tmp1 * njac(2,1,i,j+1) - lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i,j+1) - > - tmp1 * njac(2,2,i,j+1) - > - tmp1 * dy2 - lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i,j+1) - > - tmp1 * njac(2,3,i,j+1) - lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i,j+1) - > - tmp1 * njac(2,4,i,j+1) - lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i,j+1) - > - tmp1 * njac(2,5,i,j+1) - - lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i,j+1) - > - tmp1 * njac(3,1,i,j+1) - lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i,j+1) - > - tmp1 * njac(3,2,i,j+1) - lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i,j+1) - > - tmp1 * njac(3,3,i,j+1) - > - tmp1 * dy3 - lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i,j+1) - > - tmp1 * njac(3,4,i,j+1) - lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i,j+1) - > - tmp1 * njac(3,5,i,j+1) - - lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i,j+1) - > - tmp1 * njac(4,1,i,j+1) - lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i,j+1) - > - tmp1 * njac(4,2,i,j+1) - lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i,j+1) - > - tmp1 * njac(4,3,i,j+1) - lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i,j+1) - > - tmp1 * njac(4,4,i,j+1) - > - tmp1 * dy4 - lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i,j+1) - > - tmp1 * njac(4,5,i,j+1) - - lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i,j+1) - > - tmp1 * njac(5,1,i,j+1) - lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i,j+1) - > - tmp1 * njac(5,2,i,j+1) - lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i,j+1) - > - tmp1 * njac(5,3,i,j+1) - lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i,j+1) - > - tmp1 * njac(5,4,i,j+1) - lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i,j+1) - > - tmp1 * njac(5,5,i,j+1) - > - tmp1 * dy5 - - enddo - enddo - - -c--------------------------------------------------------------------- -c outer most do loops - sweeping in i direction -c--------------------------------------------------------------------- - if (first .eq. 1) then - -c--------------------------------------------------------------------- -c multiply c(i,jstart,k) by b_inverse and copy back to c -c multiply rhs(jstart) by b_inverse(jstart) and copy to rhs -c--------------------------------------------------------------------- -!dir$ ivdep - do i=start(1,c),isize - call binvcrhs( lhsb(1,1,i,jstart), - > lhsc(1,1,i,jstart,k,c), - > rhs(1,i,jstart,k,c) ) - enddo - - endif - -c--------------------------------------------------------------------- -c begin inner most do loop -c do all the elements of the cell unless last -c--------------------------------------------------------------------- - do j=jstart+first,jsize-last -!dir$ ivdep - do i=start(1,c),isize - -c--------------------------------------------------------------------- -c subtract A*lhs_vector(j-1) from lhs_vector(j) -c -c rhs(j) = rhs(j) - A*rhs(j-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,i,j), - > rhs(1,i,j-1,k,c),rhs(1,i,j,k,c)) - -c--------------------------------------------------------------------- -c B(j) = B(j) - C(j-1)*A(j) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,i,j), - > lhsc(1,1,i,j-1,k,c), - > lhsb(1,1,i,j)) - -c--------------------------------------------------------------------- -c multiply c(i,j,k) by b_inverse and copy back to c -c multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs -c--------------------------------------------------------------------- - call binvcrhs( lhsb(1,1,i,j), - > lhsc(1,1,i,j,k,c), - > rhs(1,i,j,k,c) ) - - enddo - enddo - -c--------------------------------------------------------------------- -c Now finish up special cases for last cell -c--------------------------------------------------------------------- - if (last .eq. 1) then - -!dir$ ivdep - do i=start(1,c),isize -c--------------------------------------------------------------------- -c rhs(jsize) = rhs(jsize) - A*rhs(jsize-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,i,jsize), - > rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c)) - -c--------------------------------------------------------------------- -c B(jsize) = B(jsize) - C(jsize-1)*A(jsize) -c call matmul_sub(aa,i,jsize,k,c, -c $ cc,i,jsize-1,k,c,bb,i,jsize,k,c) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,i,jsize), - > lhsc(1,1,i,jsize-1,k,c), - > lhsb(1,1,i,jsize)) - -c--------------------------------------------------------------------- -c multiply rhs(jsize) by b_inverse(jsize) and copy to rhs -c--------------------------------------------------------------------- - call binvrhs( lhsb(1,1,i,jsize), - > rhs(1,i,jsize,k,c) ) - enddo - - endif - enddo - - - return - end - - - diff --git a/examples/smpi/NAS/BT/z_solve.f b/examples/smpi/NAS/BT/z_solve.f deleted file mode 100644 index d7a5a2f1ec..0000000000 --- a/examples/smpi/NAS/BT/z_solve.f +++ /dev/null @@ -1,776 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_solve - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Performs line solves in Z direction by first factoring -c the block-tridiagonal matrix into an upper triangular matrix, -c and then performing back substitution to solve for the unknow -c vectors of each line. -c -c Make sure we treat elements zero to cell_size in the direction -c of the sweep. -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer c, kstart, stage, - > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), - > isize,jsize,ksize,send_id - - kstart = 0 - -c--------------------------------------------------------------------- -c in our terminology stage is the number of the cell in the y-direction -c i.e. stage = 1 means the start of the line stage=ncells means end -c--------------------------------------------------------------------- - do stage = 1,ncells - c = slice(3,stage) - isize = cell_size(1,c) - 1 - jsize = cell_size(2,c) - 1 - ksize = cell_size(3,c) - 1 -c--------------------------------------------------------------------- -c set last-cell flag -c--------------------------------------------------------------------- - if (stage .eq. ncells) then - last = 1 - else - last = 0 - endif - - if (stage .eq. 1) then -c--------------------------------------------------------------------- -c This is the first cell, so solve without receiving data -c--------------------------------------------------------------------- - first = 1 -c call lhsz(c) - call z_solve_cell(first,last,c) - else -c--------------------------------------------------------------------- -c Not the first cell of this line, so receive info from -c processor working on preceeding cell -c--------------------------------------------------------------------- - first = 0 - call z_receive_solve_info(recv_id,c) -c--------------------------------------------------------------------- -c overlap computations and communications -c--------------------------------------------------------------------- -c call lhsz(c) -c--------------------------------------------------------------------- -c wait for completion -c--------------------------------------------------------------------- - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) -c--------------------------------------------------------------------- -c install C'(kstart+1) and rhs'(kstart+1) to be used in this cell -c--------------------------------------------------------------------- - call z_unpack_solve_info(c) - call z_solve_cell(first,last,c) - endif - - if (last .eq. 0) call z_send_solve_info(send_id,c) - enddo - -c--------------------------------------------------------------------- -c now perform backsubstitution in reverse direction -c--------------------------------------------------------------------- - do stage = ncells, 1, -1 - c = slice(3,stage) - first = 0 - last = 0 - if (stage .eq. 1) first = 1 - if (stage .eq. ncells) then - last = 1 -c--------------------------------------------------------------------- -c last cell, so perform back substitute without waiting -c--------------------------------------------------------------------- - call z_backsubstitute(first, last,c) - else - call z_receive_backsub_info(recv_id,c) - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) - call z_unpack_backsub_info(c) - call z_backsubstitute(first,last,c) - endif - if (first .eq. 0) call z_send_backsub_info(send_id,c) - enddo - - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_unpack_solve_info(c) -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack C'(-1) and rhs'(-1) for -c all i and j -c--------------------------------------------------------------------- - - include 'header.h' - - integer i,j,m,n,ptr,c,kstart - - kstart = 0 - ptr = 0 - do j=0,JMAX-1 - do i=0,IMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_send_solve_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send C'(kend) and rhs'(kend) for -c all i and j -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i,j,m,n,ksize,ptr,c,ip,jp - integer error,send_id,buffer_size - - ksize = cell_size(3,c)-1 - ip = cell_coord(1,c) - 1 - jp = cell_coord(2,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - -c--------------------------------------------------------------------- -c pack up buffer -c--------------------------------------------------------------------- - ptr = 0 - do j=0,JMAX-1 - do i=0,IMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,i,j,ksize,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - -c--------------------------------------------------------------------- -c send buffer -c--------------------------------------------------------------------- - call mpi_isend(in_buffer, buffer_size, - > dp_type, successor(3), - > BOTTOM+ip+jp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_send_backsub_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send U(jstart) for all i and j -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i,j,n,ptr,c,kstart,ip,jp - integer error,send_id,buffer_size - -c--------------------------------------------------------------------- -c Send element 0 to previous processor -c--------------------------------------------------------------------- - kstart = 0 - ip = cell_coord(1,c)-1 - jp = cell_coord(2,c)-1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - ptr = 0 - do j=0,JMAX-1 - do i=0,IMAX-1 - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,i,j,kstart,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - call mpi_isend(in_buffer, buffer_size, - > dp_type, predecessor(3), - > TOP+ip+jp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_unpack_backsub_info(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack U(ksize) for all i and j -c--------------------------------------------------------------------- - - include 'header.h' - - integer i,j,n,ptr,c - - ptr = 0 - do j=0,JMAX-1 - do i=0,IMAX-1 - do n=1,BLOCK_SIZE - backsub_info(n,i,j,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_receive_backsub_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer error,recv_id,ip,jp,c,buffer_size - ip = cell_coord(1,c) - 1 - jp = cell_coord(2,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - call mpi_irecv(out_buffer, buffer_size, - > dp_type, successor(3), - > TOP+ip+jp*NCELLS, comm_solve, - > recv_id, error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_receive_solve_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer ip,jp,recv_id,error,c,buffer_size - ip = cell_coord(1,c) - 1 - jp = cell_coord(2,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - call mpi_irecv(out_buffer, buffer_size, - > dp_type, predecessor(3), - > BOTTOM+ip+jp*NCELLS, comm_solve, - > recv_id, error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_backsubstitute(first, last, c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c back solve: if last cell, then generate U(ksize)=rhs(ksize) -c else assume U(ksize) is loaded in un pack backsub_info -c so just use it -c after call u(kstart) will be sent to next cell -c--------------------------------------------------------------------- - - include 'header.h' - - integer first, last, c, i, k - integer m,n,j,jsize,isize,ksize,kstart - - kstart = 0 - isize = cell_size(1,c)-end(1,c)-1 - jsize = cell_size(2,c)-end(2,c)-1 - ksize = cell_size(3,c)-1 - if (last .eq. 0) then - do j=start(2,c),jsize - do i=start(1,c),isize -c--------------------------------------------------------------------- -c U(jsize) uses info from previous cell if not last cell -c--------------------------------------------------------------------- - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c) - > - lhsc(m,n,i,j,ksize,c)* - > backsub_info(n,i,j,c) - enddo - enddo - enddo - enddo - endif - do k=ksize-1,kstart,-1 - do j=start(2,c),jsize - do i=start(1,c),isize - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - > - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c) - enddo - enddo - enddo - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_solve_cell(first,last,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c performs guaussian elimination on this cell. -c -c assumes that unpacking routines for non-first cells -c preload C' and rhs' from previous cell. -c -c assumed send happens outside this routine, but that -c c'(KMAX) and rhs'(KMAX) will be sent to next cell. -c--------------------------------------------------------------------- - - include 'header.h' - include 'work_lhs.h' - - integer first,last,c - integer i,j,k,isize,ksize,jsize,kstart - double precision utmp(6,-2:KMAX+1) - - kstart = 0 - isize = cell_size(1,c)-end(1,c)-1 - jsize = cell_size(2,c)-end(2,c)-1 - ksize = cell_size(3,c)-1 - - call lhsabinit(lhsa, lhsb, ksize) - - do j=start(2,c),jsize - do i=start(1,c),isize - -c--------------------------------------------------------------------- -c This function computes the left hand side for the three z-factors -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Compute the indices for storing the block-diagonal matrix; -c determine c (labeled f) and s jacobians for cell c -c--------------------------------------------------------------------- - do k = start(3,c)-1, cell_size(3,c)-end(3,c) - utmp(1,k) = 1.0d0 / u(1,i,j,k,c) - utmp(2,k) = u(2,i,j,k,c) - utmp(3,k) = u(3,i,j,k,c) - utmp(4,k) = u(4,i,j,k,c) - utmp(5,k) = u(5,i,j,k,c) - utmp(6,k) = qs(i,j,k,c) - end do - - do k = start(3,c)-1, cell_size(3,c)-end(3,c) - - tmp1 = utmp(1,k) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - fjac(1,1,k) = 0.0d+00 - fjac(1,2,k) = 0.0d+00 - fjac(1,3,k) = 0.0d+00 - fjac(1,4,k) = 1.0d+00 - fjac(1,5,k) = 0.0d+00 - - fjac(2,1,k) = - ( utmp(2,k)*utmp(4,k) ) - > * tmp2 - fjac(2,2,k) = utmp(4,k) * tmp1 - fjac(2,3,k) = 0.0d+00 - fjac(2,4,k) = utmp(2,k) * tmp1 - fjac(2,5,k) = 0.0d+00 - - fjac(3,1,k) = - ( utmp(3,k)*utmp(4,k) ) - > * tmp2 - fjac(3,2,k) = 0.0d+00 - fjac(3,3,k) = utmp(4,k) * tmp1 - fjac(3,4,k) = utmp(3,k) * tmp1 - fjac(3,5,k) = 0.0d+00 - - fjac(4,1,k) = - (utmp(4,k)*utmp(4,k) * tmp2 ) - > + c2 * utmp(6,k) - fjac(4,2,k) = - c2 * utmp(2,k) * tmp1 - fjac(4,3,k) = - c2 * utmp(3,k) * tmp1 - fjac(4,4,k) = ( 2.0d+00 - c2 ) - > * utmp(4,k) * tmp1 - fjac(4,5,k) = c2 - - fjac(5,1,k) = ( c2 * 2.0d0 * utmp(6,k) - > - c1 * ( utmp(5,k) * tmp1 ) ) - > * ( utmp(4,k) * tmp1 ) - fjac(5,2,k) = - c2 * ( utmp(2,k)*utmp(4,k) ) - > * tmp2 - fjac(5,3,k) = - c2 * ( utmp(3,k)*utmp(4,k) ) - > * tmp2 - fjac(5,4,k) = c1 * ( utmp(5,k) * tmp1 ) - > - c2 * ( utmp(6,k) - > + utmp(4,k)*utmp(4,k) * tmp2 ) - fjac(5,5,k) = c1 * utmp(4,k) * tmp1 - - njac(1,1,k) = 0.0d+00 - njac(1,2,k) = 0.0d+00 - njac(1,3,k) = 0.0d+00 - njac(1,4,k) = 0.0d+00 - njac(1,5,k) = 0.0d+00 - - njac(2,1,k) = - c3c4 * tmp2 * utmp(2,k) - njac(2,2,k) = c3c4 * tmp1 - njac(2,3,k) = 0.0d+00 - njac(2,4,k) = 0.0d+00 - njac(2,5,k) = 0.0d+00 - - njac(3,1,k) = - c3c4 * tmp2 * utmp(3,k) - njac(3,2,k) = 0.0d+00 - njac(3,3,k) = c3c4 * tmp1 - njac(3,4,k) = 0.0d+00 - njac(3,5,k) = 0.0d+00 - - njac(4,1,k) = - con43 * c3c4 * tmp2 * utmp(4,k) - njac(4,2,k) = 0.0d+00 - njac(4,3,k) = 0.0d+00 - njac(4,4,k) = con43 * c3 * c4 * tmp1 - njac(4,5,k) = 0.0d+00 - - njac(5,1,k) = - ( c3c4 - > - c1345 ) * tmp3 * (utmp(2,k)**2) - > - ( c3c4 - c1345 ) * tmp3 * (utmp(3,k)**2) - > - ( con43 * c3c4 - > - c1345 ) * tmp3 * (utmp(4,k)**2) - > - c1345 * tmp2 * utmp(5,k) - - njac(5,2,k) = ( c3c4 - c1345 ) * tmp2 * utmp(2,k) - njac(5,3,k) = ( c3c4 - c1345 ) * tmp2 * utmp(3,k) - njac(5,4,k) = ( con43 * c3c4 - > - c1345 ) * tmp2 * utmp(4,k) - njac(5,5,k) = ( c1345 )* tmp1 - - - enddo - -c--------------------------------------------------------------------- -c now joacobians set, so form left hand side in z direction -c--------------------------------------------------------------------- - do k = start(3,c), ksize-end(3,c) - - tmp1 = dt * tz1 - tmp2 = dt * tz2 - - lhsa(1,1,k) = - tmp2 * fjac(1,1,k-1) - > - tmp1 * njac(1,1,k-1) - > - tmp1 * dz1 - lhsa(1,2,k) = - tmp2 * fjac(1,2,k-1) - > - tmp1 * njac(1,2,k-1) - lhsa(1,3,k) = - tmp2 * fjac(1,3,k-1) - > - tmp1 * njac(1,3,k-1) - lhsa(1,4,k) = - tmp2 * fjac(1,4,k-1) - > - tmp1 * njac(1,4,k-1) - lhsa(1,5,k) = - tmp2 * fjac(1,5,k-1) - > - tmp1 * njac(1,5,k-1) - - lhsa(2,1,k) = - tmp2 * fjac(2,1,k-1) - > - tmp1 * njac(2,1,k-1) - lhsa(2,2,k) = - tmp2 * fjac(2,2,k-1) - > - tmp1 * njac(2,2,k-1) - > - tmp1 * dz2 - lhsa(2,3,k) = - tmp2 * fjac(2,3,k-1) - > - tmp1 * njac(2,3,k-1) - lhsa(2,4,k) = - tmp2 * fjac(2,4,k-1) - > - tmp1 * njac(2,4,k-1) - lhsa(2,5,k) = - tmp2 * fjac(2,5,k-1) - > - tmp1 * njac(2,5,k-1) - - lhsa(3,1,k) = - tmp2 * fjac(3,1,k-1) - > - tmp1 * njac(3,1,k-1) - lhsa(3,2,k) = - tmp2 * fjac(3,2,k-1) - > - tmp1 * njac(3,2,k-1) - lhsa(3,3,k) = - tmp2 * fjac(3,3,k-1) - > - tmp1 * njac(3,3,k-1) - > - tmp1 * dz3 - lhsa(3,4,k) = - tmp2 * fjac(3,4,k-1) - > - tmp1 * njac(3,4,k-1) - lhsa(3,5,k) = - tmp2 * fjac(3,5,k-1) - > - tmp1 * njac(3,5,k-1) - - lhsa(4,1,k) = - tmp2 * fjac(4,1,k-1) - > - tmp1 * njac(4,1,k-1) - lhsa(4,2,k) = - tmp2 * fjac(4,2,k-1) - > - tmp1 * njac(4,2,k-1) - lhsa(4,3,k) = - tmp2 * fjac(4,3,k-1) - > - tmp1 * njac(4,3,k-1) - lhsa(4,4,k) = - tmp2 * fjac(4,4,k-1) - > - tmp1 * njac(4,4,k-1) - > - tmp1 * dz4 - lhsa(4,5,k) = - tmp2 * fjac(4,5,k-1) - > - tmp1 * njac(4,5,k-1) - - lhsa(5,1,k) = - tmp2 * fjac(5,1,k-1) - > - tmp1 * njac(5,1,k-1) - lhsa(5,2,k) = - tmp2 * fjac(5,2,k-1) - > - tmp1 * njac(5,2,k-1) - lhsa(5,3,k) = - tmp2 * fjac(5,3,k-1) - > - tmp1 * njac(5,3,k-1) - lhsa(5,4,k) = - tmp2 * fjac(5,4,k-1) - > - tmp1 * njac(5,4,k-1) - lhsa(5,5,k) = - tmp2 * fjac(5,5,k-1) - > - tmp1 * njac(5,5,k-1) - > - tmp1 * dz5 - - lhsb(1,1,k) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(1,1,k) - > + tmp1 * 2.0d+00 * dz1 - lhsb(1,2,k) = tmp1 * 2.0d+00 * njac(1,2,k) - lhsb(1,3,k) = tmp1 * 2.0d+00 * njac(1,3,k) - lhsb(1,4,k) = tmp1 * 2.0d+00 * njac(1,4,k) - lhsb(1,5,k) = tmp1 * 2.0d+00 * njac(1,5,k) - - lhsb(2,1,k) = tmp1 * 2.0d+00 * njac(2,1,k) - lhsb(2,2,k) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(2,2,k) - > + tmp1 * 2.0d+00 * dz2 - lhsb(2,3,k) = tmp1 * 2.0d+00 * njac(2,3,k) - lhsb(2,4,k) = tmp1 * 2.0d+00 * njac(2,4,k) - lhsb(2,5,k) = tmp1 * 2.0d+00 * njac(2,5,k) - - lhsb(3,1,k) = tmp1 * 2.0d+00 * njac(3,1,k) - lhsb(3,2,k) = tmp1 * 2.0d+00 * njac(3,2,k) - lhsb(3,3,k) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(3,3,k) - > + tmp1 * 2.0d+00 * dz3 - lhsb(3,4,k) = tmp1 * 2.0d+00 * njac(3,4,k) - lhsb(3,5,k) = tmp1 * 2.0d+00 * njac(3,5,k) - - lhsb(4,1,k) = tmp1 * 2.0d+00 * njac(4,1,k) - lhsb(4,2,k) = tmp1 * 2.0d+00 * njac(4,2,k) - lhsb(4,3,k) = tmp1 * 2.0d+00 * njac(4,3,k) - lhsb(4,4,k) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(4,4,k) - > + tmp1 * 2.0d+00 * dz4 - lhsb(4,5,k) = tmp1 * 2.0d+00 * njac(4,5,k) - - lhsb(5,1,k) = tmp1 * 2.0d+00 * njac(5,1,k) - lhsb(5,2,k) = tmp1 * 2.0d+00 * njac(5,2,k) - lhsb(5,3,k) = tmp1 * 2.0d+00 * njac(5,3,k) - lhsb(5,4,k) = tmp1 * 2.0d+00 * njac(5,4,k) - lhsb(5,5,k) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(5,5,k) - > + tmp1 * 2.0d+00 * dz5 - - lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,k+1) - > - tmp1 * njac(1,1,k+1) - > - tmp1 * dz1 - lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,k+1) - > - tmp1 * njac(1,2,k+1) - lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,k+1) - > - tmp1 * njac(1,3,k+1) - lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,k+1) - > - tmp1 * njac(1,4,k+1) - lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,k+1) - > - tmp1 * njac(1,5,k+1) - - lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,k+1) - > - tmp1 * njac(2,1,k+1) - lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,k+1) - > - tmp1 * njac(2,2,k+1) - > - tmp1 * dz2 - lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,k+1) - > - tmp1 * njac(2,3,k+1) - lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,k+1) - > - tmp1 * njac(2,4,k+1) - lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,k+1) - > - tmp1 * njac(2,5,k+1) - - lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,k+1) - > - tmp1 * njac(3,1,k+1) - lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,k+1) - > - tmp1 * njac(3,2,k+1) - lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,k+1) - > - tmp1 * njac(3,3,k+1) - > - tmp1 * dz3 - lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,k+1) - > - tmp1 * njac(3,4,k+1) - lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,k+1) - > - tmp1 * njac(3,5,k+1) - - lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,k+1) - > - tmp1 * njac(4,1,k+1) - lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,k+1) - > - tmp1 * njac(4,2,k+1) - lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,k+1) - > - tmp1 * njac(4,3,k+1) - lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,k+1) - > - tmp1 * njac(4,4,k+1) - > - tmp1 * dz4 - lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,k+1) - > - tmp1 * njac(4,5,k+1) - - lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,k+1) - > - tmp1 * njac(5,1,k+1) - lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,k+1) - > - tmp1 * njac(5,2,k+1) - lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,k+1) - > - tmp1 * njac(5,3,k+1) - lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,k+1) - > - tmp1 * njac(5,4,k+1) - lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,k+1) - > - tmp1 * njac(5,5,k+1) - > - tmp1 * dz5 - - enddo - - -c--------------------------------------------------------------------- -c outer most do loops - sweeping in i direction -c--------------------------------------------------------------------- - if (first .eq. 1) then - -c--------------------------------------------------------------------- -c multiply c(i,j,kstart) by b_inverse and copy back to c -c multiply rhs(kstart) by b_inverse(kstart) and copy to rhs -c--------------------------------------------------------------------- - call binvcrhs( lhsb(1,1,kstart), - > lhsc(1,1,i,j,kstart,c), - > rhs(1,i,j,kstart,c) ) - - endif - -c--------------------------------------------------------------------- -c begin inner most do loop -c do all the elements of the cell unless last -c--------------------------------------------------------------------- - do k=kstart+first,ksize-last - -c--------------------------------------------------------------------- -c subtract A*lhs_vector(k-1) from lhs_vector(k) -c -c rhs(k) = rhs(k) - A*rhs(k-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,k), - > rhs(1,i,j,k-1,c),rhs(1,i,j,k,c)) - -c--------------------------------------------------------------------- -c B(k) = B(k) - C(k-1)*A(k) -c call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,k), - > lhsc(1,1,i,j,k-1,c), - > lhsb(1,1,k)) - -c--------------------------------------------------------------------- -c multiply c(i,j,k) by b_inverse and copy back to c -c multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs -c--------------------------------------------------------------------- - call binvcrhs( lhsb(1,1,k), - > lhsc(1,1,i,j,k,c), - > rhs(1,i,j,k,c) ) - - enddo - -c--------------------------------------------------------------------- -c Now finish up special cases for last cell -c--------------------------------------------------------------------- - if (last .eq. 1) then - -c--------------------------------------------------------------------- -c rhs(ksize) = rhs(ksize) - A*rhs(ksize-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,ksize), - > rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c)) - -c--------------------------------------------------------------------- -c B(ksize) = B(ksize) - C(ksize-1)*A(ksize) -c call matmul_sub(aa,i,j,ksize,c, -c $ cc,i,j,ksize-1,c,bb,i,j,ksize,c) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,ksize), - > lhsc(1,1,i,j,ksize-1,c), - > lhsb(1,1,ksize)) - -c--------------------------------------------------------------------- -c multiply rhs(ksize) by b_inverse(ksize) and copy to rhs -c--------------------------------------------------------------------- - call binvrhs( lhsb(1,1,ksize), - > rhs(1,i,j,ksize,c) ) - - endif - enddo - enddo - - - return - end - - - - - - diff --git a/examples/smpi/NAS/BT/z_solve_vec.f b/examples/smpi/NAS/BT/z_solve_vec.f deleted file mode 100644 index 2c27fb00d1..0000000000 --- a/examples/smpi/NAS/BT/z_solve_vec.f +++ /dev/null @@ -1,793 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_solve - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Performs line solves in Z direction by first factoring -c the block-tridiagonal matrix into an upper triangular matrix, -c and then performing back substitution to solve for the unknow -c vectors of each line. -c -c Make sure we treat elements zero to cell_size in the direction -c of the sweep. -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer c, kstart, stage, - > first, last, recv_id, error, r_status(MPI_STATUS_SIZE), - > isize,jsize,ksize,send_id - - kstart = 0 - -c--------------------------------------------------------------------- -c in our terminology stage is the number of the cell in the y-direct -c i.e. stage = 1 means the start of the line stage=ncells means end -c--------------------------------------------------------------------- - do stage = 1,ncells - c = slice(3,stage) - isize = cell_size(1,c) - 1 - jsize = cell_size(2,c) - 1 - ksize = cell_size(3,c) - 1 -c--------------------------------------------------------------------- -c set last-cell flag -c--------------------------------------------------------------------- - if (stage .eq. ncells) then - last = 1 - else - last = 0 - endif - - if (stage .eq. 1) then -c--------------------------------------------------------------------- -c This is the first cell, so solve without receiving data -c--------------------------------------------------------------------- - first = 1 -c call lhsz(c) - call z_solve_cell(first,last,c) - else -c--------------------------------------------------------------------- -c Not the first cell of this line, so receive info from -c processor working on preceeding cell -c--------------------------------------------------------------------- - first = 0 - call z_receive_solve_info(recv_id,c) -c--------------------------------------------------------------------- -c overlap computations and communications -c--------------------------------------------------------------------- -c call lhsz(c) -c--------------------------------------------------------------------- -c wait for completion -c--------------------------------------------------------------------- - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) -c--------------------------------------------------------------------- -c install C'(kstart+1) and rhs'(kstart+1) to be used in this cell -c--------------------------------------------------------------------- - call z_unpack_solve_info(c) - call z_solve_cell(first,last,c) - endif - - if (last .eq. 0) call z_send_solve_info(send_id,c) - enddo - -c--------------------------------------------------------------------- -c now perform backsubstitution in reverse direction -c--------------------------------------------------------------------- - do stage = ncells, 1, -1 - c = slice(3,stage) - first = 0 - last = 0 - if (stage .eq. 1) first = 1 - if (stage .eq. ncells) then - last = 1 -c--------------------------------------------------------------------- -c last cell, so perform back substitute without waiting -c--------------------------------------------------------------------- - call z_backsubstitute(first, last,c) - else - call z_receive_backsub_info(recv_id,c) - call mpi_wait(send_id,r_status,error) - call mpi_wait(recv_id,r_status,error) - call z_unpack_backsub_info(c) - call z_backsubstitute(first,last,c) - endif - if (first .eq. 0) call z_send_backsub_info(send_id,c) - enddo - - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_unpack_solve_info(c) -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack C'(-1) and rhs'(-1) for -c all i and j -c--------------------------------------------------------------------- - - include 'header.h' - - integer i,j,m,n,ptr,c,kstart - - kstart = 0 - ptr = 0 - do j=0,JMAX-1 - do i=0,IMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - lhsc(m,n,i,j,kstart-1,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - rhs(n,i,j,kstart-1,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_send_solve_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send C'(kend) and rhs'(kend) for -c all i and j -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i,j,m,n,ksize,ptr,c,ip,jp - integer error,send_id,buffer_size - - ksize = cell_size(3,c)-1 - ip = cell_coord(1,c) - 1 - jp = cell_coord(2,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - -c--------------------------------------------------------------------- -c pack up buffer -c--------------------------------------------------------------------- - ptr = 0 - do j=0,JMAX-1 - do i=0,IMAX-1 - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = lhsc(m,n,i,j,ksize,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,i,j,ksize,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - -c--------------------------------------------------------------------- -c send buffer -c--------------------------------------------------------------------- - call mpi_isend(in_buffer, buffer_size, - > dp_type, successor(3), - > BOTTOM+ip+jp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_send_backsub_info(send_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c pack up and send U(jstart) for all i and j -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i,j,n,ptr,c,kstart,ip,jp - integer error,send_id,buffer_size - -c--------------------------------------------------------------------- -c Send element 0 to previous processor -c--------------------------------------------------------------------- - kstart = 0 - ip = cell_coord(1,c)-1 - jp = cell_coord(2,c)-1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - ptr = 0 - do j=0,JMAX-1 - do i=0,IMAX-1 - do n=1,BLOCK_SIZE - in_buffer(ptr+n) = rhs(n,i,j,kstart,c) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - call mpi_isend(in_buffer, buffer_size, - > dp_type, predecessor(3), - > TOP+ip+jp*NCELLS, comm_solve, - > send_id,error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_unpack_backsub_info(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c unpack U(ksize) for all i and j -c--------------------------------------------------------------------- - - include 'header.h' - - integer i,j,n,ptr,c - - ptr = 0 - do j=0,JMAX-1 - do i=0,IMAX-1 - do n=1,BLOCK_SIZE - backsub_info(n,i,j,c) = out_buffer(ptr+n) - enddo - ptr = ptr+BLOCK_SIZE - enddo - enddo - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_receive_backsub_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer error,recv_id,ip,jp,c,buffer_size - ip = cell_coord(1,c) - 1 - jp = cell_coord(2,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE - call mpi_irecv(out_buffer, buffer_size, - > dp_type, successor(3), - > TOP+ip+jp*NCELLS, comm_solve, - > recv_id, error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_receive_solve_info(recv_id,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c post mpi receives -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer ip,jp,recv_id,error,c,buffer_size - ip = cell_coord(1,c) - 1 - jp = cell_coord(2,c) - 1 - buffer_size=MAX_CELL_DIM*MAX_CELL_DIM* - > (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE) - call mpi_irecv(out_buffer, buffer_size, - > dp_type, predecessor(3), - > BOTTOM+ip+jp*NCELLS, comm_solve, - > recv_id, error) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_backsubstitute(first, last, c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c back solve: if last cell, then generate U(ksize)=rhs(ksize) -c else assume U(ksize) is loaded in un pack backsub_info -c so just use it -c after call u(kstart) will be sent to next cell -c--------------------------------------------------------------------- - - include 'header.h' - - integer first, last, c, i, k - integer m,n,j,jsize,isize,ksize,kstart - - kstart = 0 - isize = cell_size(1,c)-end(1,c)-1 - jsize = cell_size(2,c)-end(2,c)-1 - ksize = cell_size(3,c)-1 - if (last .eq. 0) then - do j=start(2,c),jsize - do i=start(1,c),isize -c--------------------------------------------------------------------- -c U(jsize) uses info from previous cell if not last cell -c--------------------------------------------------------------------- - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,i,j,ksize,c) = rhs(m,i,j,ksize,c) - > - lhsc(m,n,i,j,ksize,c)* - > backsub_info(n,i,j,c) - enddo - enddo - enddo - enddo - endif - do k=ksize-1,kstart,-1 - do j=start(2,c),jsize - do i=start(1,c),isize - do m=1,BLOCK_SIZE - do n=1,BLOCK_SIZE - rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - > - lhsc(m,n,i,j,k,c)*rhs(n,i,j,k+1,c) - enddo - enddo - enddo - enddo - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_solve_cell(first,last,c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c performs guaussian elimination on this cell. -c -c assumes that unpacking routines for non-first cells -c preload C' and rhs' from previous cell. -c -c assumed send happens outside this routine, but that -c c'(KMAX) and rhs'(KMAX) will be sent to next cell. -c--------------------------------------------------------------------- - - include 'header.h' - include 'work_lhs_vec.h' - - integer first,last,c - integer i,j,k,m,n,isize,ksize,jsize,kstart - - kstart = 0 - isize = cell_size(1,c)-end(1,c)-1 - jsize = cell_size(2,c)-end(2,c)-1 - ksize = cell_size(3,c)-1 - -c--------------------------------------------------------------------- -c zero the left hand side for starters -c set diagonal values to 1. This is overkill, but convenient -c--------------------------------------------------------------------- - do i = 0, isize - do m = 1, 5 - do n = 1, 5 - lhsa(m,n,i,0) = 0.0d0 - lhsb(m,n,i,0) = 0.0d0 - lhsa(m,n,i,ksize) = 0.0d0 - lhsb(m,n,i,ksize) = 0.0d0 - enddo - lhsb(m,m,i,0) = 1.0d0 - lhsb(m,m,i,ksize) = 1.0d0 - enddo - enddo - - do j=start(2,c),jsize - -c--------------------------------------------------------------------- -c This function computes the left hand side for the three z-factors -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Compute the indices for storing the block-diagonal matrix; -c determine c (labeled f) and s jacobians for cell c -c--------------------------------------------------------------------- - - do k = start(3,c)-1, cell_size(3,c)-end(3,c) - do i=start(1,c),isize - - tmp1 = 1.0d0 / u(1,i,j,k,c) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - fjac(1,1,i,k) = 0.0d+00 - fjac(1,2,i,k) = 0.0d+00 - fjac(1,3,i,k) = 0.0d+00 - fjac(1,4,i,k) = 1.0d+00 - fjac(1,5,i,k) = 0.0d+00 - - fjac(2,1,i,k) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) - > * tmp2 - fjac(2,2,i,k) = u(4,i,j,k,c) * tmp1 - fjac(2,3,i,k) = 0.0d+00 - fjac(2,4,i,k) = u(2,i,j,k,c) * tmp1 - fjac(2,5,i,k) = 0.0d+00 - - fjac(3,1,i,k) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) ) - > * tmp2 - fjac(3,2,i,k) = 0.0d+00 - fjac(3,3,i,k) = u(4,i,j,k,c) * tmp1 - fjac(3,4,i,k) = u(3,i,j,k,c) * tmp1 - fjac(3,5,i,k) = 0.0d+00 - - fjac(4,1,i,k) = - (u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 ) - > + c2 * qs(i,j,k,c) - fjac(4,2,i,k) = - c2 * u(2,i,j,k,c) * tmp1 - fjac(4,3,i,k) = - c2 * u(3,i,j,k,c) * tmp1 - fjac(4,4,i,k) = ( 2.0d+00 - c2 ) - > * u(4,i,j,k,c) * tmp1 - fjac(4,5,i,k) = c2 - - fjac(5,1,i,k) = ( c2 * 2.0d0 * qs(i,j,k,c) - > - c1 * ( u(5,i,j,k,c) * tmp1 ) ) - > * ( u(4,i,j,k,c) * tmp1 ) - fjac(5,2,i,k) = - c2 * ( u(2,i,j,k,c)*u(4,i,j,k,c) ) - > * tmp2 - fjac(5,3,i,k) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) ) - > * tmp2 - fjac(5,4,i,k) = c1 * ( u(5,i,j,k,c) * tmp1 ) - > - c2 * ( qs(i,j,k,c) - > + u(4,i,j,k,c)*u(4,i,j,k,c) * tmp2 ) - fjac(5,5,i,k) = c1 * u(4,i,j,k,c) * tmp1 - - njac(1,1,i,k) = 0.0d+00 - njac(1,2,i,k) = 0.0d+00 - njac(1,3,i,k) = 0.0d+00 - njac(1,4,i,k) = 0.0d+00 - njac(1,5,i,k) = 0.0d+00 - - njac(2,1,i,k) = - c3c4 * tmp2 * u(2,i,j,k,c) - njac(2,2,i,k) = c3c4 * tmp1 - njac(2,3,i,k) = 0.0d+00 - njac(2,4,i,k) = 0.0d+00 - njac(2,5,i,k) = 0.0d+00 - - njac(3,1,i,k) = - c3c4 * tmp2 * u(3,i,j,k,c) - njac(3,2,i,k) = 0.0d+00 - njac(3,3,i,k) = c3c4 * tmp1 - njac(3,4,i,k) = 0.0d+00 - njac(3,5,i,k) = 0.0d+00 - - njac(4,1,i,k) = - con43 * c3c4 * tmp2 * u(4,i,j,k,c) - njac(4,2,i,k) = 0.0d+00 - njac(4,3,i,k) = 0.0d+00 - njac(4,4,i,k) = con43 * c3 * c4 * tmp1 - njac(4,5,i,k) = 0.0d+00 - - njac(5,1,i,k) = - ( c3c4 - > - c1345 ) * tmp3 * (u(2,i,j,k,c)**2) - > - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2) - > - ( con43 * c3c4 - > - c1345 ) * tmp3 * (u(4,i,j,k,c)**2) - > - c1345 * tmp2 * u(5,i,j,k,c) - - njac(5,2,i,k) = ( c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c) - njac(5,3,i,k) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c) - njac(5,4,i,k) = ( con43 * c3c4 - > - c1345 ) * tmp2 * u(4,i,j,k,c) - njac(5,5,i,k) = ( c1345 )* tmp1 - - - enddo - enddo - -c--------------------------------------------------------------------- -c now joacobians set, so form left hand side in z direction -c--------------------------------------------------------------------- - do k = start(3,c), ksize-end(3,c) - do i=start(1,c),isize - - tmp1 = dt * tz1 - tmp2 = dt * tz2 - - lhsa(1,1,i,k) = - tmp2 * fjac(1,1,i,k-1) - > - tmp1 * njac(1,1,i,k-1) - > - tmp1 * dz1 - lhsa(1,2,i,k) = - tmp2 * fjac(1,2,i,k-1) - > - tmp1 * njac(1,2,i,k-1) - lhsa(1,3,i,k) = - tmp2 * fjac(1,3,i,k-1) - > - tmp1 * njac(1,3,i,k-1) - lhsa(1,4,i,k) = - tmp2 * fjac(1,4,i,k-1) - > - tmp1 * njac(1,4,i,k-1) - lhsa(1,5,i,k) = - tmp2 * fjac(1,5,i,k-1) - > - tmp1 * njac(1,5,i,k-1) - - lhsa(2,1,i,k) = - tmp2 * fjac(2,1,i,k-1) - > - tmp1 * njac(2,1,i,k-1) - lhsa(2,2,i,k) = - tmp2 * fjac(2,2,i,k-1) - > - tmp1 * njac(2,2,i,k-1) - > - tmp1 * dz2 - lhsa(2,3,i,k) = - tmp2 * fjac(2,3,i,k-1) - > - tmp1 * njac(2,3,i,k-1) - lhsa(2,4,i,k) = - tmp2 * fjac(2,4,i,k-1) - > - tmp1 * njac(2,4,i,k-1) - lhsa(2,5,i,k) = - tmp2 * fjac(2,5,i,k-1) - > - tmp1 * njac(2,5,i,k-1) - - lhsa(3,1,i,k) = - tmp2 * fjac(3,1,i,k-1) - > - tmp1 * njac(3,1,i,k-1) - lhsa(3,2,i,k) = - tmp2 * fjac(3,2,i,k-1) - > - tmp1 * njac(3,2,i,k-1) - lhsa(3,3,i,k) = - tmp2 * fjac(3,3,i,k-1) - > - tmp1 * njac(3,3,i,k-1) - > - tmp1 * dz3 - lhsa(3,4,i,k) = - tmp2 * fjac(3,4,i,k-1) - > - tmp1 * njac(3,4,i,k-1) - lhsa(3,5,i,k) = - tmp2 * fjac(3,5,i,k-1) - > - tmp1 * njac(3,5,i,k-1) - - lhsa(4,1,i,k) = - tmp2 * fjac(4,1,i,k-1) - > - tmp1 * njac(4,1,i,k-1) - lhsa(4,2,i,k) = - tmp2 * fjac(4,2,i,k-1) - > - tmp1 * njac(4,2,i,k-1) - lhsa(4,3,i,k) = - tmp2 * fjac(4,3,i,k-1) - > - tmp1 * njac(4,3,i,k-1) - lhsa(4,4,i,k) = - tmp2 * fjac(4,4,i,k-1) - > - tmp1 * njac(4,4,i,k-1) - > - tmp1 * dz4 - lhsa(4,5,i,k) = - tmp2 * fjac(4,5,i,k-1) - > - tmp1 * njac(4,5,i,k-1) - - lhsa(5,1,i,k) = - tmp2 * fjac(5,1,i,k-1) - > - tmp1 * njac(5,1,i,k-1) - lhsa(5,2,i,k) = - tmp2 * fjac(5,2,i,k-1) - > - tmp1 * njac(5,2,i,k-1) - lhsa(5,3,i,k) = - tmp2 * fjac(5,3,i,k-1) - > - tmp1 * njac(5,3,i,k-1) - lhsa(5,4,i,k) = - tmp2 * fjac(5,4,i,k-1) - > - tmp1 * njac(5,4,i,k-1) - lhsa(5,5,i,k) = - tmp2 * fjac(5,5,i,k-1) - > - tmp1 * njac(5,5,i,k-1) - > - tmp1 * dz5 - - lhsb(1,1,i,k) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(1,1,i,k) - > + tmp1 * 2.0d+00 * dz1 - lhsb(1,2,i,k) = tmp1 * 2.0d+00 * njac(1,2,i,k) - lhsb(1,3,i,k) = tmp1 * 2.0d+00 * njac(1,3,i,k) - lhsb(1,4,i,k) = tmp1 * 2.0d+00 * njac(1,4,i,k) - lhsb(1,5,i,k) = tmp1 * 2.0d+00 * njac(1,5,i,k) - - lhsb(2,1,i,k) = tmp1 * 2.0d+00 * njac(2,1,i,k) - lhsb(2,2,i,k) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(2,2,i,k) - > + tmp1 * 2.0d+00 * dz2 - lhsb(2,3,i,k) = tmp1 * 2.0d+00 * njac(2,3,i,k) - lhsb(2,4,i,k) = tmp1 * 2.0d+00 * njac(2,4,i,k) - lhsb(2,5,i,k) = tmp1 * 2.0d+00 * njac(2,5,i,k) - - lhsb(3,1,i,k) = tmp1 * 2.0d+00 * njac(3,1,i,k) - lhsb(3,2,i,k) = tmp1 * 2.0d+00 * njac(3,2,i,k) - lhsb(3,3,i,k) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(3,3,i,k) - > + tmp1 * 2.0d+00 * dz3 - lhsb(3,4,i,k) = tmp1 * 2.0d+00 * njac(3,4,i,k) - lhsb(3,5,i,k) = tmp1 * 2.0d+00 * njac(3,5,i,k) - - lhsb(4,1,i,k) = tmp1 * 2.0d+00 * njac(4,1,i,k) - lhsb(4,2,i,k) = tmp1 * 2.0d+00 * njac(4,2,i,k) - lhsb(4,3,i,k) = tmp1 * 2.0d+00 * njac(4,3,i,k) - lhsb(4,4,i,k) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(4,4,i,k) - > + tmp1 * 2.0d+00 * dz4 - lhsb(4,5,i,k) = tmp1 * 2.0d+00 * njac(4,5,i,k) - - lhsb(5,1,i,k) = tmp1 * 2.0d+00 * njac(5,1,i,k) - lhsb(5,2,i,k) = tmp1 * 2.0d+00 * njac(5,2,i,k) - lhsb(5,3,i,k) = tmp1 * 2.0d+00 * njac(5,3,i,k) - lhsb(5,4,i,k) = tmp1 * 2.0d+00 * njac(5,4,i,k) - lhsb(5,5,i,k) = 1.0d+00 - > + tmp1 * 2.0d+00 * njac(5,5,i,k) - > + tmp1 * 2.0d+00 * dz5 - - lhsc(1,1,i,j,k,c) = tmp2 * fjac(1,1,i,k+1) - > - tmp1 * njac(1,1,i,k+1) - > - tmp1 * dz1 - lhsc(1,2,i,j,k,c) = tmp2 * fjac(1,2,i,k+1) - > - tmp1 * njac(1,2,i,k+1) - lhsc(1,3,i,j,k,c) = tmp2 * fjac(1,3,i,k+1) - > - tmp1 * njac(1,3,i,k+1) - lhsc(1,4,i,j,k,c) = tmp2 * fjac(1,4,i,k+1) - > - tmp1 * njac(1,4,i,k+1) - lhsc(1,5,i,j,k,c) = tmp2 * fjac(1,5,i,k+1) - > - tmp1 * njac(1,5,i,k+1) - - lhsc(2,1,i,j,k,c) = tmp2 * fjac(2,1,i,k+1) - > - tmp1 * njac(2,1,i,k+1) - lhsc(2,2,i,j,k,c) = tmp2 * fjac(2,2,i,k+1) - > - tmp1 * njac(2,2,i,k+1) - > - tmp1 * dz2 - lhsc(2,3,i,j,k,c) = tmp2 * fjac(2,3,i,k+1) - > - tmp1 * njac(2,3,i,k+1) - lhsc(2,4,i,j,k,c) = tmp2 * fjac(2,4,i,k+1) - > - tmp1 * njac(2,4,i,k+1) - lhsc(2,5,i,j,k,c) = tmp2 * fjac(2,5,i,k+1) - > - tmp1 * njac(2,5,i,k+1) - - lhsc(3,1,i,j,k,c) = tmp2 * fjac(3,1,i,k+1) - > - tmp1 * njac(3,1,i,k+1) - lhsc(3,2,i,j,k,c) = tmp2 * fjac(3,2,i,k+1) - > - tmp1 * njac(3,2,i,k+1) - lhsc(3,3,i,j,k,c) = tmp2 * fjac(3,3,i,k+1) - > - tmp1 * njac(3,3,i,k+1) - > - tmp1 * dz3 - lhsc(3,4,i,j,k,c) = tmp2 * fjac(3,4,i,k+1) - > - tmp1 * njac(3,4,i,k+1) - lhsc(3,5,i,j,k,c) = tmp2 * fjac(3,5,i,k+1) - > - tmp1 * njac(3,5,i,k+1) - - lhsc(4,1,i,j,k,c) = tmp2 * fjac(4,1,i,k+1) - > - tmp1 * njac(4,1,i,k+1) - lhsc(4,2,i,j,k,c) = tmp2 * fjac(4,2,i,k+1) - > - tmp1 * njac(4,2,i,k+1) - lhsc(4,3,i,j,k,c) = tmp2 * fjac(4,3,i,k+1) - > - tmp1 * njac(4,3,i,k+1) - lhsc(4,4,i,j,k,c) = tmp2 * fjac(4,4,i,k+1) - > - tmp1 * njac(4,4,i,k+1) - > - tmp1 * dz4 - lhsc(4,5,i,j,k,c) = tmp2 * fjac(4,5,i,k+1) - > - tmp1 * njac(4,5,i,k+1) - - lhsc(5,1,i,j,k,c) = tmp2 * fjac(5,1,i,k+1) - > - tmp1 * njac(5,1,i,k+1) - lhsc(5,2,i,j,k,c) = tmp2 * fjac(5,2,i,k+1) - > - tmp1 * njac(5,2,i,k+1) - lhsc(5,3,i,j,k,c) = tmp2 * fjac(5,3,i,k+1) - > - tmp1 * njac(5,3,i,k+1) - lhsc(5,4,i,j,k,c) = tmp2 * fjac(5,4,i,k+1) - > - tmp1 * njac(5,4,i,k+1) - lhsc(5,5,i,j,k,c) = tmp2 * fjac(5,5,i,k+1) - > - tmp1 * njac(5,5,i,k+1) - > - tmp1 * dz5 - - enddo - enddo - - -c--------------------------------------------------------------------- -c outer most do loops - sweeping in i direction -c--------------------------------------------------------------------- - if (first .eq. 1) then - -c--------------------------------------------------------------------- -c multiply c(i,j,kstart) by b_inverse and copy back to c -c multiply rhs(kstart) by b_inverse(kstart) and copy to rhs -c--------------------------------------------------------------------- -!dir$ ivdep - do i=start(1,c),isize - call binvcrhs( lhsb(1,1,i,kstart), - > lhsc(1,1,i,j,kstart,c), - > rhs(1,i,j,kstart,c) ) - enddo - - endif - -c--------------------------------------------------------------------- -c begin inner most do loop -c do all the elements of the cell unless last -c--------------------------------------------------------------------- - do k=kstart+first,ksize-last -!dir$ ivdep - do i=start(1,c),isize - -c--------------------------------------------------------------------- -c subtract A*lhs_vector(k-1) from lhs_vector(k) -c -c rhs(k) = rhs(k) - A*rhs(k-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,i,k), - > rhs(1,i,j,k-1,c),rhs(1,i,j,k,c)) - -c--------------------------------------------------------------------- -c B(k) = B(k) - C(k-1)*A(k) -c call matmul_sub(aa,i,j,k,c,cc,i,j,k-1,c,bb,i,j,k,c) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,i,k), - > lhsc(1,1,i,j,k-1,c), - > lhsb(1,1,i,k)) - -c--------------------------------------------------------------------- -c multiply c(i,j,k) by b_inverse and copy back to c -c multiply rhs(i,j,1) by b_inverse(i,j,1) and copy to rhs -c--------------------------------------------------------------------- - call binvcrhs( lhsb(1,1,i,k), - > lhsc(1,1,i,j,k,c), - > rhs(1,i,j,k,c) ) - - enddo - enddo - -c--------------------------------------------------------------------- -c Now finish up special cases for last cell -c--------------------------------------------------------------------- - if (last .eq. 1) then - -!dir$ ivdep - do i=start(1,c),isize -c--------------------------------------------------------------------- -c rhs(ksize) = rhs(ksize) - A*rhs(ksize-1) -c--------------------------------------------------------------------- - call matvec_sub(lhsa(1,1,i,ksize), - > rhs(1,i,j,ksize-1,c),rhs(1,i,j,ksize,c)) - -c--------------------------------------------------------------------- -c B(ksize) = B(ksize) - C(ksize-1)*A(ksize) -c call matmul_sub(aa,i,j,ksize,c, -c $ cc,i,j,ksize-1,c,bb,i,j,ksize,c) -c--------------------------------------------------------------------- - call matmul_sub(lhsa(1,1,i,ksize), - > lhsc(1,1,i,j,ksize-1,c), - > lhsb(1,1,i,ksize)) - -c--------------------------------------------------------------------- -c multiply rhs(ksize) by b_inverse(ksize) and copy to rhs -c--------------------------------------------------------------------- - call binvrhs( lhsb(1,1,i,ksize), - > rhs(1,i,j,ksize,c) ) - enddo - - endif - enddo - - - return - end - - - - - - diff --git a/examples/smpi/NAS/CG/Makefile b/examples/smpi/NAS/CG/Makefile deleted file mode 100644 index 33e52c697b..0000000000 --- a/examples/smpi/NAS/CG/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -SHELL=/bin/sh -BENCHMARK=cg -BENCHMARKU=CG - -include ../config/make.def - -OBJS = cg.o ${COMMON}/print_results.o \ - ${COMMON}/${RAND}.o ${COMMON}/timers.o - -include ../sys/make.common - -${PROGRAM}: config ${OBJS} - ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} - -cg.o: cg.f mpinpb.h npbparams.h - ${FCOMPILE} cg.f - -clean: - - rm -f *.o *~ - - rm -f npbparams.h core - - - diff --git a/examples/smpi/NAS/CG/cg.f b/examples/smpi/NAS/CG/cg.f deleted file mode 100644 index 0d425d78a4..0000000000 --- a/examples/smpi/NAS/CG/cg.f +++ /dev/null @@ -1,1787 +0,0 @@ -!-------------------------------------------------------------------------! -! ! -! N A S P A R A L L E L B E N C H M A R K S 3.3 ! -! ! -! C G ! -! ! -!-------------------------------------------------------------------------! -! ! -! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! -! It is described in NAS Technical Reports 95-020 and 02-007 ! -! ! -! Permission to use, copy, distribute and modify this software ! -! for any purpose with or without fee is hereby granted. We ! -! request, however, that all derived work reference the NAS ! -! Parallel Benchmarks 3.3. This software is provided "as is" ! -! without express or implied warranty. ! -! ! -! Information on NPB 3.3, including the technical report, the ! -! original specifications, source code, results and information ! -! on how to submit new results, is available at: ! -! ! -! http://www.nas.nasa.gov/Software/NPB/ ! -! ! -! Send comments or suggestions to npb@nas.nasa.gov ! -! ! -! NAS Parallel Benchmarks Group ! -! NASA Ames Research Center ! -! Mail Stop: T27A-1 ! -! Moffett Field, CA 94035-1000 ! -! ! -! E-mail: npb@nas.nasa.gov ! -! Fax: (650) 604-3957 ! -! ! -!-------------------------------------------------------------------------! - - -c--------------------------------------------------------------------- -c -c Authors: M. Yarrow -c C. Kuszmaul -c R. F. Van der Wijngaart -c H. Jin -c -c--------------------------------------------------------------------- - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - program cg -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - - implicit none - - include 'mpinpb.h' - integer status(MPI_STATUS_SIZE), request, ierr - - include 'npbparams.h' - -c--------------------------------------------------------------------- -c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows. -c num_proc_cols and num_proc_cols are to be found in npbparams.h. -c When num_procs is not square, then num_proc_cols must be = 2*num_proc_rows. -c--------------------------------------------------------------------- - integer num_procs - parameter( num_procs = num_proc_cols * num_proc_rows ) - - - -c--------------------------------------------------------------------- -c Class specific parameters: -c It appears here for reference only. -c These are their values, however, this info is imported in the npbparams.h -c include file, which is written by the sys/setparams.c program. -c--------------------------------------------------------------------- - -C---------- -C Class S: -C---------- -CC parameter( na=1400, -CC > nonzer=7, -CC > shift=10., -CC > niter=15, -CC > rcond=1.0d-1 ) -C---------- -C Class W: -C---------- -CC parameter( na=7000, -CC > nonzer=8, -CC > shift=12., -CC > niter=15, -CC > rcond=1.0d-1 ) -C---------- -C Class A: -C---------- -CC parameter( na=14000, -CC > nonzer=11, -CC > shift=20., -CC > niter=15, -CC > rcond=1.0d-1 ) -C---------- -C Class B: -C---------- -CC parameter( na=75000, -CC > nonzer=13, -CC > shift=60., -CC > niter=75, -CC > rcond=1.0d-1 ) -C---------- -C Class C: -C---------- -CC parameter( na=150000, -CC > nonzer=15, -CC > shift=110., -CC > niter=75, -CC > rcond=1.0d-1 ) -C---------- -C Class D: -C---------- -CC parameter( na=1500000, -CC > nonzer=21, -CC > shift=500., -CC > niter=100, -CC > rcond=1.0d-1 ) -C---------- -C Class E: -C---------- -CC parameter( na=9000000, -CC > nonzer=26, -CC > shift=1500., -CC > niter=100, -CC > rcond=1.0d-1 ) - - - - integer nz - parameter( nz = na*(nonzer+1)/num_procs*(nonzer+1)+nonzer - > + na*(nonzer+2+num_procs/256)/num_proc_cols ) - - - - common / partit_size / naa, nzz, - > npcols, nprows, - > proc_col, proc_row, - > firstrow, - > lastrow, - > firstcol, - > lastcol, - > exch_proc, - > exch_recv_length, - > send_start, - > send_len - integer naa, nzz, - > npcols, nprows, - > proc_col, proc_row, - > firstrow, - > lastrow, - > firstcol, - > lastcol, - > exch_proc, - > exch_recv_length, - > send_start, - > send_len - - - common / main_int_mem / colidx, rowstr, - > iv, arow, acol - integer colidx(nz), rowstr(na+1), - > iv(2*na+1), arow(nz), acol(nz) - - - common / main_flt_mem / v, aelt, a, - > x, - > z, - > p, - > q, - > r, - > w - double precision v(na+1), aelt(nz), a(nz), - > x(na/num_proc_rows+2), - > z(na/num_proc_rows+2), - > p(na/num_proc_rows+2), - > q(na/num_proc_rows+2), - > r(na/num_proc_rows+2), - > w(na/num_proc_rows+2) - - - common /urando/ amult, tran - double precision amult, tran - - - - integer l2npcols - integer reduce_exch_proc(num_proc_cols) - integer reduce_send_starts(num_proc_cols) - integer reduce_send_lengths(num_proc_cols) - integer reduce_recv_starts(num_proc_cols) - integer reduce_recv_lengths(num_proc_cols) - - integer i, j, k, it - - double precision zeta, randlc - external randlc - double precision rnorm - double precision norm_temp1(2), norm_temp2(2) - - double precision t, tmax, mflops - external timer_read - double precision timer_read - character class - logical verified - double precision zeta_verify_value, epsilon, err - - -c--------------------------------------------------------------------- -c Set up mpi initialization and number of proc testing -c--------------------------------------------------------------------- - call initialize_mpi - - - if( na .eq. 1400 .and. - & nonzer .eq. 7 .and. - & niter .eq. 15 .and. - & shift .eq. 10.d0 ) then - class = 'S' - zeta_verify_value = 8.5971775078648d0 - else if( na .eq. 7000 .and. - & nonzer .eq. 8 .and. - & niter .eq. 15 .and. - & shift .eq. 12.d0 ) then - class = 'W' - zeta_verify_value = 10.362595087124d0 - else if( na .eq. 14000 .and. - & nonzer .eq. 11 .and. - & niter .eq. 15 .and. - & shift .eq. 20.d0 ) then - class = 'A' - zeta_verify_value = 17.130235054029d0 - else if( na .eq. 75000 .and. - & nonzer .eq. 13 .and. - & niter .eq. 75 .and. - & shift .eq. 60.d0 ) then - class = 'B' - zeta_verify_value = 22.712745482631d0 - else if( na .eq. 150000 .and. - & nonzer .eq. 15 .and. - & niter .eq. 75 .and. - & shift .eq. 110.d0 ) then - class = 'C' - zeta_verify_value = 28.973605592845d0 - else if( na .eq. 1500000 .and. - & nonzer .eq. 21 .and. - & niter .eq. 100 .and. - & shift .eq. 500.d0 ) then - class = 'D' - zeta_verify_value = 52.514532105794d0 - else if( na .eq. 9000000 .and. - & nonzer .eq. 26 .and. - & niter .eq. 100 .and. - & shift .eq. 1.5d3 ) then - class = 'E' - zeta_verify_value = 77.522164599383d0 - else - class = 'U' - endif - - if( me .eq. root )then - write( *,1000 ) - write( *,1001 ) na - write( *,1002 ) niter - write( *,1003 ) nprocs - write( *,1004 ) nonzer - write( *,1005 ) shift - 1000 format(//,' NAS Parallel Benchmarks 3.3 -- CG Benchmark', /) - 1001 format(' Size: ', i10 ) - 1002 format(' Iterations: ', i5 ) - 1003 format(' Number of active processes: ', i5 ) - 1004 format(' Number of nonzeroes per row: ', i8) - 1005 format(' Eigenvalue shift: ', e8.3) - endif - - if (.not. convertdouble) then - dp_type = MPI_DOUBLE_PRECISION - else - dp_type = MPI_REAL - endif - - - naa = na - nzz = nz - - -c--------------------------------------------------------------------- -c Set up processor info, such as whether sq num of procs, etc -c--------------------------------------------------------------------- - call setup_proc_info( num_procs, - > num_proc_rows, - > num_proc_cols ) - - -c--------------------------------------------------------------------- -c Set up partition's submatrix info: firstcol, lastcol, firstrow, lastrow -c--------------------------------------------------------------------- - call setup_submatrix_info( l2npcols, - > reduce_exch_proc, - > reduce_send_starts, - > reduce_send_lengths, - > reduce_recv_starts, - > reduce_recv_lengths ) - - - -c--------------------------------------------------------------------- -c Inialize random number generator -c--------------------------------------------------------------------- - tran = 314159265.0D0 - amult = 1220703125.0D0 - zeta = randlc( tran, amult ) - -c--------------------------------------------------------------------- -c Set up partition's sparse random matrix for given class size -c--------------------------------------------------------------------- - call makea(naa, nzz, a, colidx, rowstr, nonzer, - > firstrow, lastrow, firstcol, lastcol, - > rcond, arow, acol, aelt, v, iv, shift) - - - -c--------------------------------------------------------------------- -c Note: as a result of the above call to makea: -c values of j used in indexing rowstr go from 1 --> lastrow-firstrow+1 -c values of colidx which are col indexes go from firstcol --> lastcol -c So: -c Shift the col index vals from actual (firstcol --> lastcol ) -c to local, i.e., (1 --> lastcol-firstcol+1) -c--------------------------------------------------------------------- - do j=1,lastrow-firstrow+1 - do k=rowstr(j),rowstr(j+1)-1 - colidx(k) = colidx(k) - firstcol + 1 - enddo - enddo - -c--------------------------------------------------------------------- -c set starting vector to (1, 1, .... 1) -c--------------------------------------------------------------------- - do i = 1, na/num_proc_rows+1 - x(i) = 1.0D0 - enddo - - zeta = 0.0d0 - -c--------------------------------------------------------------------- -c----> -c Do one iteration untimed to init all code and data page tables -c----> (then reinit, start timing, to niter its) -c--------------------------------------------------------------------- - do it = 1, 1 - -c--------------------------------------------------------------------- -c The call to the conjugate gradient routine: -c--------------------------------------------------------------------- - call conj_grad ( colidx, - > rowstr, - > x, - > z, - > a, - > p, - > q, - > r, - > w, - > rnorm, - > l2npcols, - > reduce_exch_proc, - > reduce_send_starts, - > reduce_send_lengths, - > reduce_recv_starts, - > reduce_recv_lengths ) - -c--------------------------------------------------------------------- -c zeta = shift + 1/(x.z) -c So, first: (x.z) -c Also, find norm of z -c So, first: (z.z) -c--------------------------------------------------------------------- - norm_temp1(1) = 0.0d0 - norm_temp1(2) = 0.0d0 - do j=1, lastcol-firstcol+1 - norm_temp1(1) = norm_temp1(1) + x(j)*z(j) - norm_temp1(2) = norm_temp1(2) + z(j)*z(j) - enddo - - do i = 1, l2npcols - call mpi_irecv( norm_temp2, - > 2, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > request, - > ierr ) - call mpi_send( norm_temp1, - > 2, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > ierr ) - call mpi_wait( request, status, ierr ) - - norm_temp1(1) = norm_temp1(1) + norm_temp2(1) - norm_temp1(2) = norm_temp1(2) + norm_temp2(2) - enddo - - norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) ) - - -c--------------------------------------------------------------------- -c Normalize z to obtain x -c--------------------------------------------------------------------- - do j=1, lastcol-firstcol+1 - x(j) = norm_temp1(2)*z(j) - enddo - - - enddo ! end of do one iteration untimed - - -c--------------------------------------------------------------------- -c set starting vector to (1, 1, .... 1) -c--------------------------------------------------------------------- -c -c NOTE: a questionable limit on size: should this be na/num_proc_cols+1 ? -c - do i = 1, na/num_proc_rows+1 - x(i) = 1.0D0 - enddo - - zeta = 0.0d0 - -c--------------------------------------------------------------------- -c Synchronize and start timing -c--------------------------------------------------------------------- - call mpi_barrier( mpi_comm_world, - > ierr ) - - call timer_clear( 1 ) - call timer_start( 1 ) - -c--------------------------------------------------------------------- -c----> -c Main Iteration for inverse power method -c----> -c--------------------------------------------------------------------- - do it = 1, niter - -c--------------------------------------------------------------------- -c The call to the conjugate gradient routine: -c--------------------------------------------------------------------- - call conj_grad ( colidx, - > rowstr, - > x, - > z, - > a, - > p, - > q, - > r, - > w, - > rnorm, - > l2npcols, - > reduce_exch_proc, - > reduce_send_starts, - > reduce_send_lengths, - > reduce_recv_starts, - > reduce_recv_lengths ) - - -c--------------------------------------------------------------------- -c zeta = shift + 1/(x.z) -c So, first: (x.z) -c Also, find norm of z -c So, first: (z.z) -c--------------------------------------------------------------------- - norm_temp1(1) = 0.0d0 - norm_temp1(2) = 0.0d0 - do j=1, lastcol-firstcol+1 - norm_temp1(1) = norm_temp1(1) + x(j)*z(j) - norm_temp1(2) = norm_temp1(2) + z(j)*z(j) - enddo - - do i = 1, l2npcols - call mpi_irecv( norm_temp2, - > 2, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > request, - > ierr ) - call mpi_send( norm_temp1, - > 2, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > ierr ) - call mpi_wait( request, status, ierr ) - - norm_temp1(1) = norm_temp1(1) + norm_temp2(1) - norm_temp1(2) = norm_temp1(2) + norm_temp2(2) - enddo - - norm_temp1(2) = 1.0d0 / sqrt( norm_temp1(2) ) - - - if( me .eq. root )then - zeta = shift + 1.0d0 / norm_temp1(1) - if( it .eq. 1 ) write( *,9000 ) - write( *,9001 ) it, rnorm, zeta - endif - 9000 format( /,' iteration ||r|| zeta' ) - 9001 format( 4x, i5, 7x, e20.14, f20.13 ) - -c--------------------------------------------------------------------- -c Normalize z to obtain x -c--------------------------------------------------------------------- - do j=1, lastcol-firstcol+1 - x(j) = norm_temp1(2)*z(j) - enddo - - - enddo ! end of main iter inv pow meth - - call timer_stop( 1 ) - -c--------------------------------------------------------------------- -c End of timed section -c--------------------------------------------------------------------- - - t = timer_read( 1 ) - - call mpi_reduce( t, - > tmax, - > 1, - > dp_type, - > MPI_MAX, - > root, - > mpi_comm_world, - > ierr ) - - if( me .eq. root )then - write(*,100) - 100 format(' Benchmark completed ') - - epsilon = 1.d-10 - if (class .ne. 'U') then - - err = abs( zeta - zeta_verify_value )/zeta_verify_value - if( err .le. epsilon ) then - verified = .TRUE. - write(*, 200) - write(*, 201) zeta - write(*, 202) err - 200 format(' VERIFICATION SUCCESSFUL ') - 201 format(' Zeta is ', E20.13) - 202 format(' Error is ', E20.13) - else - verified = .FALSE. - write(*, 300) - write(*, 301) zeta - write(*, 302) zeta_verify_value - 300 format(' VERIFICATION FAILED') - 301 format(' Zeta ', E20.13) - 302 format(' The correct zeta is ', E20.13) - endif - else - verified = .FALSE. - write (*, 400) - write (*, 401) - write (*, 201) zeta - 400 format(' Problem size unknown') - 401 format(' NO VERIFICATION PERFORMED') - endif - - - if( tmax .ne. 0. ) then - mflops = float( 2*niter*na ) - & * ( 3.+float( nonzer*(nonzer+1) ) - & + 25.*(5.+float( nonzer*(nonzer+1) )) - & + 3. ) / tmax / 1000000.0 - else - mflops = 0.0 - endif - - call print_results('CG', class, na, 0, 0, - > niter, nnodes_compiled, nprocs, tmax, - > mflops, ' floating point', - > verified, npbversion, compiletime, - > cs1, cs2, cs3, cs4, cs5, cs6, cs7) - - - endif - - - call mpi_finalize(ierr) - - - - end ! end main - - - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - subroutine initialize_mpi -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - - integer ierr - - - call mpi_init( ierr ) - call mpi_comm_rank( mpi_comm_world, me, ierr ) - call mpi_comm_size( mpi_comm_world, nprocs, ierr ) - root = 0 - - - return - end - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - subroutine setup_proc_info( num_procs, - > num_proc_rows, - > num_proc_cols ) -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - - common / partit_size / naa, nzz, - > npcols, nprows, - > proc_col, proc_row, - > firstrow, - > lastrow, - > firstcol, - > lastcol, - > exch_proc, - > exch_recv_length, - > send_start, - > send_len - integer naa, nzz, - > npcols, nprows, - > proc_col, proc_row, - > firstrow, - > lastrow, - > firstcol, - > lastcol, - > exch_proc, - > exch_recv_length, - > send_start, - > send_len - - integer num_procs, num_proc_cols, num_proc_rows - integer i, ierr - integer log2nprocs - -c--------------------------------------------------------------------- -c num_procs must be a power of 2, and num_procs=num_proc_cols*num_proc_rows -c When num_procs is not square, then num_proc_cols = 2*num_proc_rows -c--------------------------------------------------------------------- -c First, number of procs must be power of two. -c--------------------------------------------------------------------- - if( nprocs .ne. num_procs )then - if( me .eq. root ) write( *,9000 ) nprocs, num_procs - 9000 format( /,'Error: ',/,'num of procs allocated (', - > i4, ' )', - > /,'is not equal to',/, - > 'compiled number of procs (', - > i4, ' )',/ ) - call mpi_finalize(ierr) - stop - endif - - - i = num_proc_cols - 100 continue - if( i .ne. 1 .and. i/2*2 .ne. i )then - if ( me .eq. root ) then - write( *,* ) 'Error: num_proc_cols is ', - > num_proc_cols, - > ' which is not a power of two' - endif - call mpi_finalize(ierr) - stop - endif - i = i / 2 - if( i .ne. 0 )then - goto 100 - endif - - i = num_proc_rows - 200 continue - if( i .ne. 1 .and. i/2*2 .ne. i )then - if ( me .eq. root ) then - write( *,* ) 'Error: num_proc_rows is ', - > num_proc_rows, - > ' which is not a power of two' - endif - call mpi_finalize(ierr) - stop - endif - i = i / 2 - if( i .ne. 0 )then - goto 200 - endif - - log2nprocs = 0 - i = nprocs - 300 continue - if( i .ne. 1 .and. i/2*2 .ne. i )then - write( *,* ) 'Error: nprocs is ', - > nprocs, - > ' which is not a power of two' - call mpi_finalize(ierr) - stop - endif - i = i / 2 - if( i .ne. 0 )then - log2nprocs = log2nprocs + 1 - goto 300 - endif - -CC write( *,* ) 'nprocs, log2nprocs: ',nprocs,log2nprocs - - - npcols = num_proc_cols - nprows = num_proc_rows - - - return - end - - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - subroutine setup_submatrix_info( l2npcols, - > reduce_exch_proc, - > reduce_send_starts, - > reduce_send_lengths, - > reduce_recv_starts, - > reduce_recv_lengths ) - > -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - - integer col_size, row_size - - common / partit_size / naa, nzz, - > npcols, nprows, - > proc_col, proc_row, - > firstrow, - > lastrow, - > firstcol, - > lastcol, - > exch_proc, - > exch_recv_length, - > send_start, - > send_len - integer naa, nzz, - > npcols, nprows, - > proc_col, proc_row, - > firstrow, - > lastrow, - > firstcol, - > lastcol, - > exch_proc, - > exch_recv_length, - > send_start, - > send_len - - integer reduce_exch_proc(*) - integer reduce_send_starts(*) - integer reduce_send_lengths(*) - integer reduce_recv_starts(*) - integer reduce_recv_lengths(*) - - integer i, j - integer div_factor - integer l2npcols - - - proc_row = me / npcols - proc_col = me - proc_row*npcols - - - -c--------------------------------------------------------------------- -c If naa evenly divisible by npcols, then it is evenly divisible -c by nprows -c--------------------------------------------------------------------- - - if( naa/npcols*npcols .eq. naa )then - col_size = naa/npcols - firstcol = proc_col*col_size + 1 - lastcol = firstcol - 1 + col_size - row_size = naa/nprows - firstrow = proc_row*row_size + 1 - lastrow = firstrow - 1 + row_size -c--------------------------------------------------------------------- -c If naa not evenly divisible by npcols, then first subdivide for nprows -c and then, if npcols not equal to nprows (i.e., not a sq number of procs), -c get col subdivisions by dividing by 2 each row subdivision. -c--------------------------------------------------------------------- - else - if( proc_row .lt. naa - naa/nprows*nprows)then - row_size = naa/nprows+ 1 - firstrow = proc_row*row_size + 1 - lastrow = firstrow - 1 + row_size - else - row_size = naa/nprows - firstrow = (naa - naa/nprows*nprows)*(row_size+1) - > + (proc_row-(naa-naa/nprows*nprows)) - > *row_size + 1 - lastrow = firstrow - 1 + row_size - endif - if( npcols .eq. nprows )then - if( proc_col .lt. naa - naa/npcols*npcols )then - col_size = naa/npcols+ 1 - firstcol = proc_col*col_size + 1 - lastcol = firstcol - 1 + col_size - else - col_size = naa/npcols - firstcol = (naa - naa/npcols*npcols)*(col_size+1) - > + (proc_col-(naa-naa/npcols*npcols)) - > *col_size + 1 - lastcol = firstcol - 1 + col_size - endif - else - if( (proc_col/2) .lt. - > naa - naa/(npcols/2)*(npcols/2) )then - col_size = naa/(npcols/2) + 1 - firstcol = (proc_col/2)*col_size + 1 - lastcol = firstcol - 1 + col_size - else - col_size = naa/(npcols/2) - firstcol = (naa - naa/(npcols/2)*(npcols/2)) - > *(col_size+1) - > + ((proc_col/2)-(naa-naa/(npcols/2)*(npcols/2))) - > *col_size + 1 - lastcol = firstcol - 1 + col_size - endif -CC write( *,* ) col_size,firstcol,lastcol - if( mod( me,2 ) .eq. 0 )then - lastcol = firstcol - 1 + (col_size-1)/2 + 1 - else - firstcol = firstcol + (col_size-1)/2 + 1 - lastcol = firstcol - 1 + col_size/2 -CC write( *,* ) firstcol,lastcol - endif - endif - endif - - - - if( npcols .eq. nprows )then - send_start = 1 - send_len = lastrow - firstrow + 1 - else - if( mod( me,2 ) .eq. 0 )then - send_start = 1 - send_len = (1 + lastrow-firstrow+1)/2 - else - send_start = (1 + lastrow-firstrow+1)/2 + 1 - send_len = (lastrow-firstrow+1)/2 - endif - endif - - - - -c--------------------------------------------------------------------- -c Transpose exchange processor -c--------------------------------------------------------------------- - - if( npcols .eq. nprows )then - exch_proc = mod( me,nprows )*nprows + me/nprows - else - exch_proc = 2*(mod( me/2,nprows )*nprows + me/2/nprows) - > + mod( me,2 ) - endif - - - - i = npcols / 2 - l2npcols = 0 - do while( i .gt. 0 ) - l2npcols = l2npcols + 1 - i = i / 2 - enddo - - -c--------------------------------------------------------------------- -c Set up the reduce phase schedules... -c--------------------------------------------------------------------- - - div_factor = npcols - do i = 1, l2npcols - - j = mod( proc_col+div_factor/2, div_factor ) - > + proc_col / div_factor * div_factor - reduce_exch_proc(i) = proc_row*npcols + j - - div_factor = div_factor / 2 - - enddo - - - do i = l2npcols, 1, -1 - - if( nprows .eq. npcols )then - reduce_send_starts(i) = send_start - reduce_send_lengths(i) = send_len - reduce_recv_lengths(i) = lastrow - firstrow + 1 - else - reduce_recv_lengths(i) = send_len - if( i .eq. l2npcols )then - reduce_send_lengths(i) = lastrow-firstrow+1 - send_len - if( me/2*2 .eq. me )then - reduce_send_starts(i) = send_start + send_len - else - reduce_send_starts(i) = 1 - endif - else - reduce_send_lengths(i) = send_len - reduce_send_starts(i) = send_start - endif - endif - reduce_recv_starts(i) = send_start - - enddo - - - exch_recv_length = lastcol - firstcol + 1 - - - return - end - - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - subroutine conj_grad ( colidx, - > rowstr, - > x, - > z, - > a, - > p, - > q, - > r, - > w, - > rnorm, - > l2npcols, - > reduce_exch_proc, - > reduce_send_starts, - > reduce_send_lengths, - > reduce_recv_starts, - > reduce_recv_lengths ) -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Floaging point arrays here are named as in NPB1 spec discussion of -c CG algorithm -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - - integer status(MPI_STATUS_SIZE ), request - - - common / partit_size / naa, nzz, - > npcols, nprows, - > proc_col, proc_row, - > firstrow, - > lastrow, - > firstcol, - > lastcol, - > exch_proc, - > exch_recv_length, - > send_start, - > send_len - integer naa, nzz, - > npcols, nprows, - > proc_col, proc_row, - > firstrow, - > lastrow, - > firstcol, - > lastcol, - > exch_proc, - > exch_recv_length, - > send_start, - > send_len - - - - double precision x(*), - > z(*), - > a(nzz) - integer colidx(nzz), rowstr(naa+1) - - double precision p(*), - > q(*), - > r(*), - > w(*) ! used as work temporary - - integer l2npcols - integer reduce_exch_proc(l2npcols) - integer reduce_send_starts(l2npcols) - integer reduce_send_lengths(l2npcols) - integer reduce_recv_starts(l2npcols) - integer reduce_recv_lengths(l2npcols) - - integer i, j, k, ierr - integer cgit, cgitmax - - double precision d, sum, rho, rho0, alpha, beta, rnorm - - external timer_read - double precision timer_read - - data cgitmax / 25 / - - -c--------------------------------------------------------------------- -c Initialize the CG algorithm: -c--------------------------------------------------------------------- - do j=1,naa/nprows+1 - q(j) = 0.0d0 - z(j) = 0.0d0 - r(j) = x(j) - p(j) = r(j) - w(j) = 0.0d0 - enddo - - -c--------------------------------------------------------------------- -c rho = r.r -c Now, obtain the norm of r: First, sum squares of r elements locally... -c--------------------------------------------------------------------- - sum = 0.0d0 - do j=1, lastcol-firstcol+1 - sum = sum + r(j)*r(j) - enddo - -c--------------------------------------------------------------------- -c Exchange and sum with procs identified in reduce_exch_proc -c (This is equivalent to mpi_allreduce.) -c Sum the partial sums of rho, leaving rho on all processors -c--------------------------------------------------------------------- - do i = 1, l2npcols - call mpi_irecv( rho, - > 1, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > request, - > ierr ) - call mpi_send( sum, - > 1, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > ierr ) - call mpi_wait( request, status, ierr ) - - sum = sum + rho - enddo - rho = sum - - - -c--------------------------------------------------------------------- -c----> -c The conj grad iteration loop -c----> -c--------------------------------------------------------------------- - do cgit = 1, cgitmax - - -c--------------------------------------------------------------------- -c q = A.p -c The partition submatrix-vector multiply: use workspace w -c--------------------------------------------------------------------- - do j=1,lastrow-firstrow+1 - sum = 0.d0 - do k=rowstr(j),rowstr(j+1)-1 - sum = sum + a(k)*p(colidx(k)) - enddo - w(j) = sum - enddo - -c--------------------------------------------------------------------- -c Sum the partition submatrix-vec A.p's across rows -c Exchange and sum piece of w with procs identified in reduce_exch_proc -c--------------------------------------------------------------------- - do i = l2npcols, 1, -1 - call mpi_irecv( q(reduce_recv_starts(i)), - > reduce_recv_lengths(i), - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > request, - > ierr ) - call mpi_send( w(reduce_send_starts(i)), - > reduce_send_lengths(i), - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > ierr ) - call mpi_wait( request, status, ierr ) - do j=send_start,send_start + reduce_recv_lengths(i) - 1 - w(j) = w(j) + q(j) - enddo - enddo - - -c--------------------------------------------------------------------- -c Exchange piece of q with transpose processor: -c--------------------------------------------------------------------- - if( l2npcols .ne. 0 )then - call mpi_irecv( q, - > exch_recv_length, - > dp_type, - > exch_proc, - > 1, - > mpi_comm_world, - > request, - > ierr ) - - call mpi_send( w(send_start), - > send_len, - > dp_type, - > exch_proc, - > 1, - > mpi_comm_world, - > ierr ) - call mpi_wait( request, status, ierr ) - else - do j=1,exch_recv_length - q(j) = w(j) - enddo - endif - - -c--------------------------------------------------------------------- -c Clear w for reuse... -c--------------------------------------------------------------------- - do j=1, max( lastrow-firstrow+1, lastcol-firstcol+1 ) - w(j) = 0.0d0 - enddo - - -c--------------------------------------------------------------------- -c Obtain p.q -c--------------------------------------------------------------------- - sum = 0.0d0 - do j=1, lastcol-firstcol+1 - sum = sum + p(j)*q(j) - enddo - -c--------------------------------------------------------------------- -c Obtain d with a sum-reduce -c--------------------------------------------------------------------- - do i = 1, l2npcols - call mpi_irecv( d, - > 1, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > request, - > ierr ) - call mpi_send( sum, - > 1, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > ierr ) - - call mpi_wait( request, status, ierr ) - - sum = sum + d - enddo - d = sum - - -c--------------------------------------------------------------------- -c Obtain alpha = rho / (p.q) -c--------------------------------------------------------------------- - alpha = rho / d - -c--------------------------------------------------------------------- -c Save a temporary of rho -c--------------------------------------------------------------------- - rho0 = rho - -c--------------------------------------------------------------------- -c Obtain z = z + alpha*p -c and r = r - alpha*q -c--------------------------------------------------------------------- - do j=1, lastcol-firstcol+1 - z(j) = z(j) + alpha*p(j) - r(j) = r(j) - alpha*q(j) - enddo - -c--------------------------------------------------------------------- -c rho = r.r -c Now, obtain the norm of r: First, sum squares of r elements locally... -c--------------------------------------------------------------------- - sum = 0.0d0 - do j=1, lastcol-firstcol+1 - sum = sum + r(j)*r(j) - enddo - -c--------------------------------------------------------------------- -c Obtain rho with a sum-reduce -c--------------------------------------------------------------------- - do i = 1, l2npcols - call mpi_irecv( rho, - > 1, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > request, - > ierr ) - call mpi_send( sum, - > 1, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > ierr ) - call mpi_wait( request, status, ierr ) - - sum = sum + rho - enddo - rho = sum - -c--------------------------------------------------------------------- -c Obtain beta: -c--------------------------------------------------------------------- - beta = rho / rho0 - -c--------------------------------------------------------------------- -c p = r + beta*p -c--------------------------------------------------------------------- - do j=1, lastcol-firstcol+1 - p(j) = r(j) + beta*p(j) - enddo - - - - enddo ! end of do cgit=1,cgitmax - - - -c--------------------------------------------------------------------- -c Compute residual norm explicitly: ||r|| = ||x - A.z|| -c First, form A.z -c The partition submatrix-vector multiply -c--------------------------------------------------------------------- - do j=1,lastrow-firstrow+1 - sum = 0.d0 - do k=rowstr(j),rowstr(j+1)-1 - sum = sum + a(k)*z(colidx(k)) - enddo - w(j) = sum - enddo - - - -c--------------------------------------------------------------------- -c Sum the partition submatrix-vec A.z's across rows -c--------------------------------------------------------------------- - do i = l2npcols, 1, -1 - call mpi_irecv( r(reduce_recv_starts(i)), - > reduce_recv_lengths(i), - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > request, - > ierr ) - call mpi_send( w(reduce_send_starts(i)), - > reduce_send_lengths(i), - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > ierr ) - call mpi_wait( request, status, ierr ) - - do j=send_start,send_start + reduce_recv_lengths(i) - 1 - w(j) = w(j) + r(j) - enddo - enddo - - -c--------------------------------------------------------------------- -c Exchange piece of q with transpose processor: -c--------------------------------------------------------------------- - if( l2npcols .ne. 0 )then - call mpi_irecv( r, - > exch_recv_length, - > dp_type, - > exch_proc, - > 1, - > mpi_comm_world, - > request, - > ierr ) - - call mpi_send( w(send_start), - > send_len, - > dp_type, - > exch_proc, - > 1, - > mpi_comm_world, - > ierr ) - call mpi_wait( request, status, ierr ) - else - do j=1,exch_recv_length - r(j) = w(j) - enddo - endif - - -c--------------------------------------------------------------------- -c At this point, r contains A.z -c--------------------------------------------------------------------- - sum = 0.0d0 - do j=1, lastcol-firstcol+1 - d = x(j) - r(j) - sum = sum + d*d - enddo - -c--------------------------------------------------------------------- -c Obtain d with a sum-reduce -c--------------------------------------------------------------------- - do i = 1, l2npcols - call mpi_irecv( d, - > 1, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > request, - > ierr ) - call mpi_send( sum, - > 1, - > dp_type, - > reduce_exch_proc(i), - > i, - > mpi_comm_world, - > ierr ) - call mpi_wait( request, status, ierr ) - - sum = sum + d - enddo - d = sum - - - if( me .eq. root ) rnorm = sqrt( d ) - - - - return - end ! end of routine conj_grad - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - subroutine makea( n, nz, a, colidx, rowstr, nonzer, - > firstrow, lastrow, firstcol, lastcol, - > rcond, arow, acol, aelt, v, iv, shift ) -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - integer n, nz - integer firstrow, lastrow, firstcol, lastcol - integer colidx(nz), rowstr(n+1) - integer iv(2*n+1), arow(nz), acol(nz) - double precision v(n+1), aelt(nz) - double precision rcond, a(nz), shift - -c--------------------------------------------------------------------- -c generate the test problem for benchmark 6 -c makea generates a sparse matrix with a -c prescribed sparsity distribution -c -c parameter type usage -c -c input -c -c n i number of cols/rows of matrix -c nz i nonzeros as declared array size -c rcond r*8 condition number -c shift r*8 main diagonal shift -c -c output -c -c a r*8 array for nonzeros -c colidx i col indices -c rowstr i row pointers -c -c workspace -c -c iv, arow, acol i -c v, aelt r*8 -c--------------------------------------------------------------------- - - integer i, nnza, iouter, ivelt, ivelt1, irow, nzv, NONZER - -c--------------------------------------------------------------------- -c nonzer is approximately (int(sqrt(nnza /n))); -c--------------------------------------------------------------------- - - double precision size, ratio, scale - external sparse, sprnvc, vecset - - size = 1.0D0 - ratio = rcond ** (1.0D0 / dfloat(n)) - nnza = 0 - -c--------------------------------------------------------------------- -c Initialize iv(n+1 .. 2n) to zero. -c Used by sprnvc to mark nonzero positions -c--------------------------------------------------------------------- - - do i = 1, n - iv(n+i) = 0 - enddo - do iouter = 1, n - nzv = nonzer - call sprnvc( n, nzv, v, colidx, iv(1), iv(n+1) ) - call vecset( n, v, colidx, nzv, iouter, .5D0 ) - do ivelt = 1, nzv - jcol = colidx(ivelt) - if (jcol.ge.firstcol .and. jcol.le.lastcol) then - scale = size * v(ivelt) - do ivelt1 = 1, nzv - irow = colidx(ivelt1) - if (irow.ge.firstrow .and. irow.le.lastrow) then - nnza = nnza + 1 - if (nnza .gt. nz) goto 9999 - acol(nnza) = jcol - arow(nnza) = irow - aelt(nnza) = v(ivelt1) * scale - endif - enddo - endif - enddo - size = size * ratio - enddo - - -c--------------------------------------------------------------------- -c ... add the identity * rcond to the generated matrix to bound -c the smallest eigenvalue from below by rcond -c--------------------------------------------------------------------- - do i = firstrow, lastrow - if (i.ge.firstcol .and. i.le.lastcol) then - iouter = n + i - nnza = nnza + 1 - if (nnza .gt. nz) goto 9999 - acol(nnza) = i - arow(nnza) = i - aelt(nnza) = rcond - shift - endif - enddo - - -c--------------------------------------------------------------------- -c ... make the sparse matrix from list of elements with duplicates -c (v and iv are used as workspace) -c--------------------------------------------------------------------- - call sparse( a, colidx, rowstr, n, arow, acol, aelt, - > firstrow, lastrow, - > v, iv(1), iv(n+1), nnza ) - return - - 9999 continue - write(*,*) 'Space for matrix elements exceeded in makea' - write(*,*) 'nnza, nzmax = ',nnza, nz - write(*,*) ' iouter = ',iouter - - stop - end -c-------end of makea------------------------------ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - subroutine sparse( a, colidx, rowstr, n, arow, acol, aelt, - > firstrow, lastrow, - > x, mark, nzloc, nnza ) -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit logical (a-z) - integer colidx(*), rowstr(*) - integer firstrow, lastrow - integer n, arow(*), acol(*), nnza - double precision a(*), aelt(*) - -c--------------------------------------------------------------------- -c rows range from firstrow to lastrow -c the rowstr pointers are defined for nrows = lastrow-firstrow+1 values -c--------------------------------------------------------------------- - integer nzloc(n), nrows - double precision x(n) - logical mark(n) - -c--------------------------------------------------- -c generate a sparse matrix from a list of -c [col, row, element] tri -c--------------------------------------------------- - - integer i, j, jajp1, nza, k, nzrow - double precision xi - -c--------------------------------------------------------------------- -c how many rows of result -c--------------------------------------------------------------------- - nrows = lastrow - firstrow + 1 - -c--------------------------------------------------------------------- -c ...count the number of triples in each row -c--------------------------------------------------------------------- - do j = 1, n - rowstr(j) = 0 - mark(j) = .false. - enddo - rowstr(n+1) = 0 - - do nza = 1, nnza - j = (arow(nza) - firstrow + 1) + 1 - rowstr(j) = rowstr(j) + 1 - enddo - - rowstr(1) = 1 - do j = 2, nrows+1 - rowstr(j) = rowstr(j) + rowstr(j-1) - enddo - - -c--------------------------------------------------------------------- -c ... rowstr(j) now is the location of the first nonzero -c of row j of a -c--------------------------------------------------------------------- - - -c--------------------------------------------------------------------- -c ... do a bucket sort of the triples on the row index -c--------------------------------------------------------------------- - do nza = 1, nnza - j = arow(nza) - firstrow + 1 - k = rowstr(j) - a(k) = aelt(nza) - colidx(k) = acol(nza) - rowstr(j) = rowstr(j) + 1 - enddo - - -c--------------------------------------------------------------------- -c ... rowstr(j) now points to the first element of row j+1 -c--------------------------------------------------------------------- - do j = nrows, 1, -1 - rowstr(j+1) = rowstr(j) - enddo - rowstr(1) = 1 - - -c--------------------------------------------------------------------- -c ... generate the actual output rows by adding elements -c--------------------------------------------------------------------- - nza = 0 - do i = 1, n - x(i) = 0.0 - mark(i) = .false. - enddo - - jajp1 = rowstr(1) - do j = 1, nrows - nzrow = 0 - -c--------------------------------------------------------------------- -c ...loop over the jth row of a -c--------------------------------------------------------------------- - do k = jajp1 , rowstr(j+1)-1 - i = colidx(k) - x(i) = x(i) + a(k) - if ( (.not. mark(i)) .and. (x(i) .ne. 0.D0)) then - mark(i) = .true. - nzrow = nzrow + 1 - nzloc(nzrow) = i - endif - enddo - -c--------------------------------------------------------------------- -c ... extract the nonzeros of this row -c--------------------------------------------------------------------- - do k = 1, nzrow - i = nzloc(k) - mark(i) = .false. - xi = x(i) - x(i) = 0.D0 - if (xi .ne. 0.D0) then - nza = nza + 1 - a(nza) = xi - colidx(nza) = i - endif - enddo - jajp1 = rowstr(j+1) - rowstr(j+1) = nza + rowstr(1) - enddo -CC write (*, 11000) nza - return -11000 format ( //,'final nonzero count in sparse ', - 1 /,'number of nonzeros = ', i16 ) - end -c-------end of sparse----------------------------- - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - subroutine sprnvc( n, nz, v, iv, nzloc, mark ) -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit logical (a-z) - double precision v(*) - integer n, nz, iv(*), nzloc(n), nn1 - integer mark(n) - common /urando/ amult, tran - double precision amult, tran - - -c--------------------------------------------------------------------- -c generate a sparse n-vector (v, iv) -c having nzv nonzeros -c -c mark(i) is set to 1 if position i is nonzero. -c mark is all zero on entry and is reset to all zero before exit -c this corrects a performance bug found by John G. Lewis, caused by -c reinitialization of mark on every one of the n calls to sprnvc -c--------------------------------------------------------------------- - - integer nzrow, nzv, ii, i, icnvrt - - external randlc, icnvrt - double precision randlc, vecelt, vecloc - - - nzv = 0 - nzrow = 0 - nn1 = 1 - 50 continue - nn1 = 2 * nn1 - if (nn1 .lt. n) goto 50 - -c--------------------------------------------------------------------- -c nn1 is the smallest power of two not less than n -c--------------------------------------------------------------------- - -100 continue - if (nzv .ge. nz) goto 110 - vecelt = randlc( tran, amult ) - -c--------------------------------------------------------------------- -c generate an integer between 1 and n in a portable manner -c--------------------------------------------------------------------- - vecloc = randlc(tran, amult) - i = icnvrt(vecloc, nn1) + 1 - if (i .gt. n) goto 100 - -c--------------------------------------------------------------------- -c was this integer generated already? -c--------------------------------------------------------------------- - if (mark(i) .eq. 0) then - mark(i) = 1 - nzrow = nzrow + 1 - nzloc(nzrow) = i - nzv = nzv + 1 - v(nzv) = vecelt - iv(nzv) = i - endif - goto 100 -110 continue - do ii = 1, nzrow - i = nzloc(ii) - mark(i) = 0 - enddo - return - end -c-------end of sprnvc----------------------------- - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - function icnvrt(x, ipwr2) -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit logical (a-z) - double precision x - integer ipwr2, icnvrt - -c--------------------------------------------------------------------- -c scale a double precision number x in (0,1) by a power of 2 and chop it -c--------------------------------------------------------------------- - icnvrt = int(ipwr2 * x) - - return - end -c-------end of icnvrt----------------------------- - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - subroutine vecset(n, v, iv, nzv, i, val) -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit logical (a-z) - integer n, iv(*), nzv, i, k - double precision v(*), val - -c--------------------------------------------------------------------- -c set ith element of sparse vector (v, iv) with -c nzv nonzeros to val -c--------------------------------------------------------------------- - - logical set - - set = .false. - do k = 1, nzv - if (iv(k) .eq. i) then - v(k) = val - set = .true. - endif - enddo - if (.not. set) then - nzv = nzv + 1 - v(nzv) = val - iv(nzv) = i - endif - return - end -c-------end of vecset----------------------------- - diff --git a/examples/smpi/NAS/CG/mpinpb.h b/examples/smpi/NAS/CG/mpinpb.h deleted file mode 100644 index 1f0368c0b7..0000000000 --- a/examples/smpi/NAS/CG/mpinpb.h +++ /dev/null @@ -1,9 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'mpif.h' - - integer me, nprocs, root, dp_type - common /mpistuff/ me, nprocs, root, dp_type - diff --git a/examples/smpi/NAS/FT/Makefile b/examples/smpi/NAS/FT/Makefile deleted file mode 100644 index 1cc6e1416b..0000000000 --- a/examples/smpi/NAS/FT/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -SHELL=/bin/sh -BENCHMARK=ft -BENCHMARKU=FT - -include ../config/make.def - -include ../sys/make.common - -OBJS = ft.o ${COMMON}/${RAND}.o ${COMMON}/print_results.o ${COMMON}/timers.o - -${PROGRAM}: config ${OBJS} - ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} - - - -.f.o: - ${FCOMPILE} $< - -ft.o: ft.f global.h mpinpb.h npbparams.h - -clean: - - rm -f *.o *~ mputil* - - rm -f ft npbparams.h core diff --git a/examples/smpi/NAS/FT/README b/examples/smpi/NAS/FT/README deleted file mode 100644 index ab08b363b2..0000000000 --- a/examples/smpi/NAS/FT/README +++ /dev/null @@ -1,5 +0,0 @@ -This code implements the time integration of a three-dimensional -partial differential equation using the Fast Fourier Transform. -Some of the dimension statements are not F77 conforming and will -not work using the g77 compiler. All dimension statements, -however, are legal F90. \ No newline at end of file diff --git a/examples/smpi/NAS/FT/ft.f b/examples/smpi/NAS/FT/ft.f deleted file mode 100644 index 5e3a3b0b40..0000000000 --- a/examples/smpi/NAS/FT/ft.f +++ /dev/null @@ -1,1998 +0,0 @@ -!-------------------------------------------------------------------------! -! ! -! N A S P A R A L L E L B E N C H M A R K S 3.3 ! -! ! -! F T ! -! ! -!-------------------------------------------------------------------------! -! ! -! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! -! It is described in NAS Technical Reports 95-020 and 02-007 ! -! ! -! Permission to use, copy, distribute and modify this software ! -! for any purpose with or without fee is hereby granted. We ! -! request, however, that all derived work reference the NAS ! -! Parallel Benchmarks 3.3. This software is provided "as is" ! -! without express or implied warranty. ! -! ! -! Information on NPB 3.3, including the technical report, the ! -! original specifications, source code, results and information ! -! on how to submit new results, is available at: ! -! ! -! http://www.nas.nasa.gov/Software/NPB/ ! -! ! -! Send comments or suggestions to npb@nas.nasa.gov ! -! ! -! NAS Parallel Benchmarks Group ! -! NASA Ames Research Center ! -! Mail Stop: T27A-1 ! -! Moffett Field, CA 94035-1000 ! -! ! -! E-mail: npb@nas.nasa.gov ! -! Fax: (650) 604-3957 ! -! ! -!-------------------------------------------------------------------------! - -!TO REDUCE THE AMOUNT OF MEMORY REQUIRED BY THE BENCHMARK WE NO LONGER -!STORE THE ENTIRE TIME EVOLUTION ARRAY "EX" FOR ALL TIME STEPS, BUT -!JUST FOR THE FIRST. ALSO, IT IS STORED ONLY FOR THE PART OF THE GRID -!FOR WHICH THE CALLING PROCESSOR IS RESPONSIBLE, SO THAT THE MEMORY -!USAGE BECOMES SCALABLE. THIS NEW ARRAY IS CALLED "TWIDDLE" (SEE -!NPB3.0-SER) - -!TO AVOID PROBLEMS WITH VERY LARGE ARRAY SIZES THAT ARE COMPUTED BY -!MULTIPLYING GRID DIMENSIONS (CAUSING INTEGER OVERFLOW IN THE VARIABLE -!NTOTAL) AND SUBSEQUENTLY DIVIDING BY THE NUMBER OF PROCESSORS, WE -!COMPUTE THE SIZE OF ARRAY PARTITIONS MORE CONSERVATIVELY AS -!((NX*NY)/NP)*NZ, WHERE NX, NY, AND NZ ARE GRID DIMENSIONS AND NP IS -!THE NUMBER OF PROCESSORS, THE RESULT IS STORED IN "NTDIVNP". FOR THE -!PERFORMANCE CALCULATION WE STORE THE TOTAL NUMBER OF GRID POINTS IN A -!FLOATING POINT NUMBER "NTOTAL_F" INSTEAD OF AN INTEGER. -!THIS FIX WILL FAIL IF THE NUMBER OF PROCESSORS IS SMALL. - -!UGLY HACK OF SUBROUTINE IPOW46: FOR VERY LARGE GRIDS THE SINGLE EXPONENT -!FROM NPB2.3 MAY NOT FIT IN A 32-BIT INTEGER. HOWEVER, WE KNOW THAT THE -!"EXPONENT" ARGUMENT OF THIS ROUTINE CAN ALWAYS BE FACTORED INTO A TERM -!DIVISIBLE BY NX (EXP_1) AND ANOTHER TERM (EXP_2). NX IS USUALLY A POWER -!OF TWO, SO WE CAN KEEP HALVING IT UNTIL THE PRODUCT OF EXP_1 -!AND EXP_2 IS SMALL ENOUGH (NAMELY EXP_2 ITSELF). THIS UPDATED VERSION -!OF IPWO46, WHICH NOW TAKES THE TWO FACTORS OF "EXPONENT" AS SEPARATE -!ARGUMENTS, MAY BREAK DOWN IF EXP_1 DOES NOT CONTAIN A LARGE POWER OF TWO. - -c--------------------------------------------------------------------- -c -c Authors: D. Bailey -c W. Saphir -c R. F. Van der Wijngaart -c -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c FT benchmark -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - program ft - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpif.h' - include 'global.h' - integer i, ierr - -c--------------------------------------------------------------------- -c u0, u1, u2 are the main arrays in the problem. -c Depending on the decomposition, these arrays will have different -c dimensions. To accomodate all possibilities, we allocate them as -c one-dimensional arrays and pass them to subroutines for different -c views -c - u0 contains the initial (transformed) initial condition -c - u1 and u2 are working arrays -c--------------------------------------------------------------------- - - double complex u0(ntdivnp), - > u1(ntdivnp), - > u2(ntdivnp) - double precision twiddle(ntdivnp) -c--------------------------------------------------------------------- -c Large arrays are in common so that they are allocated on the -c heap rather than the stack. This common block is not -c referenced directly anywhere else. Padding is to avoid accidental -c cache problems, since all array sizes are powers of two. -c--------------------------------------------------------------------- - - double complex pad1(3), pad2(3), pad3(3) - common /bigarrays/ u0, pad1, u1, pad2, u2, pad3, twiddle - - integer iter - double precision total_time, mflops - logical verified - character class - - call MPI_Init(ierr) - -c--------------------------------------------------------------------- -c Run the entire problem once to make sure all data is touched. -c This reduces variable startup costs, which is important for such a -c short benchmark. The other NPB 2 implementations are similar. -c--------------------------------------------------------------------- - do i = 1, t_max - call timer_clear(i) - end do - - call setup() - call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3)) - call compute_initial_conditions(u1, dims(1,1), dims(2,1), - > dims(3,1)) - call fft_init (dims(1,1)) - call fft(1, u1, u0) - -c--------------------------------------------------------------------- -c Start over from the beginning. Note that all operations must -c be timed, in contrast to other benchmarks. -c--------------------------------------------------------------------- - do i = 1, t_max - call timer_clear(i) - end do - call MPI_Barrier(MPI_COMM_WORLD, ierr) - - call timer_start(T_total) - if (timers_enabled) call timer_start(T_setup) - - call compute_indexmap(twiddle, dims(1,3), dims(2,3), dims(3,3)) - call compute_initial_conditions(u1, dims(1,1), dims(2,1), - > dims(3,1)) - call fft_init (dims(1,1)) - - if (timers_enabled) call synchup() - if (timers_enabled) call timer_stop(T_setup) - - if (timers_enabled) call timer_start(T_fft) - call fft(1, u1, u0) - if (timers_enabled) call timer_stop(T_fft) - - do iter = 1, niter - if (timers_enabled) call timer_start(T_evolve) - call evolve(u0, u1, twiddle, dims(1,1), dims(2,1), dims(3,1)) - if (timers_enabled) call timer_stop(T_evolve) - if (timers_enabled) call timer_start(T_fft) - call fft(-1, u1, u2) - if (timers_enabled) call timer_stop(T_fft) - if (timers_enabled) call synchup() - if (timers_enabled) call timer_start(T_checksum) - call checksum(iter, u2, dims(1,1), dims(2,1), dims(3,1)) - if (timers_enabled) call timer_stop(T_checksum) - end do - - call verify(nx, ny, nz, niter, verified, class) - call timer_stop(t_total) - if (np .ne. np_min) verified = .false. - total_time = timer_read(t_total) - - if( total_time .ne. 0. ) then - mflops = 1.0d-6*ntotal_f * - > (14.8157+7.19641*log(ntotal_f) - > + (5.23518+7.21113*log(ntotal_f))*niter) - > /total_time - else - mflops = 0.0 - endif - if (me .eq. 0) then - call print_results('FT', class, nx, ny, nz, niter, np_min, np, - > total_time, mflops, ' floating point', verified, - > npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, cs7) - endif - if (timers_enabled) call print_timers() - call MPI_Finalize(ierr) - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine evolve(u0, u1, twiddle, d1, d2, d3) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c evolve u0 -> u1 (t time steps) in fourier space -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - integer d1, d2, d3 - double precision exi - double complex u0(d1,d2,d3) - double complex u1(d1,d2,d3) - double precision twiddle(d1,d2,d3) - integer i, j, k - - do k = 1, d3 - do j = 1, d2 - do i = 1, d1 - u0(i,j,k) = u0(i,j,k)*(twiddle(i,j,k)) - u1(i,j,k) = u0(i,j,k) - end do - end do - end do - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine compute_initial_conditions(u0, d1, d2, d3) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Fill in array u0 with initial conditions from -c random number generator -c--------------------------------------------------------------------- - implicit none - include 'global.h' - integer d1, d2, d3 - double complex u0(d1, d2, d3) - integer k - double precision x0, start, an, dummy - -c--------------------------------------------------------------------- -c 0-D and 1-D layouts are easy because each processor gets a contiguous -c chunk of the array, in the Fortran ordering sense. -c For a 2-D layout, it's a bit more complicated. We always -c have entire x-lines (contiguous) in processor. -c We can do ny/np1 of them at a time since we have -c ny/np1 contiguous in y-direction. But then we jump -c by z-planes (nz/np2 of them, total). -c For the 0-D and 1-D layouts we could do larger chunks, but -c this turns out to have no measurable impact on performance. -c--------------------------------------------------------------------- - - - start = seed -c--------------------------------------------------------------------- -c Jump to the starting element for our first plane. -c--------------------------------------------------------------------- - call ipow46(a, 2*nx, (zstart(1)-1)*ny + (ystart(1)-1), an) - dummy = randlc(start, an) - call ipow46(a, 2*nx, ny, an) - -c--------------------------------------------------------------------- -c Go through by z planes filling in one square at a time. -c--------------------------------------------------------------------- - do k = 1, dims(3, 1) ! nz/np2 - x0 = start - call vranlc(2*nx*dims(2, 1), x0, a, u0(1, 1, k)) - if (k .ne. dims(3, 1)) dummy = randlc(start, an) - end do - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine ipow46(a, exp_1, exp_2, result) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c compute a^exponent mod 2^46 -c--------------------------------------------------------------------- - - implicit none - double precision a, result, dummy, q, r - integer exp_1, exp_2, n, n2, ierr - external randlc - double precision randlc - logical two_pow -c--------------------------------------------------------------------- -c Use -c a^n = a^(n/2)*a^(n/2) if n even else -c a^n = a*a^(n-1) if n odd -c--------------------------------------------------------------------- - result = 1 - if (exp_2 .eq. 0 .or. exp_1 .eq. 0) return - q = a - r = 1 - n = exp_1 - two_pow = .true. - - do while (two_pow) - n2 = n/2 - if (n2 * 2 .eq. n) then - dummy = randlc(q, q) - n = n2 - else - n = n * exp_2 - two_pow = .false. - endif - end do - - do while (n .gt. 1) - n2 = n/2 - if (n2 * 2 .eq. n) then - dummy = randlc(q, q) - n = n2 - else - dummy = randlc(r, q) - n = n-1 - endif - end do - dummy = randlc(r, q) - result = r - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine setup - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'mpinpb.h' - include 'global.h' - - integer ierr, i, j, fstatus - debug = .FALSE. - - call MPI_Comm_size(MPI_COMM_WORLD, np, ierr) - call MPI_Comm_rank(MPI_COMM_WORLD, me, ierr) - - if (.not. convertdouble) then - dc_type = MPI_DOUBLE_COMPLEX - else - dc_type = MPI_COMPLEX - endif - - - if (me .eq. 0) then - write(*, 1000) - open (unit=2,file='inputft.data',status='old', iostat=fstatus) - - if (fstatus .eq. 0) then - write(*,233) - 233 format(' Reading from input file inputft.data') - read (2,*) niter - read (2,*) layout_type - read (2,*) np1, np2 - close(2) - -c--------------------------------------------------------------------- -c check to make sure input data is consistent -c--------------------------------------------------------------------- - - -c--------------------------------------------------------------------- -c 1. product of processor grid dims must equal number of processors -c--------------------------------------------------------------------- - - if (np1 * np2 .ne. np) then - write(*, 238) - 238 format(' np1 and np2 given in input file are not valid.') - write(*, 239) np1*np2, np - 239 format(' Product is ', i5, ' and should be ', i5) - call MPI_Abort(MPI_COMM_WORLD, 1, ierr) - stop - endif - -c--------------------------------------------------------------------- -c 2. layout type must be valid -c--------------------------------------------------------------------- - - if (layout_type .ne. layout_0D .and. - > layout_type .ne. layout_1D .and. - > layout_type .ne. layout_2D) then - write(*, 240) - 240 format(' Layout type specified in inputft.data is - > invalid ') - call MPI_Abort(MPI_COMM_WORLD, 1, ierr) - stop - endif - -c--------------------------------------------------------------------- -c 3. 0D layout must be 1x1 grid -c--------------------------------------------------------------------- - - if (layout_type .eq. layout_0D .and. - > (np1 .ne.1 .or. np2 .ne. 1)) then - write(*, 241) - 241 format(' For 0D layout, both np1 and np2 must be 1 ') - call MPI_Abort(MPI_COMM_WORLD, 1, ierr) - stop - endif -c--------------------------------------------------------------------- -c 4. 1D layout must be 1xN grid -c--------------------------------------------------------------------- - - if (layout_type .eq. layout_1D .and. np1 .ne. 1) then - write(*, 242) - 242 format(' For 1D layout, np1 must be 1 ') - call MPI_Abort(MPI_COMM_WORLD, 1, ierr) - stop - endif - - else - write(*,234) - niter = niter_default - if (np .eq. 1) then - np1 = 1 - np2 = 1 - layout_type = layout_0D - else if (np .le. nz) then - np1 = 1 - np2 = np - layout_type = layout_1D - else - np1 = nz - np2 = np/nz - layout_type = layout_2D - endif - endif - - if (np .lt. np_min) then - write(*, 10) np_min - 10 format(' Error: Compiled for ', I5, ' processors. ') - write(*, 11) np - 11 format(' Only ', i5, ' processors found ') - call MPI_Abort(MPI_COMM_WORLD, 1, ierr) - stop - endif - - 234 format(' No input file inputft.data. Using compiled defaults') - write(*, 1001) nx, ny, nz - write(*, 1002) niter - write(*, 1004) np - write(*, 1005) np1, np2 - if (np .ne. np_min) write(*, 1006) np_min - - if (layout_type .eq. layout_0D) then - write(*, 1010) '0D' - else if (layout_type .eq. layout_1D) then - write(*, 1010) '1D' - else - write(*, 1010) '2D' - endif - - 1000 format(//,' NAS Parallel Benchmarks 3.3 -- FT Benchmark',/) - 1001 format(' Size : ', i4, 'x', i4, 'x', i4) - 1002 format(' Iterations : ', 7x, i7) - 1004 format(' Number of processes : ', 7x, i7) - 1005 format(' Processor array : ', 5x, i4, 'x', i4) - 1006 format(' WARNING: compiled for ', i5, ' processes. ', - > ' Will not verify. ') - 1010 format(' Layout type : ', 9x, A5) - endif - - -c--------------------------------------------------------------------- -c Since np1, np2 and layout_type are in a common block, -c this sends all three. -c--------------------------------------------------------------------- - call MPI_BCAST(np1, 3, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) - call MPI_BCAST(niter, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr) - - if (np1 .eq. 1 .and. np2 .eq. 1) then - layout_type = layout_0D - else if (np1 .eq. 1) then - layout_type = layout_1D - else - layout_type = layout_2D - endif - - if (layout_type .eq. layout_0D) then - do i = 1, 3 - dims(1, i) = nx - dims(2, i) = ny - dims(3, i) = nz - end do - else if (layout_type .eq. layout_1D) then - dims(1, 1) = nx - dims(2, 1) = ny - dims(3, 1) = nz - - dims(1, 2) = nx - dims(2, 2) = ny - dims(3, 2) = nz - - dims(1, 3) = nz - dims(2, 3) = nx - dims(3, 3) = ny - else if (layout_type .eq. layout_2D) then - dims(1, 1) = nx - dims(2, 1) = ny - dims(3, 1) = nz - - dims(1, 2) = ny - dims(2, 2) = nx - dims(3, 2) = nz - - dims(1, 3) = nz - dims(2, 3) = nx - dims(3, 3) = ny - - endif - do i = 1, 3 - dims(2, i) = dims(2, i) / np1 - dims(3, i) = dims(3, i) / np2 - end do - - -c--------------------------------------------------------------------- -c Determine processor coordinates of this processor -c Processor grid is np1xnp2. -c Arrays are always (n1, n2/np1, n3/np2) -c Processor coords are zero-based. -c--------------------------------------------------------------------- - me2 = mod(me, np2) ! goes from 0...np2-1 - me1 = me/np2 ! goes from 0...np1-1 -c--------------------------------------------------------------------- -c Communicators for rows/columns of processor grid. -c commslice1 is communicator of all procs with same me1, ranked as me2 -c commslice2 is communicator of all procs with same me2, ranked as me1 -c mpi_comm_split(comm, color, key, ...) -c--------------------------------------------------------------------- - call MPI_Comm_split(MPI_COMM_WORLD, me1, me2, commslice1, ierr) - call MPI_Comm_split(MPI_COMM_WORLD, me2, me1, commslice2, ierr) - if (timers_enabled) call synchup() - - if (debug) print *, 'proc coords: ', me, me1, me2 - -c--------------------------------------------------------------------- -c Determine which section of the grid is owned by this -c processor. -c--------------------------------------------------------------------- - if (layout_type .eq. layout_0d) then - - do i = 1, 3 - xstart(i) = 1 - xend(i) = nx - ystart(i) = 1 - yend(i) = ny - zstart(i) = 1 - zend(i) = nz - end do - - else if (layout_type .eq. layout_1d) then - - xstart(1) = 1 - xend(1) = nx - ystart(1) = 1 - yend(1) = ny - zstart(1) = 1 + me2 * nz/np2 - zend(1) = (me2+1) * nz/np2 - - xstart(2) = 1 - xend(2) = nx - ystart(2) = 1 - yend(2) = ny - zstart(2) = 1 + me2 * nz/np2 - zend(2) = (me2+1) * nz/np2 - - xstart(3) = 1 - xend(3) = nx - ystart(3) = 1 + me2 * ny/np2 - yend(3) = (me2+1) * ny/np2 - zstart(3) = 1 - zend(3) = nz - - else if (layout_type .eq. layout_2d) then - - xstart(1) = 1 - xend(1) = nx - ystart(1) = 1 + me1 * ny/np1 - yend(1) = (me1+1) * ny/np1 - zstart(1) = 1 + me2 * nz/np2 - zend(1) = (me2+1) * nz/np2 - - xstart(2) = 1 + me1 * nx/np1 - xend(2) = (me1+1)*nx/np1 - ystart(2) = 1 - yend(2) = ny - zstart(2) = zstart(1) - zend(2) = zend(1) - - xstart(3) = xstart(2) - xend(3) = xend(2) - ystart(3) = 1 + me2 *ny/np2 - yend(3) = (me2+1)*ny/np2 - zstart(3) = 1 - zend(3) = nz - endif - -c--------------------------------------------------------------------- -c Set up info for blocking of ffts and transposes. This improves -c performance on cache-based systems. Blocking involves -c working on a chunk of the problem at a time, taking chunks -c along the first, second, or third dimension. -c -c - In cffts1 blocking is on 2nd dimension (with fft on 1st dim) -c - In cffts2/3 blocking is on 1st dimension (with fft on 2nd and 3rd dims) - -c Since 1st dim is always in processor, we'll assume it's long enough -c (default blocking factor is 16 so min size for 1st dim is 16) -c The only case we have to worry about is cffts1 in a 2d decomposition. -c so the blocking factor should not be larger than the 2nd dimension. -c--------------------------------------------------------------------- - - fftblock = fftblock_default - fftblockpad = fftblockpad_default - - if (layout_type .eq. layout_2d) then - if (dims(2, 1) .lt. fftblock) fftblock = dims(2, 1) - if (dims(2, 2) .lt. fftblock) fftblock = dims(2, 2) - if (dims(2, 3) .lt. fftblock) fftblock = dims(2, 3) - endif - - if (fftblock .ne. fftblock_default) fftblockpad = fftblock+3 - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine compute_indexmap(twiddle, d1, d2, d3) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c compute function from local (i,j,k) to ibar^2+jbar^2+kbar^2 -c for time evolution exponent. -c--------------------------------------------------------------------- - - implicit none - include 'mpinpb.h' - include 'global.h' - integer d1, d2, d3 - integer i, j, k, ii, ii2, jj, ij2, kk - double precision ap, twiddle(d1, d2, d3) - -c--------------------------------------------------------------------- -c this function is very different depending on whether -c we are in the 0d, 1d or 2d layout. Compute separately. -c basically we want to convert the fortran indices -c 1 2 3 4 5 6 7 8 -c to -c 0 1 2 3 -4 -3 -2 -1 -c The following magic formula does the trick: -c mod(i-1+n/2, n) - n/2 -c--------------------------------------------------------------------- - - ap = - 4.d0 * alpha * pi *pi - - if (layout_type .eq. layout_0d) then ! xyz layout - do i = 1, dims(1,3) - ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2 - ii2 = ii*ii - do j = 1, dims(2,3) - jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2 - ij2 = jj*jj+ii2 - do k = 1, dims(3,3) - kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2 - twiddle(i,j,k) = dexp(ap*dfloat(kk*kk+ij2)) - end do - end do - end do - else if (layout_type .eq. layout_1d) then ! zxy layout - do i = 1,dims(2,3) - ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2 - ii2 = ii*ii - do j = 1,dims(3,3) - jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2 - ij2 = jj*jj+ii2 - do k = 1,dims(1,3) - kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2 - twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2)) - end do - end do - end do - else if (layout_type .eq. layout_2d) then ! zxy layout - do i = 1,dims(2,3) - ii = mod(i+xstart(3)-2+nx/2, nx) - nx/2 - ii2 = ii*ii - do j = 1, dims(3,3) - jj = mod(j+ystart(3)-2+ny/2, ny) - ny/2 - ij2 = jj*jj+ii2 - do k =1,dims(1,3) - kk = mod(k+zstart(3)-2+nz/2, nz) - nz/2 - twiddle(k,i,j) = dexp(ap*dfloat(kk*kk+ij2)) - end do - end do - end do - else - print *, ' Unknown layout type ', layout_type - stop - endif - - return - end - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine print_timers() - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - integer i - include 'global.h' - character*25 tstrings(T_max) - data tstrings / ' total ', - > ' setup ', - > ' fft ', - > ' evolve ', - > ' checksum ', - > ' fftlow ', - > ' fftcopy ', - > ' transpose ', - > ' transpose1_loc ', - > ' transpose1_glo ', - > ' transpose1_fin ', - > ' transpose2_loc ', - > ' transpose2_glo ', - > ' transpose2_fin ', - > ' sync ' / - - if (me .ne. 0) return - do i = 1, t_max - if (timer_read(i) .ne. 0.0d0) then - write(*, 100) i, tstrings(i), timer_read(i) - endif - end do - 100 format(' timer ', i2, '(', A16, ') :', F10.6) - return - end - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine fft(dir, x1, x2) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - integer dir - double complex x1(ntdivnp), x2(ntdivnp) - - double complex scratch(fftblockpad_default*maxdim*2) - -c--------------------------------------------------------------------- -c note: args x1, x2 must be different arrays -c note: args for cfftsx are (direction, layout, xin, xout, scratch) -c xin/xout may be the same and it can be somewhat faster -c if they are -c note: args for transpose are (layout1, layout2, xin, xout) -c xin/xout must be different -c--------------------------------------------------------------------- - - if (dir .eq. 1) then - if (layout_type .eq. layout_0d) then - call cffts1(1, dims(1,1), dims(2,1), dims(3,1), - > x1, x1, scratch) - call cffts2(1, dims(1,2), dims(2,2), dims(3,2), - > x1, x1, scratch) - call cffts3(1, dims(1,3), dims(2,3), dims(3,3), - > x1, x2, scratch) - else if (layout_type .eq. layout_1d) then - call cffts1(1, dims(1,1), dims(2,1), dims(3,1), - > x1, x1, scratch) - call cffts2(1, dims(1,2), dims(2,2), dims(3,2), - > x1, x1, scratch) - if (timers_enabled) call timer_start(T_transpose) - call transpose_xy_z(2, 3, x1, x2) - if (timers_enabled) call timer_stop(T_transpose) - call cffts1(1, dims(1,3), dims(2,3), dims(3,3), - > x2, x2, scratch) - else if (layout_type .eq. layout_2d) then - call cffts1(1, dims(1,1), dims(2,1), dims(3,1), - > x1, x1, scratch) - if (timers_enabled) call timer_start(T_transpose) - call transpose_x_y(1, 2, x1, x2) - if (timers_enabled) call timer_stop(T_transpose) - call cffts1(1, dims(1,2), dims(2,2), dims(3,2), - > x2, x2, scratch) - if (timers_enabled) call timer_start(T_transpose) - call transpose_x_z(2, 3, x2, x1) - if (timers_enabled) call timer_stop(T_transpose) - call cffts1(1, dims(1,3), dims(2,3), dims(3,3), - > x1, x2, scratch) - endif - else - if (layout_type .eq. layout_0d) then - call cffts3(-1, dims(1,3), dims(2,3), dims(3,3), - > x1, x1, scratch) - call cffts2(-1, dims(1,2), dims(2,2), dims(3,2), - > x1, x1, scratch) - call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), - > x1, x2, scratch) - else if (layout_type .eq. layout_1d) then - call cffts1(-1, dims(1,3), dims(2,3), dims(3,3), - > x1, x1, scratch) - if (timers_enabled) call timer_start(T_transpose) - call transpose_x_yz(3, 2, x1, x2) - if (timers_enabled) call timer_stop(T_transpose) - call cffts2(-1, dims(1,2), dims(2,2), dims(3,2), - > x2, x2, scratch) - call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), - > x2, x2, scratch) - else if (layout_type .eq. layout_2d) then - call cffts1(-1, dims(1,3), dims(2,3), dims(3,3), - > x1, x1, scratch) - if (timers_enabled) call timer_start(T_transpose) - call transpose_x_z(3, 2, x1, x2) - if (timers_enabled) call timer_stop(T_transpose) - call cffts1(-1, dims(1,2), dims(2,2), dims(3,2), - > x2, x2, scratch) - if (timers_enabled) call timer_start(T_transpose) - call transpose_x_y(2, 1, x2, x1) - if (timers_enabled) call timer_stop(T_transpose) - call cffts1(-1, dims(1,1), dims(2,1), dims(3,1), - > x1, x2, scratch) - endif - endif - return - end - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine cffts1(is, d1, d2, d3, x, xout, y) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'global.h' - integer is, d1, d2, d3, logd1 - double complex x(d1,d2,d3) - double complex xout(d1,d2,d3) - double complex y(fftblockpad, d1, 2) - integer i, j, k, jj - - logd1 = ilog2(d1) - - do k = 1, d3 - do jj = 0, d2 - fftblock, fftblock - if (timers_enabled) call timer_start(T_fftcopy) - do j = 1, fftblock - do i = 1, d1 - y(j,i,1) = x(i,j+jj,k) - enddo - enddo - if (timers_enabled) call timer_stop(T_fftcopy) - - if (timers_enabled) call timer_start(T_fftlow) - call cfftz (is, logd1, d1, y, y(1,1,2)) - if (timers_enabled) call timer_stop(T_fftlow) - - if (timers_enabled) call timer_start(T_fftcopy) - do j = 1, fftblock - do i = 1, d1 - xout(i,j+jj,k) = y(j,i,1) - enddo - enddo - if (timers_enabled) call timer_stop(T_fftcopy) - enddo - enddo - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine cffts2(is, d1, d2, d3, x, xout, y) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'global.h' - integer is, d1, d2, d3, logd2 - double complex x(d1,d2,d3) - double complex xout(d1,d2,d3) - double complex y(fftblockpad, d2, 2) - integer i, j, k, ii - - logd2 = ilog2(d2) - - do k = 1, d3 - do ii = 0, d1 - fftblock, fftblock - if (timers_enabled) call timer_start(T_fftcopy) - do j = 1, d2 - do i = 1, fftblock - y(i,j,1) = x(i+ii,j,k) - enddo - enddo - if (timers_enabled) call timer_stop(T_fftcopy) - - if (timers_enabled) call timer_start(T_fftlow) - call cfftz (is, logd2, d2, y, y(1, 1, 2)) - if (timers_enabled) call timer_stop(T_fftlow) - - if (timers_enabled) call timer_start(T_fftcopy) - do j = 1, d2 - do i = 1, fftblock - xout(i+ii,j,k) = y(i,j,1) - enddo - enddo - if (timers_enabled) call timer_stop(T_fftcopy) - enddo - enddo - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine cffts3(is, d1, d2, d3, x, xout, y) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'global.h' - integer is, d1, d2, d3, logd3 - double complex x(d1,d2,d3) - double complex xout(d1,d2,d3) - double complex y(fftblockpad, d3, 2) - integer i, j, k, ii - - logd3 = ilog2(d3) - - do j = 1, d2 - do ii = 0, d1 - fftblock, fftblock - if (timers_enabled) call timer_start(T_fftcopy) - do k = 1, d3 - do i = 1, fftblock - y(i,k,1) = x(i+ii,j,k) - enddo - enddo - if (timers_enabled) call timer_stop(T_fftcopy) - - if (timers_enabled) call timer_start(T_fftlow) - call cfftz (is, logd3, d3, y, y(1, 1, 2)) - if (timers_enabled) call timer_stop(T_fftlow) - - if (timers_enabled) call timer_start(T_fftcopy) - do k = 1, d3 - do i = 1, fftblock - xout(i+ii,j,k) = y(i,k,1) - enddo - enddo - if (timers_enabled) call timer_stop(T_fftcopy) - enddo - enddo - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine fft_init (n) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c compute the roots-of-unity array that will be used for subsequent FFTs. -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - - integer m,n,nu,ku,i,j,ln - double precision t, ti - - -c--------------------------------------------------------------------- -c Initialize the U array with sines and cosines in a manner that permits -c stride one access at each FFT iteration. -c--------------------------------------------------------------------- - nu = n - m = ilog2(n) - u(1) = m - ku = 2 - ln = 1 - - do j = 1, m - t = pi / ln - - do i = 0, ln - 1 - ti = i * t - u(i+ku) = dcmplx (cos (ti), sin(ti)) - enddo - - ku = ku + ln - ln = 2 * ln - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine cfftz (is, m, n, x, y) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Computes NY N-point complex-to-complex FFTs of X using an algorithm due -c to Swarztrauber. X is both the input and the output array, while Y is a -c scratch array. It is assumed that N = 2^M. Before calling CFFTZ to -c perform FFTs, the array U must be initialized by calling CFFTZ with IS -c set to 0 and M set to MX, where MX is the maximum value of M for any -c subsequent call. -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - - integer is,m,n,i,j,l,mx - double complex x, y - - dimension x(fftblockpad,n), y(fftblockpad,n) - -c--------------------------------------------------------------------- -c Check if input parameters are invalid. -c--------------------------------------------------------------------- - mx = u(1) - if ((is .ne. 1 .and. is .ne. -1) .or. m .lt. 1 .or. m .gt. mx) - > then - write (*, 1) is, m, mx - 1 format ('CFFTZ: Either U has not been initialized, or else'/ - > 'one of the input parameters is invalid', 3I5) - stop - endif - -c--------------------------------------------------------------------- -c Perform one variant of the Stockham FFT. -c--------------------------------------------------------------------- - do l = 1, m, 2 - call fftz2 (is, l, m, n, fftblock, fftblockpad, u, x, y) - if (l .eq. m) goto 160 - call fftz2 (is, l + 1, m, n, fftblock, fftblockpad, u, y, x) - enddo - - goto 180 - -c--------------------------------------------------------------------- -c Copy Y to X. -c--------------------------------------------------------------------- - 160 do j = 1, n - do i = 1, fftblock - x(i,j) = y(i,j) - enddo - enddo - - 180 continue - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine fftz2 (is, l, m, n, ny, ny1, u, x, y) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Performs the L-th iteration of the second variant of the Stockham FFT. -c--------------------------------------------------------------------- - - implicit none - - integer is,k,l,m,n,ny,ny1,n1,li,lj,lk,ku,i,j,i11,i12,i21,i22 - double complex u,x,y,u1,x11,x21 - dimension u(n), x(ny1,n), y(ny1,n) - - -c--------------------------------------------------------------------- -c Set initial parameters. -c--------------------------------------------------------------------- - - n1 = n / 2 - lk = 2 ** (l - 1) - li = 2 ** (m - l) - lj = 2 * lk - ku = li + 1 - - do i = 0, li - 1 - i11 = i * lk + 1 - i12 = i11 + n1 - i21 = i * lj + 1 - i22 = i21 + lk - if (is .ge. 1) then - u1 = u(ku+i) - else - u1 = dconjg (u(ku+i)) - endif - -c--------------------------------------------------------------------- -c This loop is vectorizable. -c--------------------------------------------------------------------- - do k = 0, lk - 1 - do j = 1, ny - x11 = x(j,i11+k) - x21 = x(j,i12+k) - y(j,i21+k) = x11 + x21 - y(j,i22+k) = u1 * (x11 - x21) - enddo - enddo - enddo - - return - end - -c--------------------------------------------------------------------- - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - integer function ilog2(n) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - integer n, nn, lg - if (n .eq. 1) then - ilog2=0 - return - endif - lg = 1 - nn = 2 - do while (nn .lt. n) - nn = nn*2 - lg = lg+1 - end do - ilog2 = lg - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose_x_yz(l1, l2, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - integer l1, l2 - double complex xin(ntdivnp), xout(ntdivnp) - - call transpose2_local(dims(1,l1),dims(2, l1)*dims(3, l1), - > xin, xout) - - call transpose2_global(xout, xin) - - call transpose2_finish(dims(1,l1),dims(2, l1)*dims(3, l1), - > xin, xout) - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose_xy_z(l1, l2, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - integer l1, l2 - double complex xin(ntdivnp), xout(ntdivnp) - - call transpose2_local(dims(1,l1)*dims(2, l1),dims(3, l1), - > xin, xout) - call transpose2_global(xout, xin) - call transpose2_finish(dims(1,l1)*dims(2, l1),dims(3, l1), - > xin, xout) - - return - end - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose2_local(n1, n2, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'mpinpb.h' - include 'global.h' - integer n1, n2 - double complex xin(n1, n2), xout(n2, n1) - - double complex z(transblockpad, transblock) - - integer i, j, ii, jj - - if (timers_enabled) call timer_start(T_transxzloc) - -c--------------------------------------------------------------------- -c If possible, block the transpose for cache memory systems. -c How much does this help? Example: R8000 Power Challenge (90 MHz) -c Blocked version decreases time spend in this routine -c from 14 seconds to 5.2 seconds on 8 nodes class A. -c--------------------------------------------------------------------- - - if (n1 .lt. transblock .or. n2 .lt. transblock) then - if (n1 .ge. n2) then - do j = 1, n2 - do i = 1, n1 - xout(j, i) = xin(i, j) - end do - end do - else - do i = 1, n1 - do j = 1, n2 - xout(j, i) = xin(i, j) - end do - end do - endif - else - do j = 0, n2-1, transblock - do i = 0, n1-1, transblock - -c--------------------------------------------------------------------- -c Note: compiler should be able to take j+jj out of inner loop -c--------------------------------------------------------------------- - do jj = 1, transblock - do ii = 1, transblock - z(jj,ii) = xin(i+ii, j+jj) - end do - end do - - do ii = 1, transblock - do jj = 1, transblock - xout(j+jj, i+ii) = z(jj,ii) - end do - end do - - end do - end do - endif - if (timers_enabled) call timer_stop(T_transxzloc) - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose2_global(xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - include 'mpinpb.h' - double complex xin(ntdivnp) - double complex xout(ntdivnp) - integer ierr - - if (timers_enabled) call synchup() - - if (timers_enabled) call timer_start(T_transxzglo) - call mpi_alltoall(xin, ntdivnp/np, dc_type, - > xout, ntdivnp/np, dc_type, - > commslice1, ierr) - if (timers_enabled) call timer_stop(T_transxzglo) - - return - end - - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose2_finish(n1, n2, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - integer n1, n2, ioff - double complex xin(n2, n1/np2, 0:np2-1), xout(n2*np2, n1/np2) - - integer i, j, p - - if (timers_enabled) call timer_start(T_transxzfin) - do p = 0, np2-1 - ioff = p*n2 - do j = 1, n1/np2 - do i = 1, n2 - xout(i+ioff, j) = xin(i, j, p) - end do - end do - end do - if (timers_enabled) call timer_stop(T_transxzfin) - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose_x_z(l1, l2, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - integer l1, l2 - double complex xin(ntdivnp), xout(ntdivnp) - - call transpose_x_z_local(dims(1,l1),dims(2,l1),dims(3,l1), - > xin, xout) - call transpose_x_z_global(dims(1,l1),dims(2,l1),dims(3,l1), - > xout, xin) - call transpose_x_z_finish(dims(1,l2),dims(2,l2),dims(3,l2), - > xin, xout) - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose_x_z_local(d1, d2, d3, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - integer d1, d2, d3 - double complex xin(d1,d2,d3) - double complex xout(d3,d2,d1) - integer block1, block3 - integer i, j, k, kk, ii, i1, k1 - - double complex buf(transblockpad, maxdim) - if (timers_enabled) call timer_start(T_transxzloc) - if (d1 .lt. 32) goto 100 - block3 = d3 - if (block3 .eq. 1) goto 100 - if (block3 .gt. transblock) block3 = transblock - block1 = d1 - if (block1*block3 .gt. transblock*transblock) - > block1 = transblock*transblock/block3 -c--------------------------------------------------------------------- -c blocked transpose -c--------------------------------------------------------------------- - do j = 1, d2 - do kk = 0, d3-block3, block3 - do ii = 0, d1-block1, block1 - - do k = 1, block3 - k1 = k + kk - do i = 1, block1 - buf(k, i) = xin(i+ii, j, k1) - end do - end do - - do i = 1, block1 - i1 = i + ii - do k = 1, block3 - xout(k+kk, j, i1) = buf(k, i) - end do - end do - - end do - end do - end do - goto 200 - - -c--------------------------------------------------------------------- -c basic transpose -c--------------------------------------------------------------------- - 100 continue - - do j = 1, d2 - do k = 1, d3 - do i = 1, d1 - xout(k, j, i) = xin(i, j, k) - end do - end do - end do - -c--------------------------------------------------------------------- -c all done -c--------------------------------------------------------------------- - 200 continue - - if (timers_enabled) call timer_stop(T_transxzloc) - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose_x_z_global(d1, d2, d3, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - include 'mpinpb.h' - integer d1, d2, d3 - double complex xin(d3,d2,d1) - double complex xout(d3,d2,d1) ! not real layout, but right size - integer ierr - - if (timers_enabled) call synchup() - -c--------------------------------------------------------------------- -c do transpose among all processes with same 1-coord (me1) -c--------------------------------------------------------------------- - if (timers_enabled)call timer_start(T_transxzglo) - call mpi_alltoall(xin, d1*d2*d3/np2, dc_type, - > xout, d1*d2*d3/np2, dc_type, - > commslice1, ierr) - if (timers_enabled) call timer_stop(T_transxzglo) - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose_x_z_finish(d1, d2, d3, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - integer d1, d2, d3 - double complex xin(d1/np2, d2, d3, 0:np2-1) - double complex xout(d1,d2,d3) - integer i, j, k, p, ioff - if (timers_enabled) call timer_start(T_transxzfin) -c--------------------------------------------------------------------- -c this is the most straightforward way of doing it. the -c calculation in the inner loop doesn't help. -c do i = 1, d1/np2 -c do j = 1, d2 -c do k = 1, d3 -c do p = 0, np2-1 -c ii = i + p*d1/np2 -c xout(ii, j, k) = xin(i, j, k, p) -c end do -c end do -c end do -c end do -c--------------------------------------------------------------------- - - do p = 0, np2-1 - ioff = p*d1/np2 - do k = 1, d3 - do j = 1, d2 - do i = 1, d1/np2 - xout(i+ioff, j, k) = xin(i, j, k, p) - end do - end do - end do - end do - if (timers_enabled) call timer_stop(T_transxzfin) - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose_x_y(l1, l2, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - integer l1, l2 - double complex xin(ntdivnp), xout(ntdivnp) - -c--------------------------------------------------------------------- -c xy transpose is a little tricky, since we don't want -c to touch 3rd axis. But alltoall must involve 3rd axis (most -c slowly varying) to be efficient. So we do -c (nx, ny/np1, nz/np2) -> (ny/np1, nz/np2, nx) (local) -c (ny/np1, nz/np2, nx) -> ((ny/np1*nz/np2)*np1, nx/np1) (global) -c then local finish. -c--------------------------------------------------------------------- - - - call transpose_x_y_local(dims(1,l1),dims(2,l1),dims(3,l1), - > xin, xout) - call transpose_x_y_global(dims(1,l1),dims(2,l1),dims(3,l1), - > xout, xin) - call transpose_x_y_finish(dims(1,l2),dims(2,l2),dims(3,l2), - > xin, xout) - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose_x_y_local(d1, d2, d3, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - integer d1, d2, d3 - double complex xin(d1, d2, d3) - double complex xout(d2, d3, d1) - integer i, j, k - if (timers_enabled) call timer_start(T_transxyloc) - - do k = 1, d3 - do i = 1, d1 - do j = 1, d2 - xout(j,k,i)=xin(i,j,k) - end do - end do - end do - if (timers_enabled) call timer_stop(T_transxyloc) - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose_x_y_global(d1, d2, d3, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - include 'mpinpb.h' - integer d1, d2, d3 -c--------------------------------------------------------------------- -c array is in form (ny/np1, nz/np2, nx) -c--------------------------------------------------------------------- - double complex xin(d2,d3,d1) - double complex xout(d2,d3,d1) ! not real layout but right size - integer ierr - - if (timers_enabled) call synchup() - -c--------------------------------------------------------------------- -c do transpose among all processes with same 1-coord (me1) -c--------------------------------------------------------------------- - if (timers_enabled) call timer_start(T_transxyglo) - call mpi_alltoall(xin, d1*d2*d3/np1, dc_type, - > xout, d1*d2*d3/np1, dc_type, - > commslice2, ierr) - if (timers_enabled) call timer_stop(T_transxyglo) - - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine transpose_x_y_finish(d1, d2, d3, xin, xout) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - integer d1, d2, d3 - double complex xin(d1/np1, d3, d2, 0:np1-1) - double complex xout(d1,d2,d3) - integer i, j, k, p, ioff - if (timers_enabled) call timer_start(T_transxyfin) -c--------------------------------------------------------------------- -c this is the most straightforward way of doing it. the -c calculation in the inner loop doesn't help. -c do i = 1, d1/np1 -c do j = 1, d2 -c do k = 1, d3 -c do p = 0, np1-1 -c ii = i + p*d1/np1 -c note order is screwy bcz we have (ny/np1, nz/np2, nx) -> (ny, nx/np1, nz/np2) -c xout(ii, j, k) = xin(i, k, j, p) -c end do -c end do -c end do -c end do -c--------------------------------------------------------------------- - - do p = 0, np1-1 - ioff = p*d1/np1 - do k = 1, d3 - do j = 1, d2 - do i = 1, d1/np1 - xout(i+ioff, j, k) = xin(i, k, j, p) - end do - end do - end do - end do - if (timers_enabled) call timer_stop(T_transxyfin) - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine checksum(i, u1, d1, d2, d3) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - include 'mpinpb.h' - integer i, d1, d2, d3 - double complex u1(d1, d2, d3) - integer j, q,r,s, ierr - double complex chk,allchk - chk = (0.0,0.0) - - do j=1,1024 - q = mod(j, nx)+1 - if (q .ge. xstart(1) .and. q .le. xend(1)) then - r = mod(3*j,ny)+1 - if (r .ge. ystart(1) .and. r .le. yend(1)) then - s = mod(5*j,nz)+1 - if (s .ge. zstart(1) .and. s .le. zend(1)) then - chk=chk+u1(q-xstart(1)+1,r-ystart(1)+1,s-zstart(1)+1) - end if - end if - end if - end do - chk = chk/ntotal_f - - call MPI_Reduce(chk, allchk, 1, dc_type, MPI_SUM, - > 0, MPI_COMM_WORLD, ierr) - if (me .eq. 0) then - write (*, 30) i, allchk - 30 format (' T =',I5,5X,'Checksum =',1P2D22.12) - endif - -c sums(i) = allchk -c If we compute the checksum for diagnostic purposes, we let i be -c negative, so the result will not be stored in an array - if (i .gt. 0) sums(i) = allchk - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine synchup - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - include 'mpinpb.h' - integer ierr - call timer_start(T_synch) - call mpi_barrier(MPI_COMM_WORLD, ierr) - call timer_stop(T_synch) - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine verify (d1, d2, d3, nt, verified, class) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - include 'global.h' - include 'mpinpb.h' - integer d1, d2, d3, nt - character class - logical verified - integer ierr, size, i - double precision err, epsilon - -c--------------------------------------------------------------------- -c Reference checksums -c--------------------------------------------------------------------- - double complex csum_ref(25) - - - class = 'U' - - if (me .ne. 0) return - - epsilon = 1.0d-12 - verified = .FALSE. - - if (d1 .eq. 64 .and. - > d2 .eq. 64 .and. - > d3 .eq. 64 .and. - > nt .eq. 6) then -c--------------------------------------------------------------------- -c Sample size reference checksums -c--------------------------------------------------------------------- - class = 'S' - csum_ref(1) = dcmplx(5.546087004964D+02, 4.845363331978D+02) - csum_ref(2) = dcmplx(5.546385409189D+02, 4.865304269511D+02) - csum_ref(3) = dcmplx(5.546148406171D+02, 4.883910722336D+02) - csum_ref(4) = dcmplx(5.545423607415D+02, 4.901273169046D+02) - csum_ref(5) = dcmplx(5.544255039624D+02, 4.917475857993D+02) - csum_ref(6) = dcmplx(5.542683411902D+02, 4.932597244941D+02) - - else if (d1 .eq. 128 .and. - > d2 .eq. 128 .and. - > d3 .eq. 32 .and. - > nt .eq. 6) then -c--------------------------------------------------------------------- -c Class W size reference checksums -c--------------------------------------------------------------------- - class = 'W' - csum_ref(1) = dcmplx(5.673612178944D+02, 5.293246849175D+02) - csum_ref(2) = dcmplx(5.631436885271D+02, 5.282149986629D+02) - csum_ref(3) = dcmplx(5.594024089970D+02, 5.270996558037D+02) - csum_ref(4) = dcmplx(5.560698047020D+02, 5.260027904925D+02) - csum_ref(5) = dcmplx(5.530898991250D+02, 5.249400845633D+02) - csum_ref(6) = dcmplx(5.504159734538D+02, 5.239212247086D+02) - - else if (d1 .eq. 256 .and. - > d2 .eq. 256 .and. - > d3 .eq. 128 .and. - > nt .eq. 6) then -c--------------------------------------------------------------------- -c Class A size reference checksums -c--------------------------------------------------------------------- - class = 'A' - csum_ref(1) = dcmplx(5.046735008193D+02, 5.114047905510D+02) - csum_ref(2) = dcmplx(5.059412319734D+02, 5.098809666433D+02) - csum_ref(3) = dcmplx(5.069376896287D+02, 5.098144042213D+02) - csum_ref(4) = dcmplx(5.077892868474D+02, 5.101336130759D+02) - csum_ref(5) = dcmplx(5.085233095391D+02, 5.104914655194D+02) - csum_ref(6) = dcmplx(5.091487099959D+02, 5.107917842803D+02) - - else if (d1 .eq. 512 .and. - > d2 .eq. 256 .and. - > d3 .eq. 256 .and. - > nt .eq. 20) then -c--------------------------------------------------------------------- -c Class B size reference checksums -c--------------------------------------------------------------------- - class = 'B' - csum_ref(1) = dcmplx(5.177643571579D+02, 5.077803458597D+02) - csum_ref(2) = dcmplx(5.154521291263D+02, 5.088249431599D+02) - csum_ref(3) = dcmplx(5.146409228649D+02, 5.096208912659D+02) - csum_ref(4) = dcmplx(5.142378756213D+02, 5.101023387619D+02) - csum_ref(5) = dcmplx(5.139626667737D+02, 5.103976610617D+02) - csum_ref(6) = dcmplx(5.137423460082D+02, 5.105948019802D+02) - csum_ref(7) = dcmplx(5.135547056878D+02, 5.107404165783D+02) - csum_ref(8) = dcmplx(5.133910925466D+02, 5.108576573661D+02) - csum_ref(9) = dcmplx(5.132470705390D+02, 5.109577278523D+02) - csum_ref(10) = dcmplx(5.131197729984D+02, 5.110460304483D+02) - csum_ref(11) = dcmplx(5.130070319283D+02, 5.111252433800D+02) - csum_ref(12) = dcmplx(5.129070537032D+02, 5.111968077718D+02) - csum_ref(13) = dcmplx(5.128182883502D+02, 5.112616233064D+02) - csum_ref(14) = dcmplx(5.127393733383D+02, 5.113203605551D+02) - csum_ref(15) = dcmplx(5.126691062020D+02, 5.113735928093D+02) - csum_ref(16) = dcmplx(5.126064276004D+02, 5.114218460548D+02) - csum_ref(17) = dcmplx(5.125504076570D+02, 5.114656139760D+02) - csum_ref(18) = dcmplx(5.125002331720D+02, 5.115053595966D+02) - csum_ref(19) = dcmplx(5.124551951846D+02, 5.115415130407D+02) - csum_ref(20) = dcmplx(5.124146770029D+02, 5.115744692211D+02) - - else if (d1 .eq. 512 .and. - > d2 .eq. 512 .and. - > d3 .eq. 512 .and. - > nt .eq. 20) then -c--------------------------------------------------------------------- -c Class C size reference checksums -c--------------------------------------------------------------------- - class = 'C' - csum_ref(1) = dcmplx(5.195078707457D+02, 5.149019699238D+02) - csum_ref(2) = dcmplx(5.155422171134D+02, 5.127578201997D+02) - csum_ref(3) = dcmplx(5.144678022222D+02, 5.122251847514D+02) - csum_ref(4) = dcmplx(5.140150594328D+02, 5.121090289018D+02) - csum_ref(5) = dcmplx(5.137550426810D+02, 5.121143685824D+02) - csum_ref(6) = dcmplx(5.135811056728D+02, 5.121496764568D+02) - csum_ref(7) = dcmplx(5.134569343165D+02, 5.121870921893D+02) - csum_ref(8) = dcmplx(5.133651975661D+02, 5.122193250322D+02) - csum_ref(9) = dcmplx(5.132955192805D+02, 5.122454735794D+02) - csum_ref(10) = dcmplx(5.132410471738D+02, 5.122663649603D+02) - csum_ref(11) = dcmplx(5.131971141679D+02, 5.122830879827D+02) - csum_ref(12) = dcmplx(5.131605205716D+02, 5.122965869718D+02) - csum_ref(13) = dcmplx(5.131290734194D+02, 5.123075927445D+02) - csum_ref(14) = dcmplx(5.131012720314D+02, 5.123166486553D+02) - csum_ref(15) = dcmplx(5.130760908195D+02, 5.123241541685D+02) - csum_ref(16) = dcmplx(5.130528295923D+02, 5.123304037599D+02) - csum_ref(17) = dcmplx(5.130310107773D+02, 5.123356167976D+02) - csum_ref(18) = dcmplx(5.130103090133D+02, 5.123399592211D+02) - csum_ref(19) = dcmplx(5.129905029333D+02, 5.123435588985D+02) - csum_ref(20) = dcmplx(5.129714421109D+02, 5.123465164008D+02) - - else if (d1 .eq. 2048 .and. - > d2 .eq. 1024 .and. - > d3 .eq. 1024 .and. - > nt .eq. 25) then -c--------------------------------------------------------------------- -c Class D size reference checksums -c--------------------------------------------------------------------- - class = 'D' - csum_ref(1) = dcmplx(5.122230065252D+02, 5.118534037109D+02) - csum_ref(2) = dcmplx(5.120463975765D+02, 5.117061181082D+02) - csum_ref(3) = dcmplx(5.119865766760D+02, 5.117096364601D+02) - csum_ref(4) = dcmplx(5.119518799488D+02, 5.117373863950D+02) - csum_ref(5) = dcmplx(5.119269088223D+02, 5.117680347632D+02) - csum_ref(6) = dcmplx(5.119082416858D+02, 5.117967875532D+02) - csum_ref(7) = dcmplx(5.118943814638D+02, 5.118225281841D+02) - csum_ref(8) = dcmplx(5.118842385057D+02, 5.118451629348D+02) - csum_ref(9) = dcmplx(5.118769435632D+02, 5.118649119387D+02) - csum_ref(10) = dcmplx(5.118718203448D+02, 5.118820803844D+02) - csum_ref(11) = dcmplx(5.118683569061D+02, 5.118969781011D+02) - csum_ref(12) = dcmplx(5.118661708593D+02, 5.119098918835D+02) - csum_ref(13) = dcmplx(5.118649768950D+02, 5.119210777066D+02) - csum_ref(14) = dcmplx(5.118645605626D+02, 5.119307604484D+02) - csum_ref(15) = dcmplx(5.118647586618D+02, 5.119391362671D+02) - csum_ref(16) = dcmplx(5.118654451572D+02, 5.119463757241D+02) - csum_ref(17) = dcmplx(5.118665212451D+02, 5.119526269238D+02) - csum_ref(18) = dcmplx(5.118679083821D+02, 5.119580184108D+02) - csum_ref(19) = dcmplx(5.118695433664D+02, 5.119626617538D+02) - csum_ref(20) = dcmplx(5.118713748264D+02, 5.119666538138D+02) - csum_ref(21) = dcmplx(5.118733606701D+02, 5.119700787219D+02) - csum_ref(22) = dcmplx(5.118754661974D+02, 5.119730095953D+02) - csum_ref(23) = dcmplx(5.118776626738D+02, 5.119755100241D+02) - csum_ref(24) = dcmplx(5.118799262314D+02, 5.119776353561D+02) - csum_ref(25) = dcmplx(5.118822370068D+02, 5.119794338060D+02) - - else if (d1 .eq. 4096 .and. - > d2 .eq. 2048 .and. - > d3 .eq. 2048 .and. - > nt .eq. 25) then -c--------------------------------------------------------------------- -c Class E size reference checksums -c--------------------------------------------------------------------- - class = 'E' - csum_ref(1) = dcmplx(5.121601045346D+02, 5.117395998266D+02) - csum_ref(2) = dcmplx(5.120905403678D+02, 5.118614716182D+02) - csum_ref(3) = dcmplx(5.120623229306D+02, 5.119074203747D+02) - csum_ref(4) = dcmplx(5.120438418997D+02, 5.119345900733D+02) - csum_ref(5) = dcmplx(5.120311521872D+02, 5.119551325550D+02) - csum_ref(6) = dcmplx(5.120226088809D+02, 5.119720179919D+02) - csum_ref(7) = dcmplx(5.120169296534D+02, 5.119861371665D+02) - csum_ref(8) = dcmplx(5.120131225172D+02, 5.119979364402D+02) - csum_ref(9) = dcmplx(5.120104767108D+02, 5.120077674092D+02) - csum_ref(10) = dcmplx(5.120085127969D+02, 5.120159443121D+02) - csum_ref(11) = dcmplx(5.120069224127D+02, 5.120227453670D+02) - csum_ref(12) = dcmplx(5.120055158164D+02, 5.120284096041D+02) - csum_ref(13) = dcmplx(5.120041820159D+02, 5.120331373793D+02) - csum_ref(14) = dcmplx(5.120028605402D+02, 5.120370938679D+02) - csum_ref(15) = dcmplx(5.120015223011D+02, 5.120404138831D+02) - csum_ref(16) = dcmplx(5.120001570022D+02, 5.120432068837D+02) - csum_ref(17) = dcmplx(5.119987650555D+02, 5.120455615860D+02) - csum_ref(18) = dcmplx(5.119973525091D+02, 5.120475499442D+02) - csum_ref(19) = dcmplx(5.119959279472D+02, 5.120492304629D+02) - csum_ref(20) = dcmplx(5.119945006558D+02, 5.120506508902D+02) - csum_ref(21) = dcmplx(5.119930795911D+02, 5.120518503782D+02) - csum_ref(22) = dcmplx(5.119916728462D+02, 5.120528612016D+02) - csum_ref(23) = dcmplx(5.119902874185D+02, 5.120537101195D+02) - csum_ref(24) = dcmplx(5.119889291565D+02, 5.120544194514D+02) - csum_ref(25) = dcmplx(5.119876028049D+02, 5.120550079284D+02) - - endif - - - if (class .ne. 'U') then - - do i = 1, nt - err = abs( (sums(i) - csum_ref(i)) / csum_ref(i) ) - if (.not.(err .le. epsilon)) goto 100 - end do - verified = .TRUE. - 100 continue - - endif - - call MPI_COMM_SIZE(MPI_COMM_WORLD, size, ierr) - if (size .ne. np) then - write(*, 4010) np - write(*, 4011) - write(*, 4012) -c--------------------------------------------------------------------- -c multiple statements because some Fortran compilers have -c problems with long strings. -c--------------------------------------------------------------------- - 4010 format( ' Warning: benchmark was compiled for ', i5, - > 'processors') - 4011 format( ' Must be run on this many processors for official', - > ' verification') - 4012 format( ' so memory access is repeatable') - verified = .false. - endif - - if (class .ne. 'U') then - if (verified) then - write(*,2000) - 2000 format(' Result verification successful') - else - write(*,2001) - 2001 format(' Result verification failed') - endif - endif - print *, 'class = ', class - - return - end - - diff --git a/examples/smpi/NAS/FT/global.h b/examples/smpi/NAS/FT/global.h deleted file mode 100644 index 3e534bb48d..0000000000 --- a/examples/smpi/NAS/FT/global.h +++ /dev/null @@ -1,134 +0,0 @@ - include 'npbparams.h' - -c 2D processor array -> 2D grid decomposition (by pencils) -c If processor array is 1xN or -> 1D grid decomposition (by planes) -c If processor array is 1x1 -> 0D grid decomposition -c For simplicity, do not treat Nx1 (np2 = 1) specially - integer np1, np2, np - -c basic decomposition strategy - integer layout_type - integer layout_0D, layout_1D, layout_2D - parameter (layout_0D = 0, layout_1D = 1, layout_2D = 2) - - common /procgrid/ np1, np2, layout_type, np - - -c Cache blocking params. These values are good for most -c RISC processors. -c FFT parameters: -c fftblock controls how many ffts are done at a time. -c The default is appropriate for most cache-based machines -c On vector machines, the FFT can be vectorized with vector -c length equal to the block size, so the block size should -c be as large as possible. This is the size of the smallest -c dimension of the problem: 128 for class A, 256 for class B and -c 512 for class C. -c Transpose parameters: -c transblock is the blocking factor for the transposes when there -c is a 1-D layout. On vector machines it should probably be -c large (largest dimension of the problem). - - - integer fftblock_default, fftblockpad_default - parameter (fftblock_default=16, fftblockpad_default=18) - integer transblock, transblockpad - parameter(transblock=32, transblockpad=34) - - integer fftblock, fftblockpad - common /blockinfo/ fftblock, fftblockpad - -c we need a bunch of logic to keep track of how -c arrays are laid out. -c coords of this processor - integer me, me1, me2 - common /coords/ me, me1, me2 -c need a communicator for row/col in processor grid - integer commslice1, commslice2 - common /comms/ commslice1, commslice2 - - - -c There are basically three stages -c 1: x-y-z layout -c 2: after x-transform (before y) -c 3: after y-transform (before z) -c The computation proceeds logically as - -c set up initial conditions -c fftx(1) -c transpose (1->2) -c ffty(2) -c transpose (2->3) -c fftz(3) -c time evolution -c fftz(3) -c transpose (3->2) -c ffty(2) -c transpose (2->1) -c fftx(1) -c compute residual(1) - -c for the 0D, 1D, 2D strategies, the layouts look like xxx -c -c 0D 1D 2D -c 1: xyz xyz xyz -c 2: xyz xyz yxz -c 3: xyz zyx zxy - -c the array dimensions are stored in dims(coord, phase) - integer dims(3, 3) - integer xstart(3), ystart(3), zstart(3) - integer xend(3), yend(3), zend(3) - common /layout/ dims, - > xstart, ystart, zstart, - > xend, yend, zend - - integer T_total, T_setup, T_fft, T_evolve, T_checksum, - > T_fftlow, T_fftcopy, T_transpose, - > T_transxzloc, T_transxzglo, T_transxzfin, - > T_transxyloc, T_transxyglo, T_transxyfin, - > T_synch, T_max - parameter (T_total = 1, T_setup = 2, T_fft = 3, - > T_evolve = 4, T_checksum = 5, - > T_fftlow = 6, T_fftcopy = 7, T_transpose = 8, - > T_transxzloc = 9, T_transxzglo = 10, T_transxzfin = 11, - > T_transxyloc = 12, T_transxyglo = 13, - > T_transxyfin = 14, T_synch = 15, T_max = 15) - - - - logical timers_enabled - parameter (timers_enabled = .false.) - - - external timer_read - double precision timer_read - external ilog2 - integer ilog2 - - external randlc - double precision randlc - - -c other stuff - logical debug, debugsynch - common /dbg/ debug, debugsynch - - double precision seed, a, pi, alpha - parameter (seed = 314159265.d0, a = 1220703125.d0, - > pi = 3.141592653589793238d0, alpha=1.0d-6) - -c roots of unity array -c relies on x being largest dimension? - double complex u(nx) - common /ucomm/ u - - -c for checksum data - double complex sums(0:niter_default) - common /sumcomm/ sums - -c number of iterations - integer niter - common /iter/ niter diff --git a/examples/smpi/NAS/FT/inputft.data.sample b/examples/smpi/NAS/FT/inputft.data.sample deleted file mode 100644 index 448ac42bc0..0000000000 --- a/examples/smpi/NAS/FT/inputft.data.sample +++ /dev/null @@ -1,3 +0,0 @@ -6 ! number of iterations -2 ! layout type. 0 = 0d, 1 = 1d, 2 = 2d -2 4 ! processor layout. 0d must be "1 1"; 1d must be "1 N" diff --git a/examples/smpi/NAS/FT/mpinpb.h b/examples/smpi/NAS/FT/mpinpb.h deleted file mode 100644 index e43e552a84..0000000000 --- a/examples/smpi/NAS/FT/mpinpb.h +++ /dev/null @@ -1,4 +0,0 @@ - include 'mpif.h' -c mpi data types - integer dc_type - common /mpistuff/ dc_type diff --git a/examples/smpi/NAS/LU/Makefile b/examples/smpi/NAS/LU/Makefile deleted file mode 100644 index a05c94dc4a..0000000000 --- a/examples/smpi/NAS/LU/Makefile +++ /dev/null @@ -1,74 +0,0 @@ -SHELL=/bin/sh -BENCHMARK=lu -BENCHMARKU=LU -VEC= - -include ../config/make.def - -OBJS = lu.o init_comm.o read_input.o bcast_inputs.o proc_grid.o neighbors.o \ - nodedim.o subdomain.o setcoeff.o sethyper.o setbv.o exact.o setiv.o \ - erhs.o ssor.o exchange_1.o exchange_3.o exchange_4.o exchange_5.o \ - exchange_6.o rhs.o l2norm.o jacld.o blts$(VEC).o jacu.o buts$(VEC).o \ - error.o pintgr.o verify.o ${COMMON}/print_results.o ${COMMON}/timers.o - -include ../sys/make.common - - -# npbparams.h is included by applu.incl -# The following rule should do the trick but many make programs (not gmake) -# will do the wrong thing and rebuild the world every time (because the -# mod time on header.h is not changed. One solution would be to -# touch header.h but this might cause confusion if someone has -# accidentally deleted it. Instead, make the dependency on npbparams.h -# explicit in all the lines below (even though dependence is indirect). - -# applu.incl: npbparams.h - -${PROGRAM}: config - @if [ x$(VERSION) = xvec ] ; then \ - ${MAKE} VEC=_vec exec; \ - elif [ x$(VERSION) = xVEC ] ; then \ - ${MAKE} VEC=_vec exec; \ - else \ - ${MAKE} exec; \ - fi - -exec: $(OBJS) - ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} - -.f.o : - ${FCOMPILE} $< - -lu.o: lu.f applu.incl npbparams.h -bcast_inputs.o: bcast_inputs.f applu.incl npbparams.h mpinpb.h -blts$(VEC).o: blts$(VEC).f -buts$(VEC).o: buts$(VEC).f -erhs.o: erhs.f applu.incl npbparams.h -error.o: error.f applu.incl npbparams.h mpinpb.h -exact.o: exact.f applu.incl npbparams.h -exchange_1.o: exchange_1.f applu.incl npbparams.h mpinpb.h -exchange_3.o: exchange_3.f applu.incl npbparams.h mpinpb.h -exchange_4.o: exchange_4.f applu.incl npbparams.h mpinpb.h -exchange_5.o: exchange_5.f applu.incl npbparams.h mpinpb.h -exchange_6.o: exchange_6.f applu.incl npbparams.h mpinpb.h -init_comm.o: init_comm.f applu.incl npbparams.h mpinpb.h -jacld.o: jacld.f applu.incl npbparams.h -jacu.o: jacu.f applu.incl npbparams.h -l2norm.o: l2norm.f mpinpb.h -neighbors.o: neighbors.f applu.incl npbparams.h -nodedim.o: nodedim.f -pintgr.o: pintgr.f applu.incl npbparams.h mpinpb.h -proc_grid.o: proc_grid.f applu.incl npbparams.h -read_input.o: read_input.f applu.incl npbparams.h mpinpb.h -rhs.o: rhs.f applu.incl npbparams.h -setbv.o: setbv.f applu.incl npbparams.h -setiv.o: setiv.f applu.incl npbparams.h -setcoeff.o: setcoeff.f applu.incl npbparams.h -sethyper.o: sethyper.f applu.incl npbparams.h -ssor.o: ssor.f applu.incl npbparams.h mpinpb.h -subdomain.o: subdomain.f applu.incl npbparams.h mpinpb.h -verify.o: verify.f applu.incl npbparams.h - -clean: - - /bin/rm -f npbparams.h - - /bin/rm -f *.o *~ diff --git a/examples/smpi/NAS/LU/applu.incl b/examples/smpi/NAS/LU/applu.incl deleted file mode 100644 index 413fc834e3..0000000000 --- a/examples/smpi/NAS/LU/applu.incl +++ /dev/null @@ -1,153 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- -c--- applu.incl -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c npbparams.h defines parameters that depend on the class and -c number of nodes -c--------------------------------------------------------------------- - - include 'npbparams.h' - -c--------------------------------------------------------------------- -c parameters which can be overridden in runtime config file -c (in addition to size of problem - isiz01,02,03 give the maximum size) -c ipr = 1 to print out verbose information -c omega = 2.0 is correct for all classes -c tolrsd is tolerance levels for steady state residuals -c--------------------------------------------------------------------- - integer ipr_default - parameter (ipr_default = 1) - double precision omega_default - parameter (omega_default = 1.2d0) - double precision tolrsd1_def, tolrsd2_def, tolrsd3_def, - > tolrsd4_def, tolrsd5_def - parameter (tolrsd1_def=1.0e-08, - > tolrsd2_def=1.0e-08, tolrsd3_def=1.0e-08, - > tolrsd4_def=1.0e-08, tolrsd5_def=1.0e-08) - - double precision c1, c2, c3, c4, c5 - parameter( c1 = 1.40d+00, c2 = 0.40d+00, - > c3 = 1.00d-01, c4 = 1.00d+00, - > c5 = 1.40d+00 ) - -c--------------------------------------------------------------------- -c grid -c--------------------------------------------------------------------- - integer nx, ny, nz - integer nx0, ny0, nz0 - integer ipt, ist, iend - integer jpt, jst, jend - integer ii1, ii2 - integer ji1, ji2 - integer ki1, ki2 - double precision dxi, deta, dzeta - double precision tx1, tx2, tx3 - double precision ty1, ty2, ty3 - double precision tz1, tz2, tz3 - - common/cgcon/ dxi, deta, dzeta, - > tx1, tx2, tx3, - > ty1, ty2, ty3, - > tz1, tz2, tz3, - > nx, ny, nz, - > nx0, ny0, nz0, - > ipt, ist, iend, - > jpt, jst, jend, - > ii1, ii2, - > ji1, ji2, - > ki1, ki2 - -c--------------------------------------------------------------------- -c dissipation -c--------------------------------------------------------------------- - double precision dx1, dx2, dx3, dx4, dx5 - double precision dy1, dy2, dy3, dy4, dy5 - double precision dz1, dz2, dz3, dz4, dz5 - double precision dssp - - common/disp/ dx1,dx2,dx3,dx4,dx5, - > dy1,dy2,dy3,dy4,dy5, - > dz1,dz2,dz3,dz4,dz5, - > dssp - -c--------------------------------------------------------------------- -c field variables and residuals -c--------------------------------------------------------------------- - double precision u(5,-1:isiz1+2,-1:isiz2+2,isiz3), - > rsd(5,-1:isiz1+2,-1:isiz2+2,isiz3), - > frct(5,-1:isiz1+2,-1:isiz2+2,isiz3), - > flux(5,0:isiz1+1,0:isiz2+1,isiz3) - - common/cvar/ u, - > rsd, - > frct, - > flux - - -c--------------------------------------------------------------------- -c output control parameters -c--------------------------------------------------------------------- - integer ipr, inorm - - common/cprcon/ ipr, inorm - -c--------------------------------------------------------------------- -c newton-raphson iteration control parameters -c--------------------------------------------------------------------- - integer itmax, invert - double precision dt, omega, tolrsd(5), - > rsdnm(5), errnm(5), frc, ttotal - - common/ctscon/ dt, omega, tolrsd, - > rsdnm, errnm, frc, ttotal, - > itmax, invert - - double precision a(5,5,isiz1,isiz2), - > b(5,5,isiz1,isiz2), - > c(5,5,isiz1,isiz2), - > d(5,5,isiz1,isiz2) - - common/cjac/ a, b, c, d - -c--------------------------------------------------------------------- -c coefficients of the exact solution -c--------------------------------------------------------------------- - double precision ce(5,13) - - common/cexact/ ce - -c--------------------------------------------------------------------- -c multi-processor common blocks -c--------------------------------------------------------------------- - integer id, ndim, num, xdim, ydim, row, col - common/dim/ id,ndim,num,xdim,ydim,row,col - - integer north,south,east,west - common/neigh/ north,south,east, west - - integer from_s,from_n,from_e,from_w - parameter (from_s=1,from_n=2,from_e=3,from_w=4) - - integer npmax - parameter (npmax=isiz01+isiz02) - - logical icommn(npmax+1),icomms(npmax+1), - > icomme(npmax+1),icommw(npmax+1) - double precision buf(5,2*isiz2*isiz3), - > buf1(5,2*isiz2*isiz3) - - common/comm/ buf, buf1, - > icommn,icomms, - > icomme,icommw - - double precision maxtime - common/timer/maxtime - - -c--------------------------------------------------------------------- -c end of include file -c--------------------------------------------------------------------- diff --git a/examples/smpi/NAS/LU/bcast_inputs.f b/examples/smpi/NAS/LU/bcast_inputs.f deleted file mode 100644 index c606724bec..0000000000 --- a/examples/smpi/NAS/LU/bcast_inputs.f +++ /dev/null @@ -1,41 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine bcast_inputs - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer ierr - -c--------------------------------------------------------------------- -c root broadcasts the data -c The data isn't contiguous or of the same type, so it's not -c clear how to send it in the "MPI" way. -c We could pack the info into a buffer or we could create -c an obscene datatype to handle it all at once. Since we only -c broadcast the data once, just use a separate broadcast for -c each piece. -c--------------------------------------------------------------------- - call MPI_BCAST(ipr, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) - call MPI_BCAST(inorm, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) - call MPI_BCAST(itmax, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) - call MPI_BCAST(dt, 1, dp_type, root, MPI_COMM_WORLD, ierr) - call MPI_BCAST(omega, 1, dp_type, root, MPI_COMM_WORLD, ierr) - call MPI_BCAST(tolrsd, 5, dp_type, root, MPI_COMM_WORLD, ierr) - call MPI_BCAST(nx0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) - call MPI_BCAST(ny0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) - call MPI_BCAST(nz0, 1, MPI_INTEGER, root, MPI_COMM_WORLD, ierr) - - return - end - - diff --git a/examples/smpi/NAS/LU/blts.f b/examples/smpi/NAS/LU/blts.f deleted file mode 100644 index 9861261b03..0000000000 --- a/examples/smpi/NAS/LU/blts.f +++ /dev/null @@ -1,261 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine blts ( ldmx, ldmy, ldmz, - > nx, ny, nz, k, - > omega, - > v, - > ldz, ldy, ldx, d, - > ist, iend, jst, jend, - > nx0, ny0, ipt, jpt) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c compute the regular-sparse, block lower triangular solution: -c -c v <-- ( L-inv ) * v -c -c--------------------------------------------------------------------- - - implicit none - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - integer ldmx, ldmy, ldmz - integer nx, ny, nz - integer k - double precision omega - double precision v( 5, -1:ldmx+2, -1:ldmy+2, *), - > ldz( 5, 5, ldmx, ldmy), - > ldy( 5, 5, ldmx, ldmy), - > ldx( 5, 5, ldmx, ldmy), - > d( 5, 5, ldmx, ldmy) - integer ist, iend - integer jst, jend - integer nx0, ny0 - integer ipt, jpt - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, m - integer iex - double precision tmp, tmp1 - double precision tmat(5,5) - - -c--------------------------------------------------------------------- -c receive data from north and west -c--------------------------------------------------------------------- - iex = 0 - call exchange_1( v,k,iex ) - - - do j = jst, jend - do i = ist, iend - do m = 1, 5 - - v( m, i, j, k ) = v( m, i, j, k ) - > - omega * ( ldz( m, 1, i, j ) * v( 1, i, j, k-1 ) - > + ldz( m, 2, i, j ) * v( 2, i, j, k-1 ) - > + ldz( m, 3, i, j ) * v( 3, i, j, k-1 ) - > + ldz( m, 4, i, j ) * v( 4, i, j, k-1 ) - > + ldz( m, 5, i, j ) * v( 5, i, j, k-1 ) ) - - end do - end do - end do - - - do j=jst,jend - do i = ist, iend - - do m = 1, 5 - - v( m, i, j, k ) = v( m, i, j, k ) - > - omega * ( ldy( m, 1, i, j ) * v( 1, i, j-1, k ) - > + ldx( m, 1, i, j ) * v( 1, i-1, j, k ) - > + ldy( m, 2, i, j ) * v( 2, i, j-1, k ) - > + ldx( m, 2, i, j ) * v( 2, i-1, j, k ) - > + ldy( m, 3, i, j ) * v( 3, i, j-1, k ) - > + ldx( m, 3, i, j ) * v( 3, i-1, j, k ) - > + ldy( m, 4, i, j ) * v( 4, i, j-1, k ) - > + ldx( m, 4, i, j ) * v( 4, i-1, j, k ) - > + ldy( m, 5, i, j ) * v( 5, i, j-1, k ) - > + ldx( m, 5, i, j ) * v( 5, i-1, j, k ) ) - - end do - -c--------------------------------------------------------------------- -c diagonal block inversion -c -c forward elimination -c--------------------------------------------------------------------- - do m = 1, 5 - tmat( m, 1 ) = d( m, 1, i, j ) - tmat( m, 2 ) = d( m, 2, i, j ) - tmat( m, 3 ) = d( m, 3, i, j ) - tmat( m, 4 ) = d( m, 4, i, j ) - tmat( m, 5 ) = d( m, 5, i, j ) - end do - - tmp1 = 1.0d+00 / tmat( 1, 1 ) - tmp = tmp1 * tmat( 2, 1 ) - tmat( 2, 2 ) = tmat( 2, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 2, 3 ) = tmat( 2, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 2, 4 ) = tmat( 2, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 2, 5 ) = tmat( 2, 5 ) - > - tmp * tmat( 1, 5 ) - v( 2, i, j, k ) = v( 2, i, j, k ) - > - v( 1, i, j, k ) * tmp - - tmp = tmp1 * tmat( 3, 1 ) - tmat( 3, 2 ) = tmat( 3, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 3, 3 ) = tmat( 3, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 3, 4 ) = tmat( 3, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 3, 5 ) = tmat( 3, 5 ) - > - tmp * tmat( 1, 5 ) - v( 3, i, j, k ) = v( 3, i, j, k ) - > - v( 1, i, j, k ) * tmp - - tmp = tmp1 * tmat( 4, 1 ) - tmat( 4, 2 ) = tmat( 4, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 4, 3 ) = tmat( 4, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 1, 5 ) - v( 4, i, j, k ) = v( 4, i, j, k ) - > - v( 1, i, j, k ) * tmp - - tmp = tmp1 * tmat( 5, 1 ) - tmat( 5, 2 ) = tmat( 5, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 5, 3 ) = tmat( 5, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 1, 5 ) - v( 5, i, j, k ) = v( 5, i, j, k ) - > - v( 1, i, j, k ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 2, 2 ) - tmp = tmp1 * tmat( 3, 2 ) - tmat( 3, 3 ) = tmat( 3, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 3, 4 ) = tmat( 3, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 3, 5 ) = tmat( 3, 5 ) - > - tmp * tmat( 2, 5 ) - v( 3, i, j, k ) = v( 3, i, j, k ) - > - v( 2, i, j, k ) * tmp - - tmp = tmp1 * tmat( 4, 2 ) - tmat( 4, 3 ) = tmat( 4, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 2, 5 ) - v( 4, i, j, k ) = v( 4, i, j, k ) - > - v( 2, i, j, k ) * tmp - - tmp = tmp1 * tmat( 5, 2 ) - tmat( 5, 3 ) = tmat( 5, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 2, 5 ) - v( 5, i, j, k ) = v( 5, i, j, k ) - > - v( 2, i, j, k ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 3, 3 ) - tmp = tmp1 * tmat( 4, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 3, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 3, 5 ) - v( 4, i, j, k ) = v( 4, i, j, k ) - > - v( 3, i, j, k ) * tmp - - tmp = tmp1 * tmat( 5, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 3, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 3, 5 ) - v( 5, i, j, k ) = v( 5, i, j, k ) - > - v( 3, i, j, k ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 4, 4 ) - tmp = tmp1 * tmat( 5, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 4, 5 ) - v( 5, i, j, k ) = v( 5, i, j, k ) - > - v( 4, i, j, k ) * tmp - -c--------------------------------------------------------------------- -c back substitution -c--------------------------------------------------------------------- - v( 5, i, j, k ) = v( 5, i, j, k ) - > / tmat( 5, 5 ) - - v( 4, i, j, k ) = v( 4, i, j, k ) - > - tmat( 4, 5 ) * v( 5, i, j, k ) - v( 4, i, j, k ) = v( 4, i, j, k ) - > / tmat( 4, 4 ) - - v( 3, i, j, k ) = v( 3, i, j, k ) - > - tmat( 3, 4 ) * v( 4, i, j, k ) - > - tmat( 3, 5 ) * v( 5, i, j, k ) - v( 3, i, j, k ) = v( 3, i, j, k ) - > / tmat( 3, 3 ) - - v( 2, i, j, k ) = v( 2, i, j, k ) - > - tmat( 2, 3 ) * v( 3, i, j, k ) - > - tmat( 2, 4 ) * v( 4, i, j, k ) - > - tmat( 2, 5 ) * v( 5, i, j, k ) - v( 2, i, j, k ) = v( 2, i, j, k ) - > / tmat( 2, 2 ) - - v( 1, i, j, k ) = v( 1, i, j, k ) - > - tmat( 1, 2 ) * v( 2, i, j, k ) - > - tmat( 1, 3 ) * v( 3, i, j, k ) - > - tmat( 1, 4 ) * v( 4, i, j, k ) - > - tmat( 1, 5 ) * v( 5, i, j, k ) - v( 1, i, j, k ) = v( 1, i, j, k ) - > / tmat( 1, 1 ) - - - enddo - enddo - -c--------------------------------------------------------------------- -c send data to east and south -c--------------------------------------------------------------------- - iex = 2 - call exchange_1( v,k,iex ) - - return - end - - diff --git a/examples/smpi/NAS/LU/blts_vec.f b/examples/smpi/NAS/LU/blts_vec.f deleted file mode 100644 index f90ea84560..0000000000 --- a/examples/smpi/NAS/LU/blts_vec.f +++ /dev/null @@ -1,334 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine blts ( ldmx, ldmy, ldmz, - > nx, ny, nz, k, - > omega, - > v, - > ldz, ldy, ldx, d, - > ist, iend, jst, jend, - > nx0, ny0, ipt, jpt) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c compute the regular-sparse, block lower triangular solution: -c -c v <-- ( L-inv ) * v -c -c--------------------------------------------------------------------- - - implicit none - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - integer ldmx, ldmy, ldmz - integer nx, ny, nz - integer k - double precision omega - double precision v( 5, -1:ldmx+2, -1:ldmy+2, *), - > ldz( 5, 5, ldmx, ldmy), - > ldy( 5, 5, ldmx, ldmy), - > ldx( 5, 5, ldmx, ldmy), - > d( 5, 5, ldmx, ldmy) - integer ist, iend - integer jst, jend - integer nx0, ny0 - integer ipt, jpt - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, m, l, istp, iendp - integer iex - double precision tmp, tmp1 - double precision tmat(5,5) - - -c--------------------------------------------------------------------- -c receive data from north and west -c--------------------------------------------------------------------- - iex = 0 - call exchange_1( v,k,iex ) - - - do j = jst, jend - do i = ist, iend - do m = 1, 5 - - v( m, i, j, k ) = v( m, i, j, k ) - > - omega * ( ldz( m, 1, i, j ) * v( 1, i, j, k-1 ) - > + ldz( m, 2, i, j ) * v( 2, i, j, k-1 ) - > + ldz( m, 3, i, j ) * v( 3, i, j, k-1 ) - > + ldz( m, 4, i, j ) * v( 4, i, j, k-1 ) - > + ldz( m, 5, i, j ) * v( 5, i, j, k-1 ) ) - - end do - end do - end do - - - do l = ist+jst, iend+jend - istp = max(l - jend, ist) - iendp = min(l - jst, iend) - -!dir$ ivdep - do i = istp, iendp - j = l - i - -!!dir$ unroll 5 -! manually unroll the loop -! do m = 1, 5 - - v( 1, i, j, k ) = v( 1, i, j, k ) - > - omega * ( ldy( 1, 1, i, j ) * v( 1, i, j-1, k ) - > + ldx( 1, 1, i, j ) * v( 1, i-1, j, k ) - > + ldy( 1, 2, i, j ) * v( 2, i, j-1, k ) - > + ldx( 1, 2, i, j ) * v( 2, i-1, j, k ) - > + ldy( 1, 3, i, j ) * v( 3, i, j-1, k ) - > + ldx( 1, 3, i, j ) * v( 3, i-1, j, k ) - > + ldy( 1, 4, i, j ) * v( 4, i, j-1, k ) - > + ldx( 1, 4, i, j ) * v( 4, i-1, j, k ) - > + ldy( 1, 5, i, j ) * v( 5, i, j-1, k ) - > + ldx( 1, 5, i, j ) * v( 5, i-1, j, k ) ) - v( 2, i, j, k ) = v( 2, i, j, k ) - > - omega * ( ldy( 2, 1, i, j ) * v( 1, i, j-1, k ) - > + ldx( 2, 1, i, j ) * v( 1, i-1, j, k ) - > + ldy( 2, 2, i, j ) * v( 2, i, j-1, k ) - > + ldx( 2, 2, i, j ) * v( 2, i-1, j, k ) - > + ldy( 2, 3, i, j ) * v( 3, i, j-1, k ) - > + ldx( 2, 3, i, j ) * v( 3, i-1, j, k ) - > + ldy( 2, 4, i, j ) * v( 4, i, j-1, k ) - > + ldx( 2, 4, i, j ) * v( 4, i-1, j, k ) - > + ldy( 2, 5, i, j ) * v( 5, i, j-1, k ) - > + ldx( 2, 5, i, j ) * v( 5, i-1, j, k ) ) - v( 3, i, j, k ) = v( 3, i, j, k ) - > - omega * ( ldy( 3, 1, i, j ) * v( 1, i, j-1, k ) - > + ldx( 3, 1, i, j ) * v( 1, i-1, j, k ) - > + ldy( 3, 2, i, j ) * v( 2, i, j-1, k ) - > + ldx( 3, 2, i, j ) * v( 2, i-1, j, k ) - > + ldy( 3, 3, i, j ) * v( 3, i, j-1, k ) - > + ldx( 3, 3, i, j ) * v( 3, i-1, j, k ) - > + ldy( 3, 4, i, j ) * v( 4, i, j-1, k ) - > + ldx( 3, 4, i, j ) * v( 4, i-1, j, k ) - > + ldy( 3, 5, i, j ) * v( 5, i, j-1, k ) - > + ldx( 3, 5, i, j ) * v( 5, i-1, j, k ) ) - v( 4, i, j, k ) = v( 4, i, j, k ) - > - omega * ( ldy( 4, 1, i, j ) * v( 1, i, j-1, k ) - > + ldx( 4, 1, i, j ) * v( 1, i-1, j, k ) - > + ldy( 4, 2, i, j ) * v( 2, i, j-1, k ) - > + ldx( 4, 2, i, j ) * v( 2, i-1, j, k ) - > + ldy( 4, 3, i, j ) * v( 3, i, j-1, k ) - > + ldx( 4, 3, i, j ) * v( 3, i-1, j, k ) - > + ldy( 4, 4, i, j ) * v( 4, i, j-1, k ) - > + ldx( 4, 4, i, j ) * v( 4, i-1, j, k ) - > + ldy( 4, 5, i, j ) * v( 5, i, j-1, k ) - > + ldx( 4, 5, i, j ) * v( 5, i-1, j, k ) ) - v( 5, i, j, k ) = v( 5, i, j, k ) - > - omega * ( ldy( 5, 1, i, j ) * v( 1, i, j-1, k ) - > + ldx( 5, 1, i, j ) * v( 1, i-1, j, k ) - > + ldy( 5, 2, i, j ) * v( 2, i, j-1, k ) - > + ldx( 5, 2, i, j ) * v( 2, i-1, j, k ) - > + ldy( 5, 3, i, j ) * v( 3, i, j-1, k ) - > + ldx( 5, 3, i, j ) * v( 3, i-1, j, k ) - > + ldy( 5, 4, i, j ) * v( 4, i, j-1, k ) - > + ldx( 5, 4, i, j ) * v( 4, i-1, j, k ) - > + ldy( 5, 5, i, j ) * v( 5, i, j-1, k ) - > + ldx( 5, 5, i, j ) * v( 5, i-1, j, k ) ) - -! end do - -c--------------------------------------------------------------------- -c diagonal block inversion -c -c forward elimination -c--------------------------------------------------------------------- -!!dir$ unroll 5 -! manually unroll the loop -! do m = 1, 5 - tmat( 1, 1 ) = d( 1, 1, i, j ) - tmat( 1, 2 ) = d( 1, 2, i, j ) - tmat( 1, 3 ) = d( 1, 3, i, j ) - tmat( 1, 4 ) = d( 1, 4, i, j ) - tmat( 1, 5 ) = d( 1, 5, i, j ) - tmat( 2, 1 ) = d( 2, 1, i, j ) - tmat( 2, 2 ) = d( 2, 2, i, j ) - tmat( 2, 3 ) = d( 2, 3, i, j ) - tmat( 2, 4 ) = d( 2, 4, i, j ) - tmat( 2, 5 ) = d( 2, 5, i, j ) - tmat( 3, 1 ) = d( 3, 1, i, j ) - tmat( 3, 2 ) = d( 3, 2, i, j ) - tmat( 3, 3 ) = d( 3, 3, i, j ) - tmat( 3, 4 ) = d( 3, 4, i, j ) - tmat( 3, 5 ) = d( 3, 5, i, j ) - tmat( 4, 1 ) = d( 4, 1, i, j ) - tmat( 4, 2 ) = d( 4, 2, i, j ) - tmat( 4, 3 ) = d( 4, 3, i, j ) - tmat( 4, 4 ) = d( 4, 4, i, j ) - tmat( 4, 5 ) = d( 4, 5, i, j ) - tmat( 5, 1 ) = d( 5, 1, i, j ) - tmat( 5, 2 ) = d( 5, 2, i, j ) - tmat( 5, 3 ) = d( 5, 3, i, j ) - tmat( 5, 4 ) = d( 5, 4, i, j ) - tmat( 5, 5 ) = d( 5, 5, i, j ) -! end do - - tmp1 = 1.0d+00 / tmat( 1, 1 ) - tmp = tmp1 * tmat( 2, 1 ) - tmat( 2, 2 ) = tmat( 2, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 2, 3 ) = tmat( 2, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 2, 4 ) = tmat( 2, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 2, 5 ) = tmat( 2, 5 ) - > - tmp * tmat( 1, 5 ) - v( 2, i, j, k ) = v( 2, i, j, k ) - > - v( 1, i, j, k ) * tmp - - tmp = tmp1 * tmat( 3, 1 ) - tmat( 3, 2 ) = tmat( 3, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 3, 3 ) = tmat( 3, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 3, 4 ) = tmat( 3, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 3, 5 ) = tmat( 3, 5 ) - > - tmp * tmat( 1, 5 ) - v( 3, i, j, k ) = v( 3, i, j, k ) - > - v( 1, i, j, k ) * tmp - - tmp = tmp1 * tmat( 4, 1 ) - tmat( 4, 2 ) = tmat( 4, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 4, 3 ) = tmat( 4, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 1, 5 ) - v( 4, i, j, k ) = v( 4, i, j, k ) - > - v( 1, i, j, k ) * tmp - - tmp = tmp1 * tmat( 5, 1 ) - tmat( 5, 2 ) = tmat( 5, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 5, 3 ) = tmat( 5, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 1, 5 ) - v( 5, i, j, k ) = v( 5, i, j, k ) - > - v( 1, i, j, k ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 2, 2 ) - tmp = tmp1 * tmat( 3, 2 ) - tmat( 3, 3 ) = tmat( 3, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 3, 4 ) = tmat( 3, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 3, 5 ) = tmat( 3, 5 ) - > - tmp * tmat( 2, 5 ) - v( 3, i, j, k ) = v( 3, i, j, k ) - > - v( 2, i, j, k ) * tmp - - tmp = tmp1 * tmat( 4, 2 ) - tmat( 4, 3 ) = tmat( 4, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 2, 5 ) - v( 4, i, j, k ) = v( 4, i, j, k ) - > - v( 2, i, j, k ) * tmp - - tmp = tmp1 * tmat( 5, 2 ) - tmat( 5, 3 ) = tmat( 5, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 2, 5 ) - v( 5, i, j, k ) = v( 5, i, j, k ) - > - v( 2, i, j, k ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 3, 3 ) - tmp = tmp1 * tmat( 4, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 3, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 3, 5 ) - v( 4, i, j, k ) = v( 4, i, j, k ) - > - v( 3, i, j, k ) * tmp - - tmp = tmp1 * tmat( 5, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 3, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 3, 5 ) - v( 5, i, j, k ) = v( 5, i, j, k ) - > - v( 3, i, j, k ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 4, 4 ) - tmp = tmp1 * tmat( 5, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 4, 5 ) - v( 5, i, j, k ) = v( 5, i, j, k ) - > - v( 4, i, j, k ) * tmp - -c--------------------------------------------------------------------- -c back substitution -c--------------------------------------------------------------------- - v( 5, i, j, k ) = v( 5, i, j, k ) - > / tmat( 5, 5 ) - - v( 4, i, j, k ) = v( 4, i, j, k ) - > - tmat( 4, 5 ) * v( 5, i, j, k ) - v( 4, i, j, k ) = v( 4, i, j, k ) - > / tmat( 4, 4 ) - - v( 3, i, j, k ) = v( 3, i, j, k ) - > - tmat( 3, 4 ) * v( 4, i, j, k ) - > - tmat( 3, 5 ) * v( 5, i, j, k ) - v( 3, i, j, k ) = v( 3, i, j, k ) - > / tmat( 3, 3 ) - - v( 2, i, j, k ) = v( 2, i, j, k ) - > - tmat( 2, 3 ) * v( 3, i, j, k ) - > - tmat( 2, 4 ) * v( 4, i, j, k ) - > - tmat( 2, 5 ) * v( 5, i, j, k ) - v( 2, i, j, k ) = v( 2, i, j, k ) - > / tmat( 2, 2 ) - - v( 1, i, j, k ) = v( 1, i, j, k ) - > - tmat( 1, 2 ) * v( 2, i, j, k ) - > - tmat( 1, 3 ) * v( 3, i, j, k ) - > - tmat( 1, 4 ) * v( 4, i, j, k ) - > - tmat( 1, 5 ) * v( 5, i, j, k ) - v( 1, i, j, k ) = v( 1, i, j, k ) - > / tmat( 1, 1 ) - - - enddo - enddo - -c--------------------------------------------------------------------- -c send data to east and south -c--------------------------------------------------------------------- - iex = 2 - call exchange_1( v,k,iex ) - - return - end - - diff --git a/examples/smpi/NAS/LU/buts.f b/examples/smpi/NAS/LU/buts.f deleted file mode 100644 index a6fc3d6217..0000000000 --- a/examples/smpi/NAS/LU/buts.f +++ /dev/null @@ -1,259 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine buts( ldmx, ldmy, ldmz, - > nx, ny, nz, k, - > omega, - > v, tv, - > d, udx, udy, udz, - > ist, iend, jst, jend, - > nx0, ny0, ipt, jpt ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c compute the regular-sparse, block upper triangular solution: -c -c v <-- ( U-inv ) * v -c -c--------------------------------------------------------------------- - - implicit none - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - integer ldmx, ldmy, ldmz - integer nx, ny, nz - integer k - double precision omega - double precision v( 5, -1:ldmx+2, -1:ldmy+2, *), - > tv(5, ldmx, ldmy), - > d( 5, 5, ldmx, ldmy), - > udx( 5, 5, ldmx, ldmy), - > udy( 5, 5, ldmx, ldmy), - > udz( 5, 5, ldmx, ldmy ) - integer ist, iend - integer jst, jend - integer nx0, ny0 - integer ipt, jpt - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, m - integer iex - double precision tmp, tmp1 - double precision tmat(5,5) - - -c--------------------------------------------------------------------- -c receive data from south and east -c--------------------------------------------------------------------- - iex = 1 - call exchange_1( v,k,iex ) - - do j = jend, jst, -1 - do i = iend, ist, -1 - do m = 1, 5 - tv( m, i, j ) = - > omega * ( udz( m, 1, i, j ) * v( 1, i, j, k+1 ) - > + udz( m, 2, i, j ) * v( 2, i, j, k+1 ) - > + udz( m, 3, i, j ) * v( 3, i, j, k+1 ) - > + udz( m, 4, i, j ) * v( 4, i, j, k+1 ) - > + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) ) - end do - end do - end do - - - do j = jend,jst,-1 - do i = iend,ist,-1 - - do m = 1, 5 - tv( m, i, j ) = tv( m, i, j ) - > + omega * ( udy( m, 1, i, j ) * v( 1, i, j+1, k ) - > + udx( m, 1, i, j ) * v( 1, i+1, j, k ) - > + udy( m, 2, i, j ) * v( 2, i, j+1, k ) - > + udx( m, 2, i, j ) * v( 2, i+1, j, k ) - > + udy( m, 3, i, j ) * v( 3, i, j+1, k ) - > + udx( m, 3, i, j ) * v( 3, i+1, j, k ) - > + udy( m, 4, i, j ) * v( 4, i, j+1, k ) - > + udx( m, 4, i, j ) * v( 4, i+1, j, k ) - > + udy( m, 5, i, j ) * v( 5, i, j+1, k ) - > + udx( m, 5, i, j ) * v( 5, i+1, j, k ) ) - end do - -c--------------------------------------------------------------------- -c diagonal block inversion -c--------------------------------------------------------------------- - do m = 1, 5 - tmat( m, 1 ) = d( m, 1, i, j ) - tmat( m, 2 ) = d( m, 2, i, j ) - tmat( m, 3 ) = d( m, 3, i, j ) - tmat( m, 4 ) = d( m, 4, i, j ) - tmat( m, 5 ) = d( m, 5, i, j ) - end do - - tmp1 = 1.0d+00 / tmat( 1, 1 ) - tmp = tmp1 * tmat( 2, 1 ) - tmat( 2, 2 ) = tmat( 2, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 2, 3 ) = tmat( 2, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 2, 4 ) = tmat( 2, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 2, 5 ) = tmat( 2, 5 ) - > - tmp * tmat( 1, 5 ) - tv( 2, i, j ) = tv( 2, i, j ) - > - tv( 1, i, j ) * tmp - - tmp = tmp1 * tmat( 3, 1 ) - tmat( 3, 2 ) = tmat( 3, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 3, 3 ) = tmat( 3, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 3, 4 ) = tmat( 3, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 3, 5 ) = tmat( 3, 5 ) - > - tmp * tmat( 1, 5 ) - tv( 3, i, j ) = tv( 3, i, j ) - > - tv( 1, i, j ) * tmp - - tmp = tmp1 * tmat( 4, 1 ) - tmat( 4, 2 ) = tmat( 4, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 4, 3 ) = tmat( 4, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 1, 5 ) - tv( 4, i, j ) = tv( 4, i, j ) - > - tv( 1, i, j ) * tmp - - tmp = tmp1 * tmat( 5, 1 ) - tmat( 5, 2 ) = tmat( 5, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 5, 3 ) = tmat( 5, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 1, 5 ) - tv( 5, i, j ) = tv( 5, i, j ) - > - tv( 1, i, j ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 2, 2 ) - tmp = tmp1 * tmat( 3, 2 ) - tmat( 3, 3 ) = tmat( 3, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 3, 4 ) = tmat( 3, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 3, 5 ) = tmat( 3, 5 ) - > - tmp * tmat( 2, 5 ) - tv( 3, i, j ) = tv( 3, i, j ) - > - tv( 2, i, j ) * tmp - - tmp = tmp1 * tmat( 4, 2 ) - tmat( 4, 3 ) = tmat( 4, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 2, 5 ) - tv( 4, i, j ) = tv( 4, i, j ) - > - tv( 2, i, j ) * tmp - - tmp = tmp1 * tmat( 5, 2 ) - tmat( 5, 3 ) = tmat( 5, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 2, 5 ) - tv( 5, i, j ) = tv( 5, i, j ) - > - tv( 2, i, j ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 3, 3 ) - tmp = tmp1 * tmat( 4, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 3, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 3, 5 ) - tv( 4, i, j ) = tv( 4, i, j ) - > - tv( 3, i, j ) * tmp - - tmp = tmp1 * tmat( 5, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 3, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 3, 5 ) - tv( 5, i, j ) = tv( 5, i, j ) - > - tv( 3, i, j ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 4, 4 ) - tmp = tmp1 * tmat( 5, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 4, 5 ) - tv( 5, i, j ) = tv( 5, i, j ) - > - tv( 4, i, j ) * tmp - -c--------------------------------------------------------------------- -c back substitution -c--------------------------------------------------------------------- - tv( 5, i, j ) = tv( 5, i, j ) - > / tmat( 5, 5 ) - - tv( 4, i, j ) = tv( 4, i, j ) - > - tmat( 4, 5 ) * tv( 5, i, j ) - tv( 4, i, j ) = tv( 4, i, j ) - > / tmat( 4, 4 ) - - tv( 3, i, j ) = tv( 3, i, j ) - > - tmat( 3, 4 ) * tv( 4, i, j ) - > - tmat( 3, 5 ) * tv( 5, i, j ) - tv( 3, i, j ) = tv( 3, i, j ) - > / tmat( 3, 3 ) - - tv( 2, i, j ) = tv( 2, i, j ) - > - tmat( 2, 3 ) * tv( 3, i, j ) - > - tmat( 2, 4 ) * tv( 4, i, j ) - > - tmat( 2, 5 ) * tv( 5, i, j ) - tv( 2, i, j ) = tv( 2, i, j ) - > / tmat( 2, 2 ) - - tv( 1, i, j ) = tv( 1, i, j ) - > - tmat( 1, 2 ) * tv( 2, i, j ) - > - tmat( 1, 3 ) * tv( 3, i, j ) - > - tmat( 1, 4 ) * tv( 4, i, j ) - > - tmat( 1, 5 ) * tv( 5, i, j ) - tv( 1, i, j ) = tv( 1, i, j ) - > / tmat( 1, 1 ) - - v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j ) - v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j ) - v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j ) - v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j ) - v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j ) - - - enddo - end do - -c--------------------------------------------------------------------- -c send data to north and west -c--------------------------------------------------------------------- - iex = 3 - call exchange_1( v,k,iex ) - - return - end diff --git a/examples/smpi/NAS/LU/buts_vec.f b/examples/smpi/NAS/LU/buts_vec.f deleted file mode 100644 index 813105d270..0000000000 --- a/examples/smpi/NAS/LU/buts_vec.f +++ /dev/null @@ -1,332 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine buts( ldmx, ldmy, ldmz, - > nx, ny, nz, k, - > omega, - > v, tv, - > d, udx, udy, udz, - > ist, iend, jst, jend, - > nx0, ny0, ipt, jpt ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c compute the regular-sparse, block upper triangular solution: -c -c v <-- ( U-inv ) * v -c -c--------------------------------------------------------------------- - - implicit none - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - integer ldmx, ldmy, ldmz - integer nx, ny, nz - integer k - double precision omega - double precision v( 5, -1:ldmx+2, -1:ldmy+2, *), - > tv(5, ldmx, ldmy), - > d( 5, 5, ldmx, ldmy), - > udx( 5, 5, ldmx, ldmy), - > udy( 5, 5, ldmx, ldmy), - > udz( 5, 5, ldmx, ldmy ) - integer ist, iend - integer jst, jend - integer nx0, ny0 - integer ipt, jpt - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, m, l, istp, iendp - integer iex - double precision tmp, tmp1 - double precision tmat(5,5) - - -c--------------------------------------------------------------------- -c receive data from south and east -c--------------------------------------------------------------------- - iex = 1 - call exchange_1( v,k,iex ) - - do j = jend, jst, -1 - do i = iend, ist, -1 - do m = 1, 5 - tv( m, i, j ) = - > omega * ( udz( m, 1, i, j ) * v( 1, i, j, k+1 ) - > + udz( m, 2, i, j ) * v( 2, i, j, k+1 ) - > + udz( m, 3, i, j ) * v( 3, i, j, k+1 ) - > + udz( m, 4, i, j ) * v( 4, i, j, k+1 ) - > + udz( m, 5, i, j ) * v( 5, i, j, k+1 ) ) - end do - end do - end do - - - do l = iend+jend, ist+jst, -1 - istp = max(l - jend, ist) - iendp = min(l - jst, iend) - -!dir$ ivdep - do i = istp, iendp - j = l - i - -!!dir$ unroll 5 -! manually unroll the loop -! do m = 1, 5 - tv( 1, i, j ) = tv( 1, i, j ) - > + omega * ( udy( 1, 1, i, j ) * v( 1, i, j+1, k ) - > + udx( 1, 1, i, j ) * v( 1, i+1, j, k ) - > + udy( 1, 2, i, j ) * v( 2, i, j+1, k ) - > + udx( 1, 2, i, j ) * v( 2, i+1, j, k ) - > + udy( 1, 3, i, j ) * v( 3, i, j+1, k ) - > + udx( 1, 3, i, j ) * v( 3, i+1, j, k ) - > + udy( 1, 4, i, j ) * v( 4, i, j+1, k ) - > + udx( 1, 4, i, j ) * v( 4, i+1, j, k ) - > + udy( 1, 5, i, j ) * v( 5, i, j+1, k ) - > + udx( 1, 5, i, j ) * v( 5, i+1, j, k ) ) - tv( 2, i, j ) = tv( 2, i, j ) - > + omega * ( udy( 2, 1, i, j ) * v( 1, i, j+1, k ) - > + udx( 2, 1, i, j ) * v( 1, i+1, j, k ) - > + udy( 2, 2, i, j ) * v( 2, i, j+1, k ) - > + udx( 2, 2, i, j ) * v( 2, i+1, j, k ) - > + udy( 2, 3, i, j ) * v( 3, i, j+1, k ) - > + udx( 2, 3, i, j ) * v( 3, i+1, j, k ) - > + udy( 2, 4, i, j ) * v( 4, i, j+1, k ) - > + udx( 2, 4, i, j ) * v( 4, i+1, j, k ) - > + udy( 2, 5, i, j ) * v( 5, i, j+1, k ) - > + udx( 2, 5, i, j ) * v( 5, i+1, j, k ) ) - tv( 3, i, j ) = tv( 3, i, j ) - > + omega * ( udy( 3, 1, i, j ) * v( 1, i, j+1, k ) - > + udx( 3, 1, i, j ) * v( 1, i+1, j, k ) - > + udy( 3, 2, i, j ) * v( 2, i, j+1, k ) - > + udx( 3, 2, i, j ) * v( 2, i+1, j, k ) - > + udy( 3, 3, i, j ) * v( 3, i, j+1, k ) - > + udx( 3, 3, i, j ) * v( 3, i+1, j, k ) - > + udy( 3, 4, i, j ) * v( 4, i, j+1, k ) - > + udx( 3, 4, i, j ) * v( 4, i+1, j, k ) - > + udy( 3, 5, i, j ) * v( 5, i, j+1, k ) - > + udx( 3, 5, i, j ) * v( 5, i+1, j, k ) ) - tv( 4, i, j ) = tv( 4, i, j ) - > + omega * ( udy( 4, 1, i, j ) * v( 1, i, j+1, k ) - > + udx( 4, 1, i, j ) * v( 1, i+1, j, k ) - > + udy( 4, 2, i, j ) * v( 2, i, j+1, k ) - > + udx( 4, 2, i, j ) * v( 2, i+1, j, k ) - > + udy( 4, 3, i, j ) * v( 3, i, j+1, k ) - > + udx( 4, 3, i, j ) * v( 3, i+1, j, k ) - > + udy( 4, 4, i, j ) * v( 4, i, j+1, k ) - > + udx( 4, 4, i, j ) * v( 4, i+1, j, k ) - > + udy( 4, 5, i, j ) * v( 5, i, j+1, k ) - > + udx( 4, 5, i, j ) * v( 5, i+1, j, k ) ) - tv( 5, i, j ) = tv( 5, i, j ) - > + omega * ( udy( 5, 1, i, j ) * v( 1, i, j+1, k ) - > + udx( 5, 1, i, j ) * v( 1, i+1, j, k ) - > + udy( 5, 2, i, j ) * v( 2, i, j+1, k ) - > + udx( 5, 2, i, j ) * v( 2, i+1, j, k ) - > + udy( 5, 3, i, j ) * v( 3, i, j+1, k ) - > + udx( 5, 3, i, j ) * v( 3, i+1, j, k ) - > + udy( 5, 4, i, j ) * v( 4, i, j+1, k ) - > + udx( 5, 4, i, j ) * v( 4, i+1, j, k ) - > + udy( 5, 5, i, j ) * v( 5, i, j+1, k ) - > + udx( 5, 5, i, j ) * v( 5, i+1, j, k ) ) -! end do - -c--------------------------------------------------------------------- -c diagonal block inversion -c--------------------------------------------------------------------- -!!dir$ unroll 5 -! manually unroll the loop -! do m = 1, 5 - tmat( 1, 1 ) = d( 1, 1, i, j ) - tmat( 1, 2 ) = d( 1, 2, i, j ) - tmat( 1, 3 ) = d( 1, 3, i, j ) - tmat( 1, 4 ) = d( 1, 4, i, j ) - tmat( 1, 5 ) = d( 1, 5, i, j ) - tmat( 2, 1 ) = d( 2, 1, i, j ) - tmat( 2, 2 ) = d( 2, 2, i, j ) - tmat( 2, 3 ) = d( 2, 3, i, j ) - tmat( 2, 4 ) = d( 2, 4, i, j ) - tmat( 2, 5 ) = d( 2, 5, i, j ) - tmat( 3, 1 ) = d( 3, 1, i, j ) - tmat( 3, 2 ) = d( 3, 2, i, j ) - tmat( 3, 3 ) = d( 3, 3, i, j ) - tmat( 3, 4 ) = d( 3, 4, i, j ) - tmat( 3, 5 ) = d( 3, 5, i, j ) - tmat( 4, 1 ) = d( 4, 1, i, j ) - tmat( 4, 2 ) = d( 4, 2, i, j ) - tmat( 4, 3 ) = d( 4, 3, i, j ) - tmat( 4, 4 ) = d( 4, 4, i, j ) - tmat( 4, 5 ) = d( 4, 5, i, j ) - tmat( 5, 1 ) = d( 5, 1, i, j ) - tmat( 5, 2 ) = d( 5, 2, i, j ) - tmat( 5, 3 ) = d( 5, 3, i, j ) - tmat( 5, 4 ) = d( 5, 4, i, j ) - tmat( 5, 5 ) = d( 5, 5, i, j ) -! end do - - tmp1 = 1.0d+00 / tmat( 1, 1 ) - tmp = tmp1 * tmat( 2, 1 ) - tmat( 2, 2 ) = tmat( 2, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 2, 3 ) = tmat( 2, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 2, 4 ) = tmat( 2, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 2, 5 ) = tmat( 2, 5 ) - > - tmp * tmat( 1, 5 ) - tv( 2, i, j ) = tv( 2, i, j ) - > - tv( 1, i, j ) * tmp - - tmp = tmp1 * tmat( 3, 1 ) - tmat( 3, 2 ) = tmat( 3, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 3, 3 ) = tmat( 3, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 3, 4 ) = tmat( 3, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 3, 5 ) = tmat( 3, 5 ) - > - tmp * tmat( 1, 5 ) - tv( 3, i, j ) = tv( 3, i, j ) - > - tv( 1, i, j ) * tmp - - tmp = tmp1 * tmat( 4, 1 ) - tmat( 4, 2 ) = tmat( 4, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 4, 3 ) = tmat( 4, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 1, 5 ) - tv( 4, i, j ) = tv( 4, i, j ) - > - tv( 1, i, j ) * tmp - - tmp = tmp1 * tmat( 5, 1 ) - tmat( 5, 2 ) = tmat( 5, 2 ) - > - tmp * tmat( 1, 2 ) - tmat( 5, 3 ) = tmat( 5, 3 ) - > - tmp * tmat( 1, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 1, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 1, 5 ) - tv( 5, i, j ) = tv( 5, i, j ) - > - tv( 1, i, j ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 2, 2 ) - tmp = tmp1 * tmat( 3, 2 ) - tmat( 3, 3 ) = tmat( 3, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 3, 4 ) = tmat( 3, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 3, 5 ) = tmat( 3, 5 ) - > - tmp * tmat( 2, 5 ) - tv( 3, i, j ) = tv( 3, i, j ) - > - tv( 2, i, j ) * tmp - - tmp = tmp1 * tmat( 4, 2 ) - tmat( 4, 3 ) = tmat( 4, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 2, 5 ) - tv( 4, i, j ) = tv( 4, i, j ) - > - tv( 2, i, j ) * tmp - - tmp = tmp1 * tmat( 5, 2 ) - tmat( 5, 3 ) = tmat( 5, 3 ) - > - tmp * tmat( 2, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 2, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 2, 5 ) - tv( 5, i, j ) = tv( 5, i, j ) - > - tv( 2, i, j ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 3, 3 ) - tmp = tmp1 * tmat( 4, 3 ) - tmat( 4, 4 ) = tmat( 4, 4 ) - > - tmp * tmat( 3, 4 ) - tmat( 4, 5 ) = tmat( 4, 5 ) - > - tmp * tmat( 3, 5 ) - tv( 4, i, j ) = tv( 4, i, j ) - > - tv( 3, i, j ) * tmp - - tmp = tmp1 * tmat( 5, 3 ) - tmat( 5, 4 ) = tmat( 5, 4 ) - > - tmp * tmat( 3, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 3, 5 ) - tv( 5, i, j ) = tv( 5, i, j ) - > - tv( 3, i, j ) * tmp - - - - tmp1 = 1.0d+00 / tmat( 4, 4 ) - tmp = tmp1 * tmat( 5, 4 ) - tmat( 5, 5 ) = tmat( 5, 5 ) - > - tmp * tmat( 4, 5 ) - tv( 5, i, j ) = tv( 5, i, j ) - > - tv( 4, i, j ) * tmp - -c--------------------------------------------------------------------- -c back substitution -c--------------------------------------------------------------------- - tv( 5, i, j ) = tv( 5, i, j ) - > / tmat( 5, 5 ) - - tv( 4, i, j ) = tv( 4, i, j ) - > - tmat( 4, 5 ) * tv( 5, i, j ) - tv( 4, i, j ) = tv( 4, i, j ) - > / tmat( 4, 4 ) - - tv( 3, i, j ) = tv( 3, i, j ) - > - tmat( 3, 4 ) * tv( 4, i, j ) - > - tmat( 3, 5 ) * tv( 5, i, j ) - tv( 3, i, j ) = tv( 3, i, j ) - > / tmat( 3, 3 ) - - tv( 2, i, j ) = tv( 2, i, j ) - > - tmat( 2, 3 ) * tv( 3, i, j ) - > - tmat( 2, 4 ) * tv( 4, i, j ) - > - tmat( 2, 5 ) * tv( 5, i, j ) - tv( 2, i, j ) = tv( 2, i, j ) - > / tmat( 2, 2 ) - - tv( 1, i, j ) = tv( 1, i, j ) - > - tmat( 1, 2 ) * tv( 2, i, j ) - > - tmat( 1, 3 ) * tv( 3, i, j ) - > - tmat( 1, 4 ) * tv( 4, i, j ) - > - tmat( 1, 5 ) * tv( 5, i, j ) - tv( 1, i, j ) = tv( 1, i, j ) - > / tmat( 1, 1 ) - - v( 1, i, j, k ) = v( 1, i, j, k ) - tv( 1, i, j ) - v( 2, i, j, k ) = v( 2, i, j, k ) - tv( 2, i, j ) - v( 3, i, j, k ) = v( 3, i, j, k ) - tv( 3, i, j ) - v( 4, i, j, k ) = v( 4, i, j, k ) - tv( 4, i, j ) - v( 5, i, j, k ) = v( 5, i, j, k ) - tv( 5, i, j ) - - - enddo - end do - -c--------------------------------------------------------------------- -c send data to north and west -c--------------------------------------------------------------------- - iex = 3 - call exchange_1( v,k,iex ) - - return - end diff --git a/examples/smpi/NAS/LU/erhs.f b/examples/smpi/NAS/LU/erhs.f deleted file mode 100644 index 928e2a9f50..0000000000 --- a/examples/smpi/NAS/LU/erhs.f +++ /dev/null @@ -1,536 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine erhs - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c compute the right hand side based on exact solution -c -c--------------------------------------------------------------------- - - implicit none - - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, k, m - integer iglob, jglob - integer iex - integer L1, L2 - integer ist1, iend1 - integer jst1, jend1 - double precision dsspm - double precision xi, eta, zeta - double precision q - double precision u21, u31, u41 - double precision tmp - double precision u21i, u31i, u41i, u51i - double precision u21j, u31j, u41j, u51j - double precision u21k, u31k, u41k, u51k - double precision u21im1, u31im1, u41im1, u51im1 - double precision u21jm1, u31jm1, u41jm1, u51jm1 - double precision u21km1, u31km1, u41km1, u51km1 - - dsspm = dssp - - - do k = 1, nz - do j = 1, ny - do i = 1, nx - do m = 1, 5 - frct( m, i, j, k ) = 0.0d+00 - end do - end do - end do - end do - - do k = 1, nz - zeta = ( dble(k-1) ) / ( nz - 1 ) - do j = 1, ny - jglob = jpt + j - eta = ( dble(jglob-1) ) / ( ny0 - 1 ) - do i = 1, nx - iglob = ipt + i - xi = ( dble(iglob-1) ) / ( nx0 - 1 ) - do m = 1, 5 - rsd(m,i,j,k) = ce(m,1) - > + ce(m,2) * xi - > + ce(m,3) * eta - > + ce(m,4) * zeta - > + ce(m,5) * xi * xi - > + ce(m,6) * eta * eta - > + ce(m,7) * zeta * zeta - > + ce(m,8) * xi * xi * xi - > + ce(m,9) * eta * eta * eta - > + ce(m,10) * zeta * zeta * zeta - > + ce(m,11) * xi * xi * xi * xi - > + ce(m,12) * eta * eta * eta * eta - > + ce(m,13) * zeta * zeta * zeta * zeta - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c xi-direction flux differences -c--------------------------------------------------------------------- -c -c iex = flag : iex = 0 north/south communication -c : iex = 1 east/west communication -c -c--------------------------------------------------------------------- - iex = 0 - -c--------------------------------------------------------------------- -c communicate and receive/send two rows of data -c--------------------------------------------------------------------- - call exchange_3 (rsd,iex) - - L1 = 0 - if (north.eq.-1) L1 = 1 - L2 = nx + 1 - if (south.eq.-1) L2 = nx - - ist1 = 1 - iend1 = nx - if (north.eq.-1) ist1 = 4 - if (south.eq.-1) iend1 = nx - 3 - - do k = 2, nz - 1 - do j = jst, jend - do i = L1, L2 - flux(1,i,j,k) = rsd(2,i,j,k) - u21 = rsd(2,i,j,k) / rsd(1,i,j,k) - q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k) - > + rsd(3,i,j,k) * rsd(3,i,j,k) - > + rsd(4,i,j,k) * rsd(4,i,j,k) ) - > / rsd(1,i,j,k) - flux(2,i,j,k) = rsd(2,i,j,k) * u21 + c2 * - > ( rsd(5,i,j,k) - q ) - flux(3,i,j,k) = rsd(3,i,j,k) * u21 - flux(4,i,j,k) = rsd(4,i,j,k) * u21 - flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u21 - end do - end do - end do - - do k = 2, nz - 1 - do j = jst, jend - do i = ist, iend - do m = 1, 5 - frct(m,i,j,k) = frct(m,i,j,k) - > - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) ) - end do - end do - do i = ist, L2 - tmp = 1.0d+00 / rsd(1,i,j,k) - - u21i = tmp * rsd(2,i,j,k) - u31i = tmp * rsd(3,i,j,k) - u41i = tmp * rsd(4,i,j,k) - u51i = tmp * rsd(5,i,j,k) - - tmp = 1.0d+00 / rsd(1,i-1,j,k) - - u21im1 = tmp * rsd(2,i-1,j,k) - u31im1 = tmp * rsd(3,i-1,j,k) - u41im1 = tmp * rsd(4,i-1,j,k) - u51im1 = tmp * rsd(5,i-1,j,k) - - flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 * - > ( u21i - u21im1 ) - flux(3,i,j,k) = tx3 * ( u31i - u31im1 ) - flux(4,i,j,k) = tx3 * ( u41i - u41im1 ) - flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) - > * tx3 * ( ( u21i **2 + u31i **2 + u41i **2 ) - > - ( u21im1**2 + u31im1**2 + u41im1**2 ) ) - > + (1.0d+00/6.0d+00) - > * tx3 * ( u21i**2 - u21im1**2 ) - > + c1 * c5 * tx3 * ( u51i - u51im1 ) - end do - - do i = ist, iend - frct(1,i,j,k) = frct(1,i,j,k) - > + dx1 * tx1 * ( rsd(1,i-1,j,k) - > - 2.0d+00 * rsd(1,i,j,k) - > + rsd(1,i+1,j,k) ) - frct(2,i,j,k) = frct(2,i,j,k) - > + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) ) - > + dx2 * tx1 * ( rsd(2,i-1,j,k) - > - 2.0d+00 * rsd(2,i,j,k) - > + rsd(2,i+1,j,k) ) - frct(3,i,j,k) = frct(3,i,j,k) - > + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) ) - > + dx3 * tx1 * ( rsd(3,i-1,j,k) - > - 2.0d+00 * rsd(3,i,j,k) - > + rsd(3,i+1,j,k) ) - frct(4,i,j,k) = frct(4,i,j,k) - > + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) ) - > + dx4 * tx1 * ( rsd(4,i-1,j,k) - > - 2.0d+00 * rsd(4,i,j,k) - > + rsd(4,i+1,j,k) ) - frct(5,i,j,k) = frct(5,i,j,k) - > + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) ) - > + dx5 * tx1 * ( rsd(5,i-1,j,k) - > - 2.0d+00 * rsd(5,i,j,k) - > + rsd(5,i+1,j,k) ) - end do - -c--------------------------------------------------------------------- -c Fourth-order dissipation -c--------------------------------------------------------------------- - IF (north.eq.-1) then - do m = 1, 5 - frct(m,2,j,k) = frct(m,2,j,k) - > - dsspm * ( + 5.0d+00 * rsd(m,2,j,k) - > - 4.0d+00 * rsd(m,3,j,k) - > + rsd(m,4,j,k) ) - frct(m,3,j,k) = frct(m,3,j,k) - > - dsspm * ( - 4.0d+00 * rsd(m,2,j,k) - > + 6.0d+00 * rsd(m,3,j,k) - > - 4.0d+00 * rsd(m,4,j,k) - > + rsd(m,5,j,k) ) - end do - END IF - - do i = ist1,iend1 - do m = 1, 5 - frct(m,i,j,k) = frct(m,i,j,k) - > - dsspm * ( rsd(m,i-2,j,k) - > - 4.0d+00 * rsd(m,i-1,j,k) - > + 6.0d+00 * rsd(m,i,j,k) - > - 4.0d+00 * rsd(m,i+1,j,k) - > + rsd(m,i+2,j,k) ) - end do - end do - - IF (south.eq.-1) then - do m = 1, 5 - frct(m,nx-2,j,k) = frct(m,nx-2,j,k) - > - dsspm * ( rsd(m,nx-4,j,k) - > - 4.0d+00 * rsd(m,nx-3,j,k) - > + 6.0d+00 * rsd(m,nx-2,j,k) - > - 4.0d+00 * rsd(m,nx-1,j,k) ) - frct(m,nx-1,j,k) = frct(m,nx-1,j,k) - > - dsspm * ( rsd(m,nx-3,j,k) - > - 4.0d+00 * rsd(m,nx-2,j,k) - > + 5.0d+00 * rsd(m,nx-1,j,k) ) - end do - END IF - - end do - end do - -c--------------------------------------------------------------------- -c eta-direction flux differences -c--------------------------------------------------------------------- -c -c iex = flag : iex = 0 north/south communication -c : iex = 1 east/west communication -c -c--------------------------------------------------------------------- - iex = 1 - -c--------------------------------------------------------------------- -c communicate and receive/send two rows of data -c--------------------------------------------------------------------- - call exchange_3 (rsd,iex) - - L1 = 0 - if (west.eq.-1) L1 = 1 - L2 = ny + 1 - if (east.eq.-1) L2 = ny - - jst1 = 1 - jend1 = ny - if (west.eq.-1) jst1 = 4 - if (east.eq.-1) jend1 = ny - 3 - - do k = 2, nz - 1 - do j = L1, L2 - do i = ist, iend - flux(1,i,j,k) = rsd(3,i,j,k) - u31 = rsd(3,i,j,k) / rsd(1,i,j,k) - q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k) - > + rsd(3,i,j,k) * rsd(3,i,j,k) - > + rsd(4,i,j,k) * rsd(4,i,j,k) ) - > / rsd(1,i,j,k) - flux(2,i,j,k) = rsd(2,i,j,k) * u31 - flux(3,i,j,k) = rsd(3,i,j,k) * u31 + c2 * - > ( rsd(5,i,j,k) - q ) - flux(4,i,j,k) = rsd(4,i,j,k) * u31 - flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u31 - end do - end do - end do - - do k = 2, nz - 1 - do i = ist, iend - do j = jst, jend - do m = 1, 5 - frct(m,i,j,k) = frct(m,i,j,k) - > - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) ) - end do - end do - end do - - do j = jst, L2 - do i = ist, iend - tmp = 1.0d+00 / rsd(1,i,j,k) - - u21j = tmp * rsd(2,i,j,k) - u31j = tmp * rsd(3,i,j,k) - u41j = tmp * rsd(4,i,j,k) - u51j = tmp * rsd(5,i,j,k) - - tmp = 1.0d+00 / rsd(1,i,j-1,k) - - u21jm1 = tmp * rsd(2,i,j-1,k) - u31jm1 = tmp * rsd(3,i,j-1,k) - u41jm1 = tmp * rsd(4,i,j-1,k) - u51jm1 = tmp * rsd(5,i,j-1,k) - - flux(2,i,j,k) = ty3 * ( u21j - u21jm1 ) - flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 * - > ( u31j - u31jm1 ) - flux(4,i,j,k) = ty3 * ( u41j - u41jm1 ) - flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) - > * ty3 * ( ( u21j **2 + u31j **2 + u41j **2 ) - > - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) ) - > + (1.0d+00/6.0d+00) - > * ty3 * ( u31j**2 - u31jm1**2 ) - > + c1 * c5 * ty3 * ( u51j - u51jm1 ) - end do - end do - - do j = jst, jend - do i = ist, iend - frct(1,i,j,k) = frct(1,i,j,k) - > + dy1 * ty1 * ( rsd(1,i,j-1,k) - > - 2.0d+00 * rsd(1,i,j,k) - > + rsd(1,i,j+1,k) ) - frct(2,i,j,k) = frct(2,i,j,k) - > + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) ) - > + dy2 * ty1 * ( rsd(2,i,j-1,k) - > - 2.0d+00 * rsd(2,i,j,k) - > + rsd(2,i,j+1,k) ) - frct(3,i,j,k) = frct(3,i,j,k) - > + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) ) - > + dy3 * ty1 * ( rsd(3,i,j-1,k) - > - 2.0d+00 * rsd(3,i,j,k) - > + rsd(3,i,j+1,k) ) - frct(4,i,j,k) = frct(4,i,j,k) - > + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) ) - > + dy4 * ty1 * ( rsd(4,i,j-1,k) - > - 2.0d+00 * rsd(4,i,j,k) - > + rsd(4,i,j+1,k) ) - frct(5,i,j,k) = frct(5,i,j,k) - > + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) ) - > + dy5 * ty1 * ( rsd(5,i,j-1,k) - > - 2.0d+00 * rsd(5,i,j,k) - > + rsd(5,i,j+1,k) ) - end do - end do - -c--------------------------------------------------------------------- -c fourth-order dissipation -c--------------------------------------------------------------------- - IF (west.eq.-1) then - do i = ist, iend - do m = 1, 5 - frct(m,i,2,k) = frct(m,i,2,k) - > - dsspm * ( + 5.0d+00 * rsd(m,i,2,k) - > - 4.0d+00 * rsd(m,i,3,k) - > + rsd(m,i,4,k) ) - frct(m,i,3,k) = frct(m,i,3,k) - > - dsspm * ( - 4.0d+00 * rsd(m,i,2,k) - > + 6.0d+00 * rsd(m,i,3,k) - > - 4.0d+00 * rsd(m,i,4,k) - > + rsd(m,i,5,k) ) - end do - end do - END IF - - do j = jst1, jend1 - do i = ist, iend - do m = 1, 5 - frct(m,i,j,k) = frct(m,i,j,k) - > - dsspm * ( rsd(m,i,j-2,k) - > - 4.0d+00 * rsd(m,i,j-1,k) - > + 6.0d+00 * rsd(m,i,j,k) - > - 4.0d+00 * rsd(m,i,j+1,k) - > + rsd(m,i,j+2,k) ) - end do - end do - end do - - IF (east.eq.-1) then - do i = ist, iend - do m = 1, 5 - frct(m,i,ny-2,k) = frct(m,i,ny-2,k) - > - dsspm * ( rsd(m,i,ny-4,k) - > - 4.0d+00 * rsd(m,i,ny-3,k) - > + 6.0d+00 * rsd(m,i,ny-2,k) - > - 4.0d+00 * rsd(m,i,ny-1,k) ) - frct(m,i,ny-1,k) = frct(m,i,ny-1,k) - > - dsspm * ( rsd(m,i,ny-3,k) - > - 4.0d+00 * rsd(m,i,ny-2,k) - > + 5.0d+00 * rsd(m,i,ny-1,k) ) - end do - end do - END IF - - end do - -c--------------------------------------------------------------------- -c zeta-direction flux differences -c--------------------------------------------------------------------- - do k = 1, nz - do j = jst, jend - do i = ist, iend - flux(1,i,j,k) = rsd(4,i,j,k) - u41 = rsd(4,i,j,k) / rsd(1,i,j,k) - q = 0.50d+00 * ( rsd(2,i,j,k) * rsd(2,i,j,k) - > + rsd(3,i,j,k) * rsd(3,i,j,k) - > + rsd(4,i,j,k) * rsd(4,i,j,k) ) - > / rsd(1,i,j,k) - flux(2,i,j,k) = rsd(2,i,j,k) * u41 - flux(3,i,j,k) = rsd(3,i,j,k) * u41 - flux(4,i,j,k) = rsd(4,i,j,k) * u41 + c2 * - > ( rsd(5,i,j,k) - q ) - flux(5,i,j,k) = ( c1 * rsd(5,i,j,k) - c2 * q ) * u41 - end do - end do - end do - - do k = 2, nz - 1 - do j = jst, jend - do i = ist, iend - do m = 1, 5 - frct(m,i,j,k) = frct(m,i,j,k) - > - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) ) - end do - end do - end do - end do - - do k = 2, nz - do j = jst, jend - do i = ist, iend - tmp = 1.0d+00 / rsd(1,i,j,k) - - u21k = tmp * rsd(2,i,j,k) - u31k = tmp * rsd(3,i,j,k) - u41k = tmp * rsd(4,i,j,k) - u51k = tmp * rsd(5,i,j,k) - - tmp = 1.0d+00 / rsd(1,i,j,k-1) - - u21km1 = tmp * rsd(2,i,j,k-1) - u31km1 = tmp * rsd(3,i,j,k-1) - u41km1 = tmp * rsd(4,i,j,k-1) - u51km1 = tmp * rsd(5,i,j,k-1) - - flux(2,i,j,k) = tz3 * ( u21k - u21km1 ) - flux(3,i,j,k) = tz3 * ( u31k - u31km1 ) - flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * ( u41k - > - u41km1 ) - flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) - > * tz3 * ( ( u21k **2 + u31k **2 + u41k **2 ) - > - ( u21km1**2 + u31km1**2 + u41km1**2 ) ) - > + (1.0d+00/6.0d+00) - > * tz3 * ( u41k**2 - u41km1**2 ) - > + c1 * c5 * tz3 * ( u51k - u51km1 ) - end do - end do - end do - - do k = 2, nz - 1 - do j = jst, jend - do i = ist, iend - frct(1,i,j,k) = frct(1,i,j,k) - > + dz1 * tz1 * ( rsd(1,i,j,k+1) - > - 2.0d+00 * rsd(1,i,j,k) - > + rsd(1,i,j,k-1) ) - frct(2,i,j,k) = frct(2,i,j,k) - > + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) ) - > + dz2 * tz1 * ( rsd(2,i,j,k+1) - > - 2.0d+00 * rsd(2,i,j,k) - > + rsd(2,i,j,k-1) ) - frct(3,i,j,k) = frct(3,i,j,k) - > + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) ) - > + dz3 * tz1 * ( rsd(3,i,j,k+1) - > - 2.0d+00 * rsd(3,i,j,k) - > + rsd(3,i,j,k-1) ) - frct(4,i,j,k) = frct(4,i,j,k) - > + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) ) - > + dz4 * tz1 * ( rsd(4,i,j,k+1) - > - 2.0d+00 * rsd(4,i,j,k) - > + rsd(4,i,j,k-1) ) - frct(5,i,j,k) = frct(5,i,j,k) - > + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) ) - > + dz5 * tz1 * ( rsd(5,i,j,k+1) - > - 2.0d+00 * rsd(5,i,j,k) - > + rsd(5,i,j,k-1) ) - end do - end do - end do - -c--------------------------------------------------------------------- -c fourth-order dissipation -c--------------------------------------------------------------------- - do j = jst, jend - do i = ist, iend - do m = 1, 5 - frct(m,i,j,2) = frct(m,i,j,2) - > - dsspm * ( + 5.0d+00 * rsd(m,i,j,2) - > - 4.0d+00 * rsd(m,i,j,3) - > + rsd(m,i,j,4) ) - frct(m,i,j,3) = frct(m,i,j,3) - > - dsspm * (- 4.0d+00 * rsd(m,i,j,2) - > + 6.0d+00 * rsd(m,i,j,3) - > - 4.0d+00 * rsd(m,i,j,4) - > + rsd(m,i,j,5) ) - end do - end do - end do - - do k = 4, nz - 3 - do j = jst, jend - do i = ist, iend - do m = 1, 5 - frct(m,i,j,k) = frct(m,i,j,k) - > - dsspm * ( rsd(m,i,j,k-2) - > - 4.0d+00 * rsd(m,i,j,k-1) - > + 6.0d+00 * rsd(m,i,j,k) - > - 4.0d+00 * rsd(m,i,j,k+1) - > + rsd(m,i,j,k+2) ) - end do - end do - end do - end do - - do j = jst, jend - do i = ist, iend - do m = 1, 5 - frct(m,i,j,nz-2) = frct(m,i,j,nz-2) - > - dsspm * ( rsd(m,i,j,nz-4) - > - 4.0d+00 * rsd(m,i,j,nz-3) - > + 6.0d+00 * rsd(m,i,j,nz-2) - > - 4.0d+00 * rsd(m,i,j,nz-1) ) - frct(m,i,j,nz-1) = frct(m,i,j,nz-1) - > - dsspm * ( rsd(m,i,j,nz-3) - > - 4.0d+00 * rsd(m,i,j,nz-2) - > + 5.0d+00 * rsd(m,i,j,nz-1) ) - end do - end do - end do - - return - end diff --git a/examples/smpi/NAS/LU/error.f b/examples/smpi/NAS/LU/error.f deleted file mode 100644 index e83f74912f..0000000000 --- a/examples/smpi/NAS/LU/error.f +++ /dev/null @@ -1,81 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine error - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c compute the solution error -c -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, k, m - integer iglob, jglob - double precision tmp - double precision u000ijk(5), dummy(5) - - integer IERROR - - - do m = 1, 5 - errnm(m) = 0.0d+00 - dummy(m) = 0.0d+00 - end do - - do k = 2, nz-1 - do j = jst, jend - jglob = jpt + j - do i = ist, iend - iglob = ipt + i - call exact( iglob, jglob, k, u000ijk ) - do m = 1, 5 - tmp = ( u000ijk(m) - u(m,i,j,k) ) - dummy(m) = dummy(m) + tmp ** 2 - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c compute the global sum of individual contributions to dot product. -c--------------------------------------------------------------------- - call MPI_ALLREDUCE( dummy, - > errnm, - > 5, - > dp_type, - > MPI_SUM, - > MPI_COMM_WORLD, - > IERROR ) - - do m = 1, 5 - errnm(m) = sqrt ( errnm(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) ) - end do - -c if (id.eq.0) then -c write (*,1002) ( errnm(m), m = 1, 5 ) -c end if - - 1002 format (1x/1x,'RMS-norm of error in soln. to ', - > 'first pde = ',1pe12.5/, - > 1x,'RMS-norm of error in soln. to ', - > 'second pde = ',1pe12.5/, - > 1x,'RMS-norm of error in soln. to ', - > 'third pde = ',1pe12.5/, - > 1x,'RMS-norm of error in soln. to ', - > 'fourth pde = ',1pe12.5/, - > 1x,'RMS-norm of error in soln. to ', - > 'fifth pde = ',1pe12.5) - - return - end diff --git a/examples/smpi/NAS/LU/exact.f b/examples/smpi/NAS/LU/exact.f deleted file mode 100644 index 19e14c3d5f..0000000000 --- a/examples/smpi/NAS/LU/exact.f +++ /dev/null @@ -1,53 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine exact( i, j, k, u000ijk ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c compute the exact solution at (i,j,k) -c -c--------------------------------------------------------------------- - - implicit none - - include 'applu.incl' - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - integer i, j, k - double precision u000ijk(*) - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer m - double precision xi, eta, zeta - - xi = ( dble ( i - 1 ) ) / ( nx0 - 1 ) - eta = ( dble ( j - 1 ) ) / ( ny0 - 1 ) - zeta = ( dble ( k - 1 ) ) / ( nz - 1 ) - - - do m = 1, 5 - u000ijk(m) = ce(m,1) - > + ce(m,2) * xi - > + ce(m,3) * eta - > + ce(m,4) * zeta - > + ce(m,5) * xi * xi - > + ce(m,6) * eta * eta - > + ce(m,7) * zeta * zeta - > + ce(m,8) * xi * xi * xi - > + ce(m,9) * eta * eta * eta - > + ce(m,10) * zeta * zeta * zeta - > + ce(m,11) * xi * xi * xi * xi - > + ce(m,12) * eta * eta * eta * eta - > + ce(m,13) * zeta * zeta * zeta * zeta - end do - - return - end diff --git a/examples/smpi/NAS/LU/exchange_1.f b/examples/smpi/NAS/LU/exchange_1.f deleted file mode 100644 index 2bf7d28b94..0000000000 --- a/examples/smpi/NAS/LU/exchange_1.f +++ /dev/null @@ -1,180 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine exchange_1( g,k,iex ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - - implicit none - - include 'mpinpb.h' - include 'applu.incl' - - double precision g(5,-1:isiz1+2,-1:isiz2+2,isiz3) - integer k - integer iex - integer i, j - double precision dum(5,isiz1+isiz2), dum1(5,isiz1+isiz2) - - integer STATUS(MPI_STATUS_SIZE) - integer IERROR - - - - if( iex .eq. 0 ) then - - if( north .ne. -1 ) then - call MPI_RECV( dum1(1,jst), - > 5*(jend-jst+1), - > dp_type, - > north, - > from_n, - > MPI_COMM_WORLD, - > status, - > IERROR ) - do j=jst,jend - g(1,0,j,k) = dum1(1,j) - g(2,0,j,k) = dum1(2,j) - g(3,0,j,k) = dum1(3,j) - g(4,0,j,k) = dum1(4,j) - g(5,0,j,k) = dum1(5,j) - enddo - endif - - if( west .ne. -1 ) then - call MPI_RECV( dum1(1,ist), - > 5*(iend-ist+1), - > dp_type, - > west, - > from_w, - > MPI_COMM_WORLD, - > status, - > IERROR ) - do i=ist,iend - g(1,i,0,k) = dum1(1,i) - g(2,i,0,k) = dum1(2,i) - g(3,i,0,k) = dum1(3,i) - g(4,i,0,k) = dum1(4,i) - g(5,i,0,k) = dum1(5,i) - enddo - endif - - else if( iex .eq. 1 ) then - - if( south .ne. -1 ) then - call MPI_RECV( dum1(1,jst), - > 5*(jend-jst+1), - > dp_type, - > south, - > from_s, - > MPI_COMM_WORLD, - > status, - > IERROR ) - do j=jst,jend - g(1,nx+1,j,k) = dum1(1,j) - g(2,nx+1,j,k) = dum1(2,j) - g(3,nx+1,j,k) = dum1(3,j) - g(4,nx+1,j,k) = dum1(4,j) - g(5,nx+1,j,k) = dum1(5,j) - enddo - endif - - if( east .ne. -1 ) then - call MPI_RECV( dum1(1,ist), - > 5*(iend-ist+1), - > dp_type, - > east, - > from_e, - > MPI_COMM_WORLD, - > status, - > IERROR ) - do i=ist,iend - g(1,i,ny+1,k) = dum1(1,i) - g(2,i,ny+1,k) = dum1(2,i) - g(3,i,ny+1,k) = dum1(3,i) - g(4,i,ny+1,k) = dum1(4,i) - g(5,i,ny+1,k) = dum1(5,i) - enddo - endif - - else if( iex .eq. 2 ) then - - if( south .ne. -1 ) then - do j=jst,jend - dum(1,j) = g(1,nx,j,k) - dum(2,j) = g(2,nx,j,k) - dum(3,j) = g(3,nx,j,k) - dum(4,j) = g(4,nx,j,k) - dum(5,j) = g(5,nx,j,k) - enddo - call MPI_SEND( dum(1,jst), - > 5*(jend-jst+1), - > dp_type, - > south, - > from_n, - > MPI_COMM_WORLD, - > IERROR ) - endif - - if( east .ne. -1 ) then - do i=ist,iend - dum(1,i) = g(1,i,ny,k) - dum(2,i) = g(2,i,ny,k) - dum(3,i) = g(3,i,ny,k) - dum(4,i) = g(4,i,ny,k) - dum(5,i) = g(5,i,ny,k) - enddo - call MPI_SEND( dum(1,ist), - > 5*(iend-ist+1), - > dp_type, - > east, - > from_w, - > MPI_COMM_WORLD, - > IERROR ) - endif - - else - - if( north .ne. -1 ) then - do j=jst,jend - dum(1,j) = g(1,1,j,k) - dum(2,j) = g(2,1,j,k) - dum(3,j) = g(3,1,j,k) - dum(4,j) = g(4,1,j,k) - dum(5,j) = g(5,1,j,k) - enddo - call MPI_SEND( dum(1,jst), - > 5*(jend-jst+1), - > dp_type, - > north, - > from_s, - > MPI_COMM_WORLD, - > IERROR ) - endif - - if( west .ne. -1 ) then - do i=ist,iend - dum(1,i) = g(1,i,1,k) - dum(2,i) = g(2,i,1,k) - dum(3,i) = g(3,i,1,k) - dum(4,i) = g(4,i,1,k) - dum(5,i) = g(5,i,1,k) - enddo - call MPI_SEND( dum(1,ist), - > 5*(iend-ist+1), - > dp_type, - > west, - > from_e, - > MPI_COMM_WORLD, - > IERROR ) - endif - - endif - - end - - - diff --git a/examples/smpi/NAS/LU/exchange_3.f b/examples/smpi/NAS/LU/exchange_3.f deleted file mode 100644 index d52ae7ef4d..0000000000 --- a/examples/smpi/NAS/LU/exchange_3.f +++ /dev/null @@ -1,312 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine exchange_3(g,iex) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c compute the right hand side based on exact solution -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'applu.incl' - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - double precision g(5,-1:isiz1+2,-1:isiz2+2,isiz3) - integer iex - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, k - integer ipos1, ipos2 - - integer mid - integer STATUS(MPI_STATUS_SIZE) - integer IERROR - - - - if (iex.eq.0) then -c--------------------------------------------------------------------- -c communicate in the south and north directions -c--------------------------------------------------------------------- - if (north.ne.-1) then - call MPI_IRECV( buf1, - > 10*ny*nz, - > dp_type, - > MPI_ANY_SOURCE, - > from_n, - > MPI_COMM_WORLD, - > mid, - > IERROR ) - end if - -c--------------------------------------------------------------------- -c send south -c--------------------------------------------------------------------- - if (south.ne.-1) then - do k = 1,nz - do j = 1,ny - ipos1 = (k-1)*ny + j - ipos2 = ipos1 + ny*nz - buf(1,ipos1) = g(1,nx-1,j,k) - buf(2,ipos1) = g(2,nx-1,j,k) - buf(3,ipos1) = g(3,nx-1,j,k) - buf(4,ipos1) = g(4,nx-1,j,k) - buf(5,ipos1) = g(5,nx-1,j,k) - buf(1,ipos2) = g(1,nx,j,k) - buf(2,ipos2) = g(2,nx,j,k) - buf(3,ipos2) = g(3,nx,j,k) - buf(4,ipos2) = g(4,nx,j,k) - buf(5,ipos2) = g(5,nx,j,k) - end do - end do - - call MPI_SEND( buf, - > 10*ny*nz, - > dp_type, - > south, - > from_n, - > MPI_COMM_WORLD, - > IERROR ) - end if - -c--------------------------------------------------------------------- -c receive from north -c--------------------------------------------------------------------- - if (north.ne.-1) then - call MPI_WAIT( mid, STATUS, IERROR ) - - do k = 1,nz - do j = 1,ny - ipos1 = (k-1)*ny + j - ipos2 = ipos1 + ny*nz - g(1,-1,j,k) = buf1(1,ipos1) - g(2,-1,j,k) = buf1(2,ipos1) - g(3,-1,j,k) = buf1(3,ipos1) - g(4,-1,j,k) = buf1(4,ipos1) - g(5,-1,j,k) = buf1(5,ipos1) - g(1,0,j,k) = buf1(1,ipos2) - g(2,0,j,k) = buf1(2,ipos2) - g(3,0,j,k) = buf1(3,ipos2) - g(4,0,j,k) = buf1(4,ipos2) - g(5,0,j,k) = buf1(5,ipos2) - end do - end do - - end if - - if (south.ne.-1) then - call MPI_IRECV( buf1, - > 10*ny*nz, - > dp_type, - > MPI_ANY_SOURCE, - > from_s, - > MPI_COMM_WORLD, - > mid, - > IERROR ) - end if - -c--------------------------------------------------------------------- -c send north -c--------------------------------------------------------------------- - if (north.ne.-1) then - do k = 1,nz - do j = 1,ny - ipos1 = (k-1)*ny + j - ipos2 = ipos1 + ny*nz - buf(1,ipos1) = g(1,2,j,k) - buf(2,ipos1) = g(2,2,j,k) - buf(3,ipos1) = g(3,2,j,k) - buf(4,ipos1) = g(4,2,j,k) - buf(5,ipos1) = g(5,2,j,k) - buf(1,ipos2) = g(1,1,j,k) - buf(2,ipos2) = g(2,1,j,k) - buf(3,ipos2) = g(3,1,j,k) - buf(4,ipos2) = g(4,1,j,k) - buf(5,ipos2) = g(5,1,j,k) - end do - end do - - call MPI_SEND( buf, - > 10*ny*nz, - > dp_type, - > north, - > from_s, - > MPI_COMM_WORLD, - > IERROR ) - end if - -c--------------------------------------------------------------------- -c receive from south -c--------------------------------------------------------------------- - if (south.ne.-1) then - call MPI_WAIT( mid, STATUS, IERROR ) - - do k = 1,nz - do j = 1,ny - ipos1 = (k-1)*ny + j - ipos2 = ipos1 + ny*nz - g(1,nx+2,j,k) = buf1(1,ipos1) - g(2,nx+2,j,k) = buf1(2,ipos1) - g(3,nx+2,j,k) = buf1(3,ipos1) - g(4,nx+2,j,k) = buf1(4,ipos1) - g(5,nx+2,j,k) = buf1(5,ipos1) - g(1,nx+1,j,k) = buf1(1,ipos2) - g(2,nx+1,j,k) = buf1(2,ipos2) - g(3,nx+1,j,k) = buf1(3,ipos2) - g(4,nx+1,j,k) = buf1(4,ipos2) - g(5,nx+1,j,k) = buf1(5,ipos2) - end do - end do - end if - - else - -c--------------------------------------------------------------------- -c communicate in the east and west directions -c--------------------------------------------------------------------- - if (west.ne.-1) then - call MPI_IRECV( buf1, - > 10*nx*nz, - > dp_type, - > MPI_ANY_SOURCE, - > from_w, - > MPI_COMM_WORLD, - > mid, - > IERROR ) - end if - -c--------------------------------------------------------------------- -c send east -c--------------------------------------------------------------------- - if (east.ne.-1) then - do k = 1,nz - do i = 1,nx - ipos1 = (k-1)*nx + i - ipos2 = ipos1 + nx*nz - buf(1,ipos1) = g(1,i,ny-1,k) - buf(2,ipos1) = g(2,i,ny-1,k) - buf(3,ipos1) = g(3,i,ny-1,k) - buf(4,ipos1) = g(4,i,ny-1,k) - buf(5,ipos1) = g(5,i,ny-1,k) - buf(1,ipos2) = g(1,i,ny,k) - buf(2,ipos2) = g(2,i,ny,k) - buf(3,ipos2) = g(3,i,ny,k) - buf(4,ipos2) = g(4,i,ny,k) - buf(5,ipos2) = g(5,i,ny,k) - end do - end do - - call MPI_SEND( buf, - > 10*nx*nz, - > dp_type, - > east, - > from_w, - > MPI_COMM_WORLD, - > IERROR ) - end if - -c--------------------------------------------------------------------- -c receive from west -c--------------------------------------------------------------------- - if (west.ne.-1) then - call MPI_WAIT( mid, STATUS, IERROR ) - - do k = 1,nz - do i = 1,nx - ipos1 = (k-1)*nx + i - ipos2 = ipos1 + nx*nz - g(1,i,-1,k) = buf1(1,ipos1) - g(2,i,-1,k) = buf1(2,ipos1) - g(3,i,-1,k) = buf1(3,ipos1) - g(4,i,-1,k) = buf1(4,ipos1) - g(5,i,-1,k) = buf1(5,ipos1) - g(1,i,0,k) = buf1(1,ipos2) - g(2,i,0,k) = buf1(2,ipos2) - g(3,i,0,k) = buf1(3,ipos2) - g(4,i,0,k) = buf1(4,ipos2) - g(5,i,0,k) = buf1(5,ipos2) - end do - end do - - end if - - if (east.ne.-1) then - call MPI_IRECV( buf1, - > 10*nx*nz, - > dp_type, - > MPI_ANY_SOURCE, - > from_e, - > MPI_COMM_WORLD, - > mid, - > IERROR ) - end if - -c--------------------------------------------------------------------- -c send west -c--------------------------------------------------------------------- - if (west.ne.-1) then - do k = 1,nz - do i = 1,nx - ipos1 = (k-1)*nx + i - ipos2 = ipos1 + nx*nz - buf(1,ipos1) = g(1,i,2,k) - buf(2,ipos1) = g(2,i,2,k) - buf(3,ipos1) = g(3,i,2,k) - buf(4,ipos1) = g(4,i,2,k) - buf(5,ipos1) = g(5,i,2,k) - buf(1,ipos2) = g(1,i,1,k) - buf(2,ipos2) = g(2,i,1,k) - buf(3,ipos2) = g(3,i,1,k) - buf(4,ipos2) = g(4,i,1,k) - buf(5,ipos2) = g(5,i,1,k) - end do - end do - - call MPI_SEND( buf, - > 10*nx*nz, - > dp_type, - > west, - > from_e, - > MPI_COMM_WORLD, - > IERROR ) - end if - -c--------------------------------------------------------------------- -c receive from east -c--------------------------------------------------------------------- - if (east.ne.-1) then - call MPI_WAIT( mid, STATUS, IERROR ) - - do k = 1,nz - do i = 1,nx - ipos1 = (k-1)*nx + i - ipos2 = ipos1 + nx*nz - g(1,i,ny+2,k) = buf1(1,ipos1) - g(2,i,ny+2,k) = buf1(2,ipos1) - g(3,i,ny+2,k) = buf1(3,ipos1) - g(4,i,ny+2,k) = buf1(4,ipos1) - g(5,i,ny+2,k) = buf1(5,ipos1) - g(1,i,ny+1,k) = buf1(1,ipos2) - g(2,i,ny+1,k) = buf1(2,ipos2) - g(3,i,ny+1,k) = buf1(3,ipos2) - g(4,i,ny+1,k) = buf1(4,ipos2) - g(5,i,ny+1,k) = buf1(5,ipos2) - end do - end do - - end if - - end if - - return - end diff --git a/examples/smpi/NAS/LU/exchange_4.f b/examples/smpi/NAS/LU/exchange_4.f deleted file mode 100644 index 1c4c38e218..0000000000 --- a/examples/smpi/NAS/LU/exchange_4.f +++ /dev/null @@ -1,133 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine exchange_4(g,h,ibeg,ifin1,jbeg,jfin1) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c compute the right hand side based on exact solution -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'applu.incl' - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - double precision g(0:isiz2+1,0:isiz3+1), - > h(0:isiz2+1,0:isiz3+1) - integer ibeg, ifin1 - integer jbeg, jfin1 - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j - integer ny2 - double precision dum(1024) - - integer msgid1, msgid3 - integer STATUS(MPI_STATUS_SIZE) - integer IERROR - - - - ny2 = ny + 2 - -c--------------------------------------------------------------------- -c communicate in the east and west directions -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c receive from east -c--------------------------------------------------------------------- - if (jfin1.eq.ny) then - call MPI_IRECV( dum, - > 2*nx, - > dp_type, - > MPI_ANY_SOURCE, - > from_e, - > MPI_COMM_WORLD, - > msgid3, - > IERROR ) - - call MPI_WAIT( msgid3, STATUS, IERROR ) - - do i = 1,nx - g(i,ny+1) = dum(i) - h(i,ny+1) = dum(i+nx) - end do - - end if - -c--------------------------------------------------------------------- -c send west -c--------------------------------------------------------------------- - if (jbeg.eq.1) then - do i = 1,nx - dum(i) = g(i,1) - dum(i+nx) = h(i,1) - end do - - call MPI_SEND( dum, - > 2*nx, - > dp_type, - > west, - > from_e, - > MPI_COMM_WORLD, - > IERROR ) - - end if - -c--------------------------------------------------------------------- -c communicate in the south and north directions -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c receive from south -c--------------------------------------------------------------------- - if (ifin1.eq.nx) then - call MPI_IRECV( dum, - > 2*ny2, - > dp_type, - > MPI_ANY_SOURCE, - > from_s, - > MPI_COMM_WORLD, - > msgid1, - > IERROR ) - - call MPI_WAIT( msgid1, STATUS, IERROR ) - - do j = 0,ny+1 - g(nx+1,j) = dum(j+1) - h(nx+1,j) = dum(j+ny2+1) - end do - - end if - -c--------------------------------------------------------------------- -c send north -c--------------------------------------------------------------------- - if (ibeg.eq.1) then - do j = 0,ny+1 - dum(j+1) = g(1,j) - dum(j+ny2+1) = h(1,j) - end do - - call MPI_SEND( dum, - > 2*ny2, - > dp_type, - > north, - > from_s, - > MPI_COMM_WORLD, - > IERROR ) - - end if - - return - end diff --git a/examples/smpi/NAS/LU/exchange_5.f b/examples/smpi/NAS/LU/exchange_5.f deleted file mode 100644 index e4cc66f5bf..0000000000 --- a/examples/smpi/NAS/LU/exchange_5.f +++ /dev/null @@ -1,81 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine exchange_5(g,ibeg,ifin1) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c compute the right hand side based on exact solution -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'applu.incl' - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - double precision g(0:isiz2+1,0:isiz3+1) - integer ibeg, ifin1 - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer k - double precision dum(1024) - - integer msgid1 - integer STATUS(MPI_STATUS_SIZE) - integer IERROR - - - -c--------------------------------------------------------------------- -c communicate in the south and north directions -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c receive from south -c--------------------------------------------------------------------- - if (ifin1.eq.nx) then - call MPI_IRECV( dum, - > nz, - > dp_type, - > MPI_ANY_SOURCE, - > from_s, - > MPI_COMM_WORLD, - > msgid1, - > IERROR ) - - call MPI_WAIT( msgid1, STATUS, IERROR ) - - do k = 1,nz - g(nx+1,k) = dum(k) - end do - - end if - -c--------------------------------------------------------------------- -c send north -c--------------------------------------------------------------------- - if (ibeg.eq.1) then - do k = 1,nz - dum(k) = g(1,k) - end do - - call MPI_SEND( dum, - > nz, - > dp_type, - > north, - > from_s, - > MPI_COMM_WORLD, - > IERROR ) - - end if - - return - end diff --git a/examples/smpi/NAS/LU/exchange_6.f b/examples/smpi/NAS/LU/exchange_6.f deleted file mode 100644 index 0626609547..0000000000 --- a/examples/smpi/NAS/LU/exchange_6.f +++ /dev/null @@ -1,81 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine exchange_6(g,jbeg,jfin1) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c compute the right hand side based on exact solution -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'applu.incl' - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - double precision g(0:isiz2+1,0:isiz3+1) - integer jbeg, jfin1 - -c--------------------------------------------------------------------- -c local parameters -c--------------------------------------------------------------------- - integer k - double precision dum(1024) - - integer msgid3 - integer STATUS(MPI_STATUS_SIZE) - integer IERROR - - - -c--------------------------------------------------------------------- -c communicate in the east and west directions -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c receive from east -c--------------------------------------------------------------------- - if (jfin1.eq.ny) then - call MPI_IRECV( dum, - > nz, - > dp_type, - > MPI_ANY_SOURCE, - > from_e, - > MPI_COMM_WORLD, - > msgid3, - > IERROR ) - - call MPI_WAIT( msgid3, STATUS, IERROR ) - - do k = 1,nz - g(ny+1,k) = dum(k) - end do - - end if - -c--------------------------------------------------------------------- -c send west -c--------------------------------------------------------------------- - if (jbeg.eq.1) then - do k = 1,nz - dum(k) = g(1,k) - end do - - call MPI_SEND( dum, - > nz, - > dp_type, - > west, - > from_e, - > MPI_COMM_WORLD, - > IERROR ) - - end if - - return - end diff --git a/examples/smpi/NAS/LU/init_comm.f b/examples/smpi/NAS/LU/init_comm.f deleted file mode 100644 index 72ece00ac4..0000000000 --- a/examples/smpi/NAS/LU/init_comm.f +++ /dev/null @@ -1,57 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine init_comm - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c initialize MPI and establish rank and size -c -c This is a module in the MPI implementation of LUSSOR -c pseudo application from the NAS Parallel Benchmarks. -c -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'applu.incl' - - integer nodedim - integer IERROR - - -c--------------------------------------------------------------------- -c initialize MPI communication -c--------------------------------------------------------------------- - call MPI_INIT( IERROR ) - -c--------------------------------------------------------------------- -c establish the global rank of this process -c--------------------------------------------------------------------- - call MPI_COMM_RANK( MPI_COMM_WORLD, - > id, - > IERROR ) - -c--------------------------------------------------------------------- -c establish the size of the global group -c--------------------------------------------------------------------- - call MPI_COMM_SIZE( MPI_COMM_WORLD, - > num, - > IERROR ) - - ndim = nodedim(num) - - if (.not. convertdouble) then - dp_type = MPI_DOUBLE_PRECISION - else - dp_type = MPI_REAL - endif - - - return - end diff --git a/examples/smpi/NAS/LU/inputlu.data.sample b/examples/smpi/NAS/LU/inputlu.data.sample deleted file mode 100644 index 9ef5a7be00..0000000000 --- a/examples/smpi/NAS/LU/inputlu.data.sample +++ /dev/null @@ -1,24 +0,0 @@ -c -c***controls printing of the progress of iterations: ipr inorm - 1 250 -c -c***the maximum no. of pseudo-time steps to be performed: nitmax - 250 -c -c***magnitude of the time step: dt - 2.0e+00 -c -c***relaxation factor for SSOR iterations: omega - 1.2 -c -c***tolerance levels for steady-state residuals: tolnwt(m),m=1,5 - 1.0e-08 1.0e-08 1.0e-08 1.0e-08 1.0e-08 -c -c***number of grid points in xi and eta and zeta directions: nx ny nz - 64 64 64 -c - - - - - diff --git a/examples/smpi/NAS/LU/jacld.f b/examples/smpi/NAS/LU/jacld.f deleted file mode 100644 index 9580d080ab..0000000000 --- a/examples/smpi/NAS/LU/jacld.f +++ /dev/null @@ -1,384 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine jacld(k) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - -c--------------------------------------------------------------------- -c compute the lower triangular part of the jacobian matrix -c--------------------------------------------------------------------- - - implicit none - - include 'applu.incl' - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - integer k - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j - double precision r43 - double precision c1345 - double precision c34 - double precision tmp1, tmp2, tmp3 - - - - r43 = ( 4.0d+00 / 3.0d+00 ) - c1345 = c1 * c3 * c4 * c5 - c34 = c3 * c4 - - do j = jst, jend - do i = ist, iend - -c--------------------------------------------------------------------- -c form the block daigonal -c--------------------------------------------------------------------- - tmp1 = 1.0d+00 / u(1,i,j,k) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - d(1,1,i,j) = 1.0d+00 - > + dt * 2.0d+00 * ( tx1 * dx1 - > + ty1 * dy1 - > + tz1 * dz1 ) - d(1,2,i,j) = 0.0d+00 - d(1,3,i,j) = 0.0d+00 - d(1,4,i,j) = 0.0d+00 - d(1,5,i,j) = 0.0d+00 - - d(2,1,i,j) = dt * 2.0d+00 - > * ( tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) ) - > + ty1 * ( - c34 * tmp2 * u(2,i,j,k) ) - > + tz1 * ( - c34 * tmp2 * u(2,i,j,k) ) ) - d(2,2,i,j) = 1.0d+00 - > + dt * 2.0d+00 - > * ( tx1 * r43 * c34 * tmp1 - > + ty1 * c34 * tmp1 - > + tz1 * c34 * tmp1 ) - > + dt * 2.0d+00 * ( tx1 * dx2 - > + ty1 * dy2 - > + tz1 * dz2 ) - d(2,3,i,j) = 0.0d+00 - d(2,4,i,j) = 0.0d+00 - d(2,5,i,j) = 0.0d+00 - - d(3,1,i,j) = dt * 2.0d+00 - > * ( tx1 * ( - c34 * tmp2 * u(3,i,j,k) ) - > + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) ) - > + tz1 * ( - c34 * tmp2 * u(3,i,j,k) ) ) - d(3,2,i,j) = 0.0d+00 - d(3,3,i,j) = 1.0d+00 - > + dt * 2.0d+00 - > * ( tx1 * c34 * tmp1 - > + ty1 * r43 * c34 * tmp1 - > + tz1 * c34 * tmp1 ) - > + dt * 2.0d+00 * ( tx1 * dx3 - > + ty1 * dy3 - > + tz1 * dz3 ) - d(3,4,i,j) = 0.0d+00 - d(3,5,i,j) = 0.0d+00 - - d(4,1,i,j) = dt * 2.0d+00 - > * ( tx1 * ( - c34 * tmp2 * u(4,i,j,k) ) - > + ty1 * ( - c34 * tmp2 * u(4,i,j,k) ) - > + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) ) - d(4,2,i,j) = 0.0d+00 - d(4,3,i,j) = 0.0d+00 - d(4,4,i,j) = 1.0d+00 - > + dt * 2.0d+00 - > * ( tx1 * c34 * tmp1 - > + ty1 * c34 * tmp1 - > + tz1 * r43 * c34 * tmp1 ) - > + dt * 2.0d+00 * ( tx1 * dx4 - > + ty1 * dy4 - > + tz1 * dz4 ) - d(4,5,i,j) = 0.0d+00 - - d(5,1,i,j) = dt * 2.0d+00 - > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) - > - ( c1345 ) * tmp2 * u(5,i,j,k) ) - > + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) - > - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) - > - ( c1345 ) * tmp2 * u(5,i,j,k) ) - > + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) - > - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) - > - ( c1345 ) * tmp2 * u(5,i,j,k) ) ) - d(5,2,i,j) = dt * 2.0d+00 - > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k) - > + ty1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) - > + tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) ) - d(5,3,i,j) = dt * 2.0d+00 - > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) - > + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k) - > + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) ) - d(5,4,i,j) = dt * 2.0d+00 - > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k) - > + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k) - > + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) ) - d(5,5,i,j) = 1.0d+00 - > + dt * 2.0d+00 * ( tx1 * c1345 * tmp1 - > + ty1 * c1345 * tmp1 - > + tz1 * c1345 * tmp1 ) - > + dt * 2.0d+00 * ( tx1 * dx5 - > + ty1 * dy5 - > + tz1 * dz5 ) - -c--------------------------------------------------------------------- -c form the first block sub-diagonal -c--------------------------------------------------------------------- - tmp1 = 1.0d+00 / u(1,i,j,k-1) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - a(1,1,i,j) = - dt * tz1 * dz1 - a(1,2,i,j) = 0.0d+00 - a(1,3,i,j) = 0.0d+00 - a(1,4,i,j) = - dt * tz2 - a(1,5,i,j) = 0.0d+00 - - a(2,1,i,j) = - dt * tz2 - > * ( - ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) - > - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k-1) ) - a(2,2,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 ) - > - dt * tz1 * c34 * tmp1 - > - dt * tz1 * dz2 - a(2,3,i,j) = 0.0d+00 - a(2,4,i,j) = - dt * tz2 * ( u(2,i,j,k-1) * tmp1 ) - a(2,5,i,j) = 0.0d+00 - - a(3,1,i,j) = - dt * tz2 - > * ( - ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) - > - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k-1) ) - a(3,2,i,j) = 0.0d+00 - a(3,3,i,j) = - dt * tz2 * ( u(4,i,j,k-1) * tmp1 ) - > - dt * tz1 * ( c34 * tmp1 ) - > - dt * tz1 * dz3 - a(3,4,i,j) = - dt * tz2 * ( u(3,i,j,k-1) * tmp1 ) - a(3,5,i,j) = 0.0d+00 - - a(4,1,i,j) = - dt * tz2 - > * ( - ( u(4,i,j,k-1) * tmp1 ) ** 2 - > + 0.50d+00 * c2 - > * ( ( u(2,i,j,k-1) * u(2,i,j,k-1) - > + u(3,i,j,k-1) * u(3,i,j,k-1) - > + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2 ) ) - > - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k-1) ) - a(4,2,i,j) = - dt * tz2 - > * ( - c2 * ( u(2,i,j,k-1) * tmp1 ) ) - a(4,3,i,j) = - dt * tz2 - > * ( - c2 * ( u(3,i,j,k-1) * tmp1 ) ) - a(4,4,i,j) = - dt * tz2 * ( 2.0d+00 - c2 ) - > * ( u(4,i,j,k-1) * tmp1 ) - > - dt * tz1 * ( r43 * c34 * tmp1 ) - > - dt * tz1 * dz4 - a(4,5,i,j) = - dt * tz2 * c2 - - a(5,1,i,j) = - dt * tz2 - > * ( ( c2 * ( u(2,i,j,k-1) * u(2,i,j,k-1) - > + u(3,i,j,k-1) * u(3,i,j,k-1) - > + u(4,i,j,k-1) * u(4,i,j,k-1) ) * tmp2 - > - c1 * ( u(5,i,j,k-1) * tmp1 ) ) - > * ( u(4,i,j,k-1) * tmp1 ) ) - > - dt * tz1 - > * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k-1)**2) - > - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k-1)**2) - > - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k-1)**2) - > - c1345 * tmp2 * u(5,i,j,k-1) ) - a(5,2,i,j) = - dt * tz2 - > * ( - c2 * ( u(2,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) - > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k-1) - a(5,3,i,j) = - dt * tz2 - > * ( - c2 * ( u(3,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) - > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k-1) - a(5,4,i,j) = - dt * tz2 - > * ( c1 * ( u(5,i,j,k-1) * tmp1 ) - > - 0.50d+00 * c2 - > * ( ( u(2,i,j,k-1)*u(2,i,j,k-1) - > + u(3,i,j,k-1)*u(3,i,j,k-1) - > + 3.0d+00*u(4,i,j,k-1)*u(4,i,j,k-1) ) * tmp2 ) ) - > - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k-1) - a(5,5,i,j) = - dt * tz2 - > * ( c1 * ( u(4,i,j,k-1) * tmp1 ) ) - > - dt * tz1 * c1345 * tmp1 - > - dt * tz1 * dz5 - -c--------------------------------------------------------------------- -c form the second block sub-diagonal -c--------------------------------------------------------------------- - tmp1 = 1.0d+00 / u(1,i,j-1,k) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - b(1,1,i,j) = - dt * ty1 * dy1 - b(1,2,i,j) = 0.0d+00 - b(1,3,i,j) = - dt * ty2 - b(1,4,i,j) = 0.0d+00 - b(1,5,i,j) = 0.0d+00 - - b(2,1,i,j) = - dt * ty2 - > * ( - ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 ) - > - dt * ty1 * ( - c34 * tmp2 * u(2,i,j-1,k) ) - b(2,2,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 ) - > - dt * ty1 * ( c34 * tmp1 ) - > - dt * ty1 * dy2 - b(2,3,i,j) = - dt * ty2 * ( u(2,i,j-1,k) * tmp1 ) - b(2,4,i,j) = 0.0d+00 - b(2,5,i,j) = 0.0d+00 - - b(3,1,i,j) = - dt * ty2 - > * ( - ( u(3,i,j-1,k) * tmp1 ) ** 2 - > + 0.50d+00 * c2 * ( ( u(2,i,j-1,k) * u(2,i,j-1,k) - > + u(3,i,j-1,k) * u(3,i,j-1,k) - > + u(4,i,j-1,k) * u(4,i,j-1,k) ) - > * tmp2 ) ) - > - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j-1,k) ) - b(3,2,i,j) = - dt * ty2 - > * ( - c2 * ( u(2,i,j-1,k) * tmp1 ) ) - b(3,3,i,j) = - dt * ty2 * ( ( 2.0d+00 - c2 ) - > * ( u(3,i,j-1,k) * tmp1 ) ) - > - dt * ty1 * ( r43 * c34 * tmp1 ) - > - dt * ty1 * dy3 - b(3,4,i,j) = - dt * ty2 - > * ( - c2 * ( u(4,i,j-1,k) * tmp1 ) ) - b(3,5,i,j) = - dt * ty2 * c2 - - b(4,1,i,j) = - dt * ty2 - > * ( - ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 ) - > - dt * ty1 * ( - c34 * tmp2 * u(4,i,j-1,k) ) - b(4,2,i,j) = 0.0d+00 - b(4,3,i,j) = - dt * ty2 * ( u(4,i,j-1,k) * tmp1 ) - b(4,4,i,j) = - dt * ty2 * ( u(3,i,j-1,k) * tmp1 ) - > - dt * ty1 * ( c34 * tmp1 ) - > - dt * ty1 * dy4 - b(4,5,i,j) = 0.0d+00 - - b(5,1,i,j) = - dt * ty2 - > * ( ( c2 * ( u(2,i,j-1,k) * u(2,i,j-1,k) - > + u(3,i,j-1,k) * u(3,i,j-1,k) - > + u(4,i,j-1,k) * u(4,i,j-1,k) ) * tmp2 - > - c1 * ( u(5,i,j-1,k) * tmp1 ) ) - > * ( u(3,i,j-1,k) * tmp1 ) ) - > - dt * ty1 - > * ( - ( c34 - c1345 )*tmp3*(u(2,i,j-1,k)**2) - > - ( r43*c34 - c1345 )*tmp3*(u(3,i,j-1,k)**2) - > - ( c34 - c1345 )*tmp3*(u(4,i,j-1,k)**2) - > - c1345*tmp2*u(5,i,j-1,k) ) - b(5,2,i,j) = - dt * ty2 - > * ( - c2 * ( u(2,i,j-1,k)*u(3,i,j-1,k) ) * tmp2 ) - > - dt * ty1 - > * ( c34 - c1345 ) * tmp2 * u(2,i,j-1,k) - b(5,3,i,j) = - dt * ty2 - > * ( c1 * ( u(5,i,j-1,k) * tmp1 ) - > - 0.50d+00 * c2 - > * ( ( u(2,i,j-1,k)*u(2,i,j-1,k) - > + 3.0d+00 * u(3,i,j-1,k)*u(3,i,j-1,k) - > + u(4,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 ) ) - > - dt * ty1 - > * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j-1,k) - b(5,4,i,j) = - dt * ty2 - > * ( - c2 * ( u(3,i,j-1,k)*u(4,i,j-1,k) ) * tmp2 ) - > - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j-1,k) - b(5,5,i,j) = - dt * ty2 - > * ( c1 * ( u(3,i,j-1,k) * tmp1 ) ) - > - dt * ty1 * c1345 * tmp1 - > - dt * ty1 * dy5 - -c--------------------------------------------------------------------- -c form the third block sub-diagonal -c--------------------------------------------------------------------- - tmp1 = 1.0d+00 / u(1,i-1,j,k) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - c(1,1,i,j) = - dt * tx1 * dx1 - c(1,2,i,j) = - dt * tx2 - c(1,3,i,j) = 0.0d+00 - c(1,4,i,j) = 0.0d+00 - c(1,5,i,j) = 0.0d+00 - - c(2,1,i,j) = - dt * tx2 - > * ( - ( u(2,i-1,j,k) * tmp1 ) ** 2 - > + c2 * 0.50d+00 * ( u(2,i-1,j,k) * u(2,i-1,j,k) - > + u(3,i-1,j,k) * u(3,i-1,j,k) - > + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2 ) - > - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i-1,j,k) ) - c(2,2,i,j) = - dt * tx2 - > * ( ( 2.0d+00 - c2 ) * ( u(2,i-1,j,k) * tmp1 ) ) - > - dt * tx1 * ( r43 * c34 * tmp1 ) - > - dt * tx1 * dx2 - c(2,3,i,j) = - dt * tx2 - > * ( - c2 * ( u(3,i-1,j,k) * tmp1 ) ) - c(2,4,i,j) = - dt * tx2 - > * ( - c2 * ( u(4,i-1,j,k) * tmp1 ) ) - c(2,5,i,j) = - dt * tx2 * c2 - - c(3,1,i,j) = - dt * tx2 - > * ( - ( u(2,i-1,j,k) * u(3,i-1,j,k) ) * tmp2 ) - > - dt * tx1 * ( - c34 * tmp2 * u(3,i-1,j,k) ) - c(3,2,i,j) = - dt * tx2 * ( u(3,i-1,j,k) * tmp1 ) - c(3,3,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 ) - > - dt * tx1 * ( c34 * tmp1 ) - > - dt * tx1 * dx3 - c(3,4,i,j) = 0.0d+00 - c(3,5,i,j) = 0.0d+00 - - c(4,1,i,j) = - dt * tx2 - > * ( - ( u(2,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 ) - > - dt * tx1 * ( - c34 * tmp2 * u(4,i-1,j,k) ) - c(4,2,i,j) = - dt * tx2 * ( u(4,i-1,j,k) * tmp1 ) - c(4,3,i,j) = 0.0d+00 - c(4,4,i,j) = - dt * tx2 * ( u(2,i-1,j,k) * tmp1 ) - > - dt * tx1 * ( c34 * tmp1 ) - > - dt * tx1 * dx4 - c(4,5,i,j) = 0.0d+00 - - c(5,1,i,j) = - dt * tx2 - > * ( ( c2 * ( u(2,i-1,j,k) * u(2,i-1,j,k) - > + u(3,i-1,j,k) * u(3,i-1,j,k) - > + u(4,i-1,j,k) * u(4,i-1,j,k) ) * tmp2 - > - c1 * ( u(5,i-1,j,k) * tmp1 ) ) - > * ( u(2,i-1,j,k) * tmp1 ) ) - > - dt * tx1 - > * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i-1,j,k)**2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(3,i-1,j,k)**2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(4,i-1,j,k)**2 ) - > - c1345 * tmp2 * u(5,i-1,j,k) ) - c(5,2,i,j) = - dt * tx2 - > * ( c1 * ( u(5,i-1,j,k) * tmp1 ) - > - 0.50d+00 * c2 - > * ( ( 3.0d+00*u(2,i-1,j,k)*u(2,i-1,j,k) - > + u(3,i-1,j,k)*u(3,i-1,j,k) - > + u(4,i-1,j,k)*u(4,i-1,j,k) ) * tmp2 ) ) - > - dt * tx1 - > * ( r43*c34 - c1345 ) * tmp2 * u(2,i-1,j,k) - c(5,3,i,j) = - dt * tx2 - > * ( - c2 * ( u(3,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 ) - > - dt * tx1 - > * ( c34 - c1345 ) * tmp2 * u(3,i-1,j,k) - c(5,4,i,j) = - dt * tx2 - > * ( - c2 * ( u(4,i-1,j,k)*u(2,i-1,j,k) ) * tmp2 ) - > - dt * tx1 - > * ( c34 - c1345 ) * tmp2 * u(4,i-1,j,k) - c(5,5,i,j) = - dt * tx2 - > * ( c1 * ( u(2,i-1,j,k) * tmp1 ) ) - > - dt * tx1 * c1345 * tmp1 - > - dt * tx1 * dx5 - - end do - end do - - return - end diff --git a/examples/smpi/NAS/LU/jacu.f b/examples/smpi/NAS/LU/jacu.f deleted file mode 100644 index 6a3c5b8ddb..0000000000 --- a/examples/smpi/NAS/LU/jacu.f +++ /dev/null @@ -1,384 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine jacu(k) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c compute the upper triangular part of the jacobian matrix -c--------------------------------------------------------------------- - - - implicit none - - include 'applu.incl' - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - integer k - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j - double precision r43 - double precision c1345 - double precision c34 - double precision tmp1, tmp2, tmp3 - - - - r43 = ( 4.0d+00 / 3.0d+00 ) - c1345 = c1 * c3 * c4 * c5 - c34 = c3 * c4 - - do j = jst, jend - do i = ist, iend - -c--------------------------------------------------------------------- -c form the block daigonal -c--------------------------------------------------------------------- - tmp1 = 1.0d+00 / u(1,i,j,k) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - d(1,1,i,j) = 1.0d+00 - > + dt * 2.0d+00 * ( tx1 * dx1 - > + ty1 * dy1 - > + tz1 * dz1 ) - d(1,2,i,j) = 0.0d+00 - d(1,3,i,j) = 0.0d+00 - d(1,4,i,j) = 0.0d+00 - d(1,5,i,j) = 0.0d+00 - - d(2,1,i,j) = dt * 2.0d+00 - > * ( tx1 * ( - r43 * c34 * tmp2 * u(2,i,j,k) ) - > + ty1 * ( - c34 * tmp2 * u(2,i,j,k) ) - > + tz1 * ( - c34 * tmp2 * u(2,i,j,k) ) ) - d(2,2,i,j) = 1.0d+00 - > + dt * 2.0d+00 - > * ( tx1 * r43 * c34 * tmp1 - > + ty1 * c34 * tmp1 - > + tz1 * c34 * tmp1 ) - > + dt * 2.0d+00 * ( tx1 * dx2 - > + ty1 * dy2 - > + tz1 * dz2 ) - d(2,3,i,j) = 0.0d+00 - d(2,4,i,j) = 0.0d+00 - d(2,5,i,j) = 0.0d+00 - - d(3,1,i,j) = dt * 2.0d+00 - > * ( tx1 * ( - c34 * tmp2 * u(3,i,j,k) ) - > + ty1 * ( - r43 * c34 * tmp2 * u(3,i,j,k) ) - > + tz1 * ( - c34 * tmp2 * u(3,i,j,k) ) ) - d(3,2,i,j) = 0.0d+00 - d(3,3,i,j) = 1.0d+00 - > + dt * 2.0d+00 - > * ( tx1 * c34 * tmp1 - > + ty1 * r43 * c34 * tmp1 - > + tz1 * c34 * tmp1 ) - > + dt * 2.0d+00 * ( tx1 * dx3 - > + ty1 * dy3 - > + tz1 * dz3 ) - d(3,4,i,j) = 0.0d+00 - d(3,5,i,j) = 0.0d+00 - - d(4,1,i,j) = dt * 2.0d+00 - > * ( tx1 * ( - c34 * tmp2 * u(4,i,j,k) ) - > + ty1 * ( - c34 * tmp2 * u(4,i,j,k) ) - > + tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k) ) ) - d(4,2,i,j) = 0.0d+00 - d(4,3,i,j) = 0.0d+00 - d(4,4,i,j) = 1.0d+00 - > + dt * 2.0d+00 - > * ( tx1 * c34 * tmp1 - > + ty1 * c34 * tmp1 - > + tz1 * r43 * c34 * tmp1 ) - > + dt * 2.0d+00 * ( tx1 * dx4 - > + ty1 * dy4 - > + tz1 * dz4 ) - d(4,5,i,j) = 0.0d+00 - - d(5,1,i,j) = dt * 2.0d+00 - > * ( tx1 * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) - > - ( c1345 ) * tmp2 * u(5,i,j,k) ) - > + ty1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) - > - ( r43*c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) - > - ( c1345 ) * tmp2 * u(5,i,j,k) ) - > + tz1 * ( - ( c34 - c1345 ) * tmp3 * ( u(2,i,j,k) ** 2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(3,i,j,k) ** 2 ) - > - ( r43*c34 - c1345 ) * tmp3 * ( u(4,i,j,k) ** 2 ) - > - ( c1345 ) * tmp2 * u(5,i,j,k) ) ) - d(5,2,i,j) = dt * 2.0d+00 - > * ( tx1 * ( r43*c34 - c1345 ) * tmp2 * u(2,i,j,k) - > + ty1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) - > + tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k) ) - d(5,3,i,j) = dt * 2.0d+00 - > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) - > + ty1 * ( r43*c34 -c1345 ) * tmp2 * u(3,i,j,k) - > + tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k) ) - d(5,4,i,j) = dt * 2.0d+00 - > * ( tx1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k) - > + ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j,k) - > + tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k) ) - d(5,5,i,j) = 1.0d+00 - > + dt * 2.0d+00 * ( tx1 * c1345 * tmp1 - > + ty1 * c1345 * tmp1 - > + tz1 * c1345 * tmp1 ) - > + dt * 2.0d+00 * ( tx1 * dx5 - > + ty1 * dy5 - > + tz1 * dz5 ) - -c--------------------------------------------------------------------- -c form the first block sub-diagonal -c--------------------------------------------------------------------- - tmp1 = 1.0d+00 / u(1,i+1,j,k) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - a(1,1,i,j) = - dt * tx1 * dx1 - a(1,2,i,j) = dt * tx2 - a(1,3,i,j) = 0.0d+00 - a(1,4,i,j) = 0.0d+00 - a(1,5,i,j) = 0.0d+00 - - a(2,1,i,j) = dt * tx2 - > * ( - ( u(2,i+1,j,k) * tmp1 ) ** 2 - > + c2 * 0.50d+00 * ( u(2,i+1,j,k) * u(2,i+1,j,k) - > + u(3,i+1,j,k) * u(3,i+1,j,k) - > + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2 ) - > - dt * tx1 * ( - r43 * c34 * tmp2 * u(2,i+1,j,k) ) - a(2,2,i,j) = dt * tx2 - > * ( ( 2.0d+00 - c2 ) * ( u(2,i+1,j,k) * tmp1 ) ) - > - dt * tx1 * ( r43 * c34 * tmp1 ) - > - dt * tx1 * dx2 - a(2,3,i,j) = dt * tx2 - > * ( - c2 * ( u(3,i+1,j,k) * tmp1 ) ) - a(2,4,i,j) = dt * tx2 - > * ( - c2 * ( u(4,i+1,j,k) * tmp1 ) ) - a(2,5,i,j) = dt * tx2 * c2 - - a(3,1,i,j) = dt * tx2 - > * ( - ( u(2,i+1,j,k) * u(3,i+1,j,k) ) * tmp2 ) - > - dt * tx1 * ( - c34 * tmp2 * u(3,i+1,j,k) ) - a(3,2,i,j) = dt * tx2 * ( u(3,i+1,j,k) * tmp1 ) - a(3,3,i,j) = dt * tx2 * ( u(2,i+1,j,k) * tmp1 ) - > - dt * tx1 * ( c34 * tmp1 ) - > - dt * tx1 * dx3 - a(3,4,i,j) = 0.0d+00 - a(3,5,i,j) = 0.0d+00 - - a(4,1,i,j) = dt * tx2 - > * ( - ( u(2,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 ) - > - dt * tx1 * ( - c34 * tmp2 * u(4,i+1,j,k) ) - a(4,2,i,j) = dt * tx2 * ( u(4,i+1,j,k) * tmp1 ) - a(4,3,i,j) = 0.0d+00 - a(4,4,i,j) = dt * tx2 * ( u(2,i+1,j,k) * tmp1 ) - > - dt * tx1 * ( c34 * tmp1 ) - > - dt * tx1 * dx4 - a(4,5,i,j) = 0.0d+00 - - a(5,1,i,j) = dt * tx2 - > * ( ( c2 * ( u(2,i+1,j,k) * u(2,i+1,j,k) - > + u(3,i+1,j,k) * u(3,i+1,j,k) - > + u(4,i+1,j,k) * u(4,i+1,j,k) ) * tmp2 - > - c1 * ( u(5,i+1,j,k) * tmp1 ) ) - > * ( u(2,i+1,j,k) * tmp1 ) ) - > - dt * tx1 - > * ( - ( r43*c34 - c1345 ) * tmp3 * ( u(2,i+1,j,k)**2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(3,i+1,j,k)**2 ) - > - ( c34 - c1345 ) * tmp3 * ( u(4,i+1,j,k)**2 ) - > - c1345 * tmp2 * u(5,i+1,j,k) ) - a(5,2,i,j) = dt * tx2 - > * ( c1 * ( u(5,i+1,j,k) * tmp1 ) - > - 0.50d+00 * c2 - > * ( ( 3.0d+00*u(2,i+1,j,k)*u(2,i+1,j,k) - > + u(3,i+1,j,k)*u(3,i+1,j,k) - > + u(4,i+1,j,k)*u(4,i+1,j,k) ) * tmp2 ) ) - > - dt * tx1 - > * ( r43*c34 - c1345 ) * tmp2 * u(2,i+1,j,k) - a(5,3,i,j) = dt * tx2 - > * ( - c2 * ( u(3,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 ) - > - dt * tx1 - > * ( c34 - c1345 ) * tmp2 * u(3,i+1,j,k) - a(5,4,i,j) = dt * tx2 - > * ( - c2 * ( u(4,i+1,j,k)*u(2,i+1,j,k) ) * tmp2 ) - > - dt * tx1 - > * ( c34 - c1345 ) * tmp2 * u(4,i+1,j,k) - a(5,5,i,j) = dt * tx2 - > * ( c1 * ( u(2,i+1,j,k) * tmp1 ) ) - > - dt * tx1 * c1345 * tmp1 - > - dt * tx1 * dx5 - -c--------------------------------------------------------------------- -c form the second block sub-diagonal -c--------------------------------------------------------------------- - tmp1 = 1.0d+00 / u(1,i,j+1,k) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - b(1,1,i,j) = - dt * ty1 * dy1 - b(1,2,i,j) = 0.0d+00 - b(1,3,i,j) = dt * ty2 - b(1,4,i,j) = 0.0d+00 - b(1,5,i,j) = 0.0d+00 - - b(2,1,i,j) = dt * ty2 - > * ( - ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 ) - > - dt * ty1 * ( - c34 * tmp2 * u(2,i,j+1,k) ) - b(2,2,i,j) = dt * ty2 * ( u(3,i,j+1,k) * tmp1 ) - > - dt * ty1 * ( c34 * tmp1 ) - > - dt * ty1 * dy2 - b(2,3,i,j) = dt * ty2 * ( u(2,i,j+1,k) * tmp1 ) - b(2,4,i,j) = 0.0d+00 - b(2,5,i,j) = 0.0d+00 - - b(3,1,i,j) = dt * ty2 - > * ( - ( u(3,i,j+1,k) * tmp1 ) ** 2 - > + 0.50d+00 * c2 * ( ( u(2,i,j+1,k) * u(2,i,j+1,k) - > + u(3,i,j+1,k) * u(3,i,j+1,k) - > + u(4,i,j+1,k) * u(4,i,j+1,k) ) - > * tmp2 ) ) - > - dt * ty1 * ( - r43 * c34 * tmp2 * u(3,i,j+1,k) ) - b(3,2,i,j) = dt * ty2 - > * ( - c2 * ( u(2,i,j+1,k) * tmp1 ) ) - b(3,3,i,j) = dt * ty2 * ( ( 2.0d+00 - c2 ) - > * ( u(3,i,j+1,k) * tmp1 ) ) - > - dt * ty1 * ( r43 * c34 * tmp1 ) - > - dt * ty1 * dy3 - b(3,4,i,j) = dt * ty2 - > * ( - c2 * ( u(4,i,j+1,k) * tmp1 ) ) - b(3,5,i,j) = dt * ty2 * c2 - - b(4,1,i,j) = dt * ty2 - > * ( - ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 ) - > - dt * ty1 * ( - c34 * tmp2 * u(4,i,j+1,k) ) - b(4,2,i,j) = 0.0d+00 - b(4,3,i,j) = dt * ty2 * ( u(4,i,j+1,k) * tmp1 ) - b(4,4,i,j) = dt * ty2 * ( u(3,i,j+1,k) * tmp1 ) - > - dt * ty1 * ( c34 * tmp1 ) - > - dt * ty1 * dy4 - b(4,5,i,j) = 0.0d+00 - - b(5,1,i,j) = dt * ty2 - > * ( ( c2 * ( u(2,i,j+1,k) * u(2,i,j+1,k) - > + u(3,i,j+1,k) * u(3,i,j+1,k) - > + u(4,i,j+1,k) * u(4,i,j+1,k) ) * tmp2 - > - c1 * ( u(5,i,j+1,k) * tmp1 ) ) - > * ( u(3,i,j+1,k) * tmp1 ) ) - > - dt * ty1 - > * ( - ( c34 - c1345 )*tmp3*(u(2,i,j+1,k)**2) - > - ( r43*c34 - c1345 )*tmp3*(u(3,i,j+1,k)**2) - > - ( c34 - c1345 )*tmp3*(u(4,i,j+1,k)**2) - > - c1345*tmp2*u(5,i,j+1,k) ) - b(5,2,i,j) = dt * ty2 - > * ( - c2 * ( u(2,i,j+1,k)*u(3,i,j+1,k) ) * tmp2 ) - > - dt * ty1 - > * ( c34 - c1345 ) * tmp2 * u(2,i,j+1,k) - b(5,3,i,j) = dt * ty2 - > * ( c1 * ( u(5,i,j+1,k) * tmp1 ) - > - 0.50d+00 * c2 - > * ( ( u(2,i,j+1,k)*u(2,i,j+1,k) - > + 3.0d+00 * u(3,i,j+1,k)*u(3,i,j+1,k) - > + u(4,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 ) ) - > - dt * ty1 - > * ( r43*c34 - c1345 ) * tmp2 * u(3,i,j+1,k) - b(5,4,i,j) = dt * ty2 - > * ( - c2 * ( u(3,i,j+1,k)*u(4,i,j+1,k) ) * tmp2 ) - > - dt * ty1 * ( c34 - c1345 ) * tmp2 * u(4,i,j+1,k) - b(5,5,i,j) = dt * ty2 - > * ( c1 * ( u(3,i,j+1,k) * tmp1 ) ) - > - dt * ty1 * c1345 * tmp1 - > - dt * ty1 * dy5 - -c--------------------------------------------------------------------- -c form the third block sub-diagonal -c--------------------------------------------------------------------- - tmp1 = 1.0d+00 / u(1,i,j,k+1) - tmp2 = tmp1 * tmp1 - tmp3 = tmp1 * tmp2 - - c(1,1,i,j) = - dt * tz1 * dz1 - c(1,2,i,j) = 0.0d+00 - c(1,3,i,j) = 0.0d+00 - c(1,4,i,j) = dt * tz2 - c(1,5,i,j) = 0.0d+00 - - c(2,1,i,j) = dt * tz2 - > * ( - ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) - > - dt * tz1 * ( - c34 * tmp2 * u(2,i,j,k+1) ) - c(2,2,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 ) - > - dt * tz1 * c34 * tmp1 - > - dt * tz1 * dz2 - c(2,3,i,j) = 0.0d+00 - c(2,4,i,j) = dt * tz2 * ( u(2,i,j,k+1) * tmp1 ) - c(2,5,i,j) = 0.0d+00 - - c(3,1,i,j) = dt * tz2 - > * ( - ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) - > - dt * tz1 * ( - c34 * tmp2 * u(3,i,j,k+1) ) - c(3,2,i,j) = 0.0d+00 - c(3,3,i,j) = dt * tz2 * ( u(4,i,j,k+1) * tmp1 ) - > - dt * tz1 * ( c34 * tmp1 ) - > - dt * tz1 * dz3 - c(3,4,i,j) = dt * tz2 * ( u(3,i,j,k+1) * tmp1 ) - c(3,5,i,j) = 0.0d+00 - - c(4,1,i,j) = dt * tz2 - > * ( - ( u(4,i,j,k+1) * tmp1 ) ** 2 - > + 0.50d+00 * c2 - > * ( ( u(2,i,j,k+1) * u(2,i,j,k+1) - > + u(3,i,j,k+1) * u(3,i,j,k+1) - > + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2 ) ) - > - dt * tz1 * ( - r43 * c34 * tmp2 * u(4,i,j,k+1) ) - c(4,2,i,j) = dt * tz2 - > * ( - c2 * ( u(2,i,j,k+1) * tmp1 ) ) - c(4,3,i,j) = dt * tz2 - > * ( - c2 * ( u(3,i,j,k+1) * tmp1 ) ) - c(4,4,i,j) = dt * tz2 * ( 2.0d+00 - c2 ) - > * ( u(4,i,j,k+1) * tmp1 ) - > - dt * tz1 * ( r43 * c34 * tmp1 ) - > - dt * tz1 * dz4 - c(4,5,i,j) = dt * tz2 * c2 - - c(5,1,i,j) = dt * tz2 - > * ( ( c2 * ( u(2,i,j,k+1) * u(2,i,j,k+1) - > + u(3,i,j,k+1) * u(3,i,j,k+1) - > + u(4,i,j,k+1) * u(4,i,j,k+1) ) * tmp2 - > - c1 * ( u(5,i,j,k+1) * tmp1 ) ) - > * ( u(4,i,j,k+1) * tmp1 ) ) - > - dt * tz1 - > * ( - ( c34 - c1345 ) * tmp3 * (u(2,i,j,k+1)**2) - > - ( c34 - c1345 ) * tmp3 * (u(3,i,j,k+1)**2) - > - ( r43*c34 - c1345 )* tmp3 * (u(4,i,j,k+1)**2) - > - c1345 * tmp2 * u(5,i,j,k+1) ) - c(5,2,i,j) = dt * tz2 - > * ( - c2 * ( u(2,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) - > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(2,i,j,k+1) - c(5,3,i,j) = dt * tz2 - > * ( - c2 * ( u(3,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) - > - dt * tz1 * ( c34 - c1345 ) * tmp2 * u(3,i,j,k+1) - c(5,4,i,j) = dt * tz2 - > * ( c1 * ( u(5,i,j,k+1) * tmp1 ) - > - 0.50d+00 * c2 - > * ( ( u(2,i,j,k+1)*u(2,i,j,k+1) - > + u(3,i,j,k+1)*u(3,i,j,k+1) - > + 3.0d+00*u(4,i,j,k+1)*u(4,i,j,k+1) ) * tmp2 ) ) - > - dt * tz1 * ( r43*c34 - c1345 ) * tmp2 * u(4,i,j,k+1) - c(5,5,i,j) = dt * tz2 - > * ( c1 * ( u(4,i,j,k+1) * tmp1 ) ) - > - dt * tz1 * c1345 * tmp1 - > - dt * tz1 * dz5 - - end do - end do - - return - end diff --git a/examples/smpi/NAS/LU/l2norm.f b/examples/smpi/NAS/LU/l2norm.f deleted file mode 100644 index 147b21deb4..0000000000 --- a/examples/smpi/NAS/LU/l2norm.f +++ /dev/null @@ -1,68 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - subroutine l2norm ( ldx, ldy, ldz, - > nx0, ny0, nz0, - > ist, iend, - > jst, jend, - > v, sum ) -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c to compute the l2-norm of vector v. -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - integer ldx, ldy, ldz - integer nx0, ny0, nz0 - integer ist, iend - integer jst, jend - double precision v(5,-1:ldx+2,-1:ldy+2,*), sum(5) - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, k, m - double precision dummy(5) - - integer IERROR - - - do m = 1, 5 - dummy(m) = 0.0d+00 - end do - - do k = 2, nz0-1 - do j = jst, jend - do i = ist, iend - do m = 1, 5 - dummy(m) = dummy(m) + v(m,i,j,k) * v(m,i,j,k) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c compute the global sum of individual contributions to dot product. -c--------------------------------------------------------------------- - call MPI_ALLREDUCE( dummy, - > sum, - > 5, - > dp_type, - > MPI_SUM, - > MPI_COMM_WORLD, - > IERROR ) - - do m = 1, 5 - sum(m) = sqrt ( sum(m) / ( (nx0-2)*(ny0-2)*(nz0-2) ) ) - end do - - return - end diff --git a/examples/smpi/NAS/LU/lu.f b/examples/smpi/NAS/LU/lu.f deleted file mode 100644 index 543463a07b..0000000000 --- a/examples/smpi/NAS/LU/lu.f +++ /dev/null @@ -1,164 +0,0 @@ -!-------------------------------------------------------------------------! -! ! -! N A S P A R A L L E L B E N C H M A R K S 3.3 ! -! ! -! L U ! -! ! -!-------------------------------------------------------------------------! -! ! -! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! -! It is described in NAS Technical Reports 95-020 and 02-007 ! -! ! -! Permission to use, copy, distribute and modify this software ! -! for any purpose with or without fee is hereby granted. We ! -! request, however, that all derived work reference the NAS ! -! Parallel Benchmarks 3.3. This software is provided "as is" ! -! without express or implied warranty. ! -! ! -! Information on NPB 3.3, including the technical report, the ! -! original specifications, source code, results and information ! -! on how to submit new results, is available at: ! -! ! -! http://www.nas.nasa.gov/Software/NPB/ ! -! ! -! Send comments or suggestions to npb@nas.nasa.gov ! -! ! -! NAS Parallel Benchmarks Group ! -! NASA Ames Research Center ! -! Mail Stop: T27A-1 ! -! Moffett Field, CA 94035-1000 ! -! ! -! E-mail: npb@nas.nasa.gov ! -! Fax: (650) 604-3957 ! -! ! -!-------------------------------------------------------------------------! - -c--------------------------------------------------------------------- -c -c Authors: S. Weeratunga -c V. Venkatakrishnan -c E. Barszcz -c M. Yarrow -c -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- - program applu -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c driver for the performance evaluation of the solver for -c five coupled parabolic/elliptic partial differential equations. -c -c--------------------------------------------------------------------- - - implicit none - - include 'applu.incl' - character class - logical verified - double precision mflops - integer ierr - -c--------------------------------------------------------------------- -c initialize communications -c--------------------------------------------------------------------- - call init_comm() - -c--------------------------------------------------------------------- -c read input data -c--------------------------------------------------------------------- - call read_input() - -c--------------------------------------------------------------------- -c set up processor grid -c--------------------------------------------------------------------- - call proc_grid() - -c--------------------------------------------------------------------- -c determine the neighbors -c--------------------------------------------------------------------- - call neighbors() - -c--------------------------------------------------------------------- -c set up sub-domain sizes -c--------------------------------------------------------------------- - call subdomain() - -c--------------------------------------------------------------------- -c set up coefficients -c--------------------------------------------------------------------- - call setcoeff() - -c--------------------------------------------------------------------- -c set the masks required for comm -c--------------------------------------------------------------------- - call sethyper() - -c--------------------------------------------------------------------- -c set the boundary values for dependent variables -c--------------------------------------------------------------------- - call setbv() - -c--------------------------------------------------------------------- -c set the initial values for dependent variables -c--------------------------------------------------------------------- - call setiv() - -c--------------------------------------------------------------------- -c compute the forcing term based on prescribed exact solution -c--------------------------------------------------------------------- - call erhs() - -c--------------------------------------------------------------------- -c perform one SSOR iteration to touch all data and program pages -c--------------------------------------------------------------------- - call ssor(1) - -c--------------------------------------------------------------------- -c reset the boundary and initial values -c--------------------------------------------------------------------- - call setbv() - call setiv() - -c--------------------------------------------------------------------- -c perform the SSOR iterations -c--------------------------------------------------------------------- - call ssor(itmax) - -c--------------------------------------------------------------------- -c compute the solution error -c--------------------------------------------------------------------- - call error() - -c--------------------------------------------------------------------- -c compute the surface integral -c--------------------------------------------------------------------- - call pintgr() - -c--------------------------------------------------------------------- -c verification test -c--------------------------------------------------------------------- - IF (id.eq.0) THEN - call verify ( rsdnm, errnm, frc, class, verified ) - mflops = float(itmax)*(1984.77*float( nx0 ) - > *float( ny0 ) - > *float( nz0 ) - > -10923.3*(float( nx0+ny0+nz0 )/3.)**2 - > +27770.9* float( nx0+ny0+nz0 )/3. - > -144010.) - > / (maxtime*1000000.) - - call print_results('LU', class, nx0, - > ny0, nz0, itmax, nnodes_compiled, - > num, maxtime, mflops, ' floating point', verified, - > npbversion, compiletime, cs1, cs2, cs3, cs4, cs5, cs6, - > '(none)') - - END IF - - call mpi_finalize(ierr) - end - - diff --git a/examples/smpi/NAS/LU/mpinpb.h b/examples/smpi/NAS/LU/mpinpb.h deleted file mode 100644 index ddbf1515f0..0000000000 --- a/examples/smpi/NAS/LU/mpinpb.h +++ /dev/null @@ -1,11 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'mpif.h' - - integer node, no_nodes, root, comm_setup, - > comm_solve, comm_rhs, dp_type - common /mpistuff/ node, no_nodes, root, comm_setup, - > comm_solve, comm_rhs, dp_type - diff --git a/examples/smpi/NAS/LU/neighbors.f b/examples/smpi/NAS/LU/neighbors.f deleted file mode 100644 index ed8a3126af..0000000000 --- a/examples/smpi/NAS/LU/neighbors.f +++ /dev/null @@ -1,48 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine neighbors () - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'applu.incl' - -c--------------------------------------------------------------------- -c figure out the neighbors and their wrap numbers for each processor -c--------------------------------------------------------------------- - - south = -1 - east = -1 - north = -1 - west = -1 - - if (row.gt.1) then - north = id -1 - else - north = -1 - end if - - if (row.lt.xdim) then - south = id + 1 - else - south = -1 - end if - - if (col.gt.1) then - west = id- xdim - else - west = -1 - end if - - if (col.lt.ydim) then - east = id + xdim - else - east = -1 - end if - - return - end diff --git a/examples/smpi/NAS/LU/nodedim.f b/examples/smpi/NAS/LU/nodedim.f deleted file mode 100644 index f4def3a01e..0000000000 --- a/examples/smpi/NAS/LU/nodedim.f +++ /dev/null @@ -1,36 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - integer function nodedim(num) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c compute the exponent where num = 2**nodedim -c NOTE: assumes a power-of-two number of nodes -c -c--------------------------------------------------------------------- - - implicit none - -c--------------------------------------------------------------------- -c input parameters -c--------------------------------------------------------------------- - integer num - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - double precision fnum - - - fnum = dble(num) - nodedim = log(fnum)/log(2.0d+0) + 0.00001 - - return - end - - diff --git a/examples/smpi/NAS/LU/pintgr.f b/examples/smpi/NAS/LU/pintgr.f deleted file mode 100644 index de514cccd2..0000000000 --- a/examples/smpi/NAS/LU/pintgr.f +++ /dev/null @@ -1,288 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine pintgr - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, k - integer ibeg, ifin, ifin1 - integer jbeg, jfin, jfin1 - integer iglob, iglob1, iglob2 - integer jglob, jglob1, jglob2 - integer ind1, ind2 - double precision phi1(0:isiz2+1,0:isiz3+1), - > phi2(0:isiz2+1,0:isiz3+1) - double precision frc1, frc2, frc3 - double precision dummy - - integer IERROR - - -c--------------------------------------------------------------------- -c set up the sub-domains for integeration in each processor -c--------------------------------------------------------------------- - ibeg = nx + 1 - ifin = 0 - iglob1 = ipt + 1 - iglob2 = ipt + nx - if (iglob1.ge.ii1.and.iglob2.lt.ii2+nx) ibeg = 1 - if (iglob1.gt.ii1-nx.and.iglob2.le.ii2) ifin = nx - if (ii1.ge.iglob1.and.ii1.le.iglob2) ibeg = ii1 - ipt - if (ii2.ge.iglob1.and.ii2.le.iglob2) ifin = ii2 - ipt - jbeg = ny + 1 - jfin = 0 - jglob1 = jpt + 1 - jglob2 = jpt + ny - if (jglob1.ge.ji1.and.jglob2.lt.ji2+ny) jbeg = 1 - if (jglob1.gt.ji1-ny.and.jglob2.le.ji2) jfin = ny - if (ji1.ge.jglob1.and.ji1.le.jglob2) jbeg = ji1 - jpt - if (ji2.ge.jglob1.and.ji2.le.jglob2) jfin = ji2 - jpt - ifin1 = ifin - jfin1 = jfin - if (ipt + ifin1.eq.ii2) ifin1 = ifin -1 - if (jpt + jfin1.eq.ji2) jfin1 = jfin -1 - -c--------------------------------------------------------------------- -c initialize -c--------------------------------------------------------------------- - do i = 0,isiz2+1 - do k = 0,isiz3+1 - phi1(i,k) = 0. - phi2(i,k) = 0. - end do - end do - - do j = jbeg,jfin - jglob = jpt + j - do i = ibeg,ifin - iglob = ipt + i - - k = ki1 - - phi1(i,j) = c2*( u(5,i,j,k) - > - 0.50d+00 * ( u(2,i,j,k) ** 2 - > + u(3,i,j,k) ** 2 - > + u(4,i,j,k) ** 2 ) - > / u(1,i,j,k) ) - - k = ki2 - - phi2(i,j) = c2*( u(5,i,j,k) - > - 0.50d+00 * ( u(2,i,j,k) ** 2 - > + u(3,i,j,k) ** 2 - > + u(4,i,j,k) ** 2 ) - > / u(1,i,j,k) ) - end do - end do - -c--------------------------------------------------------------------- -c communicate in i and j directions -c--------------------------------------------------------------------- - call exchange_4(phi1,phi2,ibeg,ifin1,jbeg,jfin1) - - frc1 = 0.0d+00 - - do j = jbeg,jfin1 - do i = ibeg, ifin1 - frc1 = frc1 + ( phi1(i,j) - > + phi1(i+1,j) - > + phi1(i,j+1) - > + phi1(i+1,j+1) - > + phi2(i,j) - > + phi2(i+1,j) - > + phi2(i,j+1) - > + phi2(i+1,j+1) ) - end do - end do - -c--------------------------------------------------------------------- -c compute the global sum of individual contributions to frc1 -c--------------------------------------------------------------------- - dummy = frc1 - call MPI_ALLREDUCE( dummy, - > frc1, - > 1, - > dp_type, - > MPI_SUM, - > MPI_COMM_WORLD, - > IERROR ) - - frc1 = dxi * deta * frc1 - -c--------------------------------------------------------------------- -c initialize -c--------------------------------------------------------------------- - do i = 0,isiz2+1 - do k = 0,isiz3+1 - phi1(i,k) = 0. - phi2(i,k) = 0. - end do - end do - jglob = jpt + jbeg - ind1 = 0 - if (jglob.eq.ji1) then - ind1 = 1 - do k = ki1, ki2 - do i = ibeg, ifin - iglob = ipt + i - phi1(i,k) = c2*( u(5,i,jbeg,k) - > - 0.50d+00 * ( u(2,i,jbeg,k) ** 2 - > + u(3,i,jbeg,k) ** 2 - > + u(4,i,jbeg,k) ** 2 ) - > / u(1,i,jbeg,k) ) - end do - end do - end if - - jglob = jpt + jfin - ind2 = 0 - if (jglob.eq.ji2) then - ind2 = 1 - do k = ki1, ki2 - do i = ibeg, ifin - iglob = ipt + i - phi2(i,k) = c2*( u(5,i,jfin,k) - > - 0.50d+00 * ( u(2,i,jfin,k) ** 2 - > + u(3,i,jfin,k) ** 2 - > + u(4,i,jfin,k) ** 2 ) - > / u(1,i,jfin,k) ) - end do - end do - end if - -c--------------------------------------------------------------------- -c communicate in i direction -c--------------------------------------------------------------------- - if (ind1.eq.1) then - call exchange_5(phi1,ibeg,ifin1) - end if - if (ind2.eq.1) then - call exchange_5 (phi2,ibeg,ifin1) - end if - - frc2 = 0.0d+00 - do k = ki1, ki2-1 - do i = ibeg, ifin1 - frc2 = frc2 + ( phi1(i,k) - > + phi1(i+1,k) - > + phi1(i,k+1) - > + phi1(i+1,k+1) - > + phi2(i,k) - > + phi2(i+1,k) - > + phi2(i,k+1) - > + phi2(i+1,k+1) ) - end do - end do - -c--------------------------------------------------------------------- -c compute the global sum of individual contributions to frc2 -c--------------------------------------------------------------------- - dummy = frc2 - call MPI_ALLREDUCE( dummy, - > frc2, - > 1, - > dp_type, - > MPI_SUM, - > MPI_COMM_WORLD, - > IERROR ) - - frc2 = dxi * dzeta * frc2 - -c--------------------------------------------------------------------- -c initialize -c--------------------------------------------------------------------- - do i = 0,isiz2+1 - do k = 0,isiz3+1 - phi1(i,k) = 0. - phi2(i,k) = 0. - end do - end do - iglob = ipt + ibeg - ind1 = 0 - if (iglob.eq.ii1) then - ind1 = 1 - do k = ki1, ki2 - do j = jbeg, jfin - jglob = jpt + j - phi1(j,k) = c2*( u(5,ibeg,j,k) - > - 0.50d+00 * ( u(2,ibeg,j,k) ** 2 - > + u(3,ibeg,j,k) ** 2 - > + u(4,ibeg,j,k) ** 2 ) - > / u(1,ibeg,j,k) ) - end do - end do - end if - - iglob = ipt + ifin - ind2 = 0 - if (iglob.eq.ii2) then - ind2 = 1 - do k = ki1, ki2 - do j = jbeg, jfin - jglob = jpt + j - phi2(j,k) = c2*( u(5,ifin,j,k) - > - 0.50d+00 * ( u(2,ifin,j,k) ** 2 - > + u(3,ifin,j,k) ** 2 - > + u(4,ifin,j,k) ** 2 ) - > / u(1,ifin,j,k) ) - end do - end do - end if - -c--------------------------------------------------------------------- -c communicate in j direction -c--------------------------------------------------------------------- - if (ind1.eq.1) then - call exchange_6(phi1,jbeg,jfin1) - end if - if (ind2.eq.1) then - call exchange_6(phi2,jbeg,jfin1) - end if - - frc3 = 0.0d+00 - - do k = ki1, ki2-1 - do j = jbeg, jfin1 - frc3 = frc3 + ( phi1(j,k) - > + phi1(j+1,k) - > + phi1(j,k+1) - > + phi1(j+1,k+1) - > + phi2(j,k) - > + phi2(j+1,k) - > + phi2(j,k+1) - > + phi2(j+1,k+1) ) - end do - end do - -c--------------------------------------------------------------------- -c compute the global sum of individual contributions to frc3 -c--------------------------------------------------------------------- - dummy = frc3 - call MPI_ALLREDUCE( dummy, - > frc3, - > 1, - > dp_type, - > MPI_SUM, - > MPI_COMM_WORLD, - > IERROR ) - - frc3 = deta * dzeta * frc3 - frc = 0.25d+00 * ( frc1 + frc2 + frc3 ) -c if (id.eq.0) write (*,1001) frc - - return - - 1001 format (//5x,'surface integral = ',1pe12.5//) - - end diff --git a/examples/smpi/NAS/LU/proc_grid.f b/examples/smpi/NAS/LU/proc_grid.f deleted file mode 100644 index 40271c135e..0000000000 --- a/examples/smpi/NAS/LU/proc_grid.f +++ /dev/null @@ -1,36 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine proc_grid - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c set up a two-d grid for processors: column-major ordering of unknowns -c NOTE: assumes a power-of-two number of processors -c -c--------------------------------------------------------------------- - - xdim = 2**(ndim/2) - if (mod(ndim,2).eq.1) xdim = xdim + xdim - ydim = num/xdim - - row = mod(id,xdim) + 1 - col = id/xdim + 1 - - - return - end - - diff --git a/examples/smpi/NAS/LU/read_input.f b/examples/smpi/NAS/LU/read_input.f deleted file mode 100644 index b2e5ff1e79..0000000000 --- a/examples/smpi/NAS/LU/read_input.f +++ /dev/null @@ -1,127 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine read_input - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'applu.incl' - - integer IERROR, fstatus, nnodes - - -c--------------------------------------------------------------------- -c only root reads the input file -c if input file does not exist, it uses defaults -c ipr = 1 for detailed progress output -c inorm = how often the norm is printed (once every inorm iterations) -c itmax = number of pseudo time steps -c dt = time step -c omega 1 over-relaxation factor for SSOR -c tolrsd = steady state residual tolerance levels -c nx, ny, nz = number of grid points in x, y, z directions -c--------------------------------------------------------------------- - ROOT = 0 - if (id .eq. ROOT) then - - write(*, 1000) - - open (unit=3,file='inputlu.data',status='old', - > access='sequential',form='formatted', iostat=fstatus) - if (fstatus .eq. 0) then - - write(*, *) 'Reading from input file inputlu.data' - - read (3,*) - read (3,*) - read (3,*) ipr, inorm - read (3,*) - read (3,*) - read (3,*) itmax - read (3,*) - read (3,*) - read (3,*) dt - read (3,*) - read (3,*) - read (3,*) omega - read (3,*) - read (3,*) - read (3,*) tolrsd(1),tolrsd(2),tolrsd(3),tolrsd(4),tolrsd(5) - read (3,*) - read (3,*) - read (3,*) nx0, ny0, nz0 - close(3) - else - ipr = ipr_default - inorm = inorm_default - itmax = itmax_default - dt = dt_default - omega = omega_default - tolrsd(1) = tolrsd1_def - tolrsd(2) = tolrsd2_def - tolrsd(3) = tolrsd3_def - tolrsd(4) = tolrsd4_def - tolrsd(5) = tolrsd5_def - nx0 = isiz01 - ny0 = isiz02 - nz0 = isiz03 - endif - -c--------------------------------------------------------------------- -c check problem size -c--------------------------------------------------------------------- - call MPI_COMM_SIZE(MPI_COMM_WORLD, nnodes, ierror) - if (nnodes .ne. nnodes_compiled) then - write (*, 2000) nnodes, nnodes_compiled - 2000 format (5x,'Warning: program is running on',i3,' processors' - > /5x,'but was compiled for ', i3) - endif - - if ( ( nx0 .lt. 4 ) .or. - > ( ny0 .lt. 4 ) .or. - > ( nz0 .lt. 4 ) ) then - - write (*,2001) - 2001 format (5x,'PROBLEM SIZE IS TOO SMALL - ', - > /5x,'SET EACH OF NX, NY AND NZ AT LEAST EQUAL TO 5') - CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR ) - - end if - - if ( ( nx0 .gt. isiz01 ) .or. - > ( ny0 .gt. isiz02 ) .or. - > ( nz0 .gt. isiz03 ) ) then - - write (*,2002) - 2002 format (5x,'PROBLEM SIZE IS TOO LARGE - ', - > /5x,'NX, NY AND NZ SHOULD BE LESS THAN OR EQUAL TO ', - > /5x,'ISIZ01, ISIZ02 AND ISIZ03 RESPECTIVELY') - CALL MPI_ABORT( MPI_COMM_WORLD, MPI_ERR_OTHER, IERROR ) - - end if - - - write(*, 1001) nx0, ny0, nz0 - write(*, 1002) itmax - write(*, 1003) nnodes - - 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- LU Benchmark',/) - 1001 format(' Size: ', i4, 'x', i4, 'x', i4) - 1002 format(' Iterations: ', i4) - 1003 format(' Number of processes: ', i5, /) - - - - end if - - call bcast_inputs - - return - end - - diff --git a/examples/smpi/NAS/LU/rhs.f b/examples/smpi/NAS/LU/rhs.f deleted file mode 100644 index 3da39117a4..0000000000 --- a/examples/smpi/NAS/LU/rhs.f +++ /dev/null @@ -1,504 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine rhs - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c compute the right hand sides -c--------------------------------------------------------------------- - - implicit none - - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, k, m - integer iex - integer L1, L2 - integer ist1, iend1 - integer jst1, jend1 - double precision q - double precision u21, u31, u41 - double precision tmp - double precision u21i, u31i, u41i, u51i - double precision u21j, u31j, u41j, u51j - double precision u21k, u31k, u41k, u51k - double precision u21im1, u31im1, u41im1, u51im1 - double precision u21jm1, u31jm1, u41jm1, u51jm1 - double precision u21km1, u31km1, u41km1, u51km1 - - - - do k = 1, nz - do j = 1, ny - do i = 1, nx - do m = 1, 5 - rsd(m,i,j,k) = - frct(m,i,j,k) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c xi-direction flux differences -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c iex = flag : iex = 0 north/south communication -c : iex = 1 east/west communication -c--------------------------------------------------------------------- - iex = 0 - -c--------------------------------------------------------------------- -c communicate and receive/send two rows of data -c--------------------------------------------------------------------- - call exchange_3(u,iex) - - L1 = 0 - if (north.eq.-1) L1 = 1 - L2 = nx + 1 - if (south.eq.-1) L2 = nx - - ist1 = 1 - iend1 = nx - if (north.eq.-1) ist1 = 4 - if (south.eq.-1) iend1 = nx - 3 - - do k = 2, nz - 1 - do j = jst, jend - do i = L1, L2 - flux(1,i,j,k) = u(2,i,j,k) - u21 = u(2,i,j,k) / u(1,i,j,k) - - q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k) - > + u(3,i,j,k) * u(3,i,j,k) - > + u(4,i,j,k) * u(4,i,j,k) ) - > / u(1,i,j,k) - - flux(2,i,j,k) = u(2,i,j,k) * u21 + c2 * - > ( u(5,i,j,k) - q ) - flux(3,i,j,k) = u(3,i,j,k) * u21 - flux(4,i,j,k) = u(4,i,j,k) * u21 - flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u21 - end do - - do i = ist, iend - do m = 1, 5 - rsd(m,i,j,k) = rsd(m,i,j,k) - > - tx2 * ( flux(m,i+1,j,k) - flux(m,i-1,j,k) ) - end do - end do - - do i = ist, L2 - tmp = 1.0d+00 / u(1,i,j,k) - - u21i = tmp * u(2,i,j,k) - u31i = tmp * u(3,i,j,k) - u41i = tmp * u(4,i,j,k) - u51i = tmp * u(5,i,j,k) - - tmp = 1.0d+00 / u(1,i-1,j,k) - - u21im1 = tmp * u(2,i-1,j,k) - u31im1 = tmp * u(3,i-1,j,k) - u41im1 = tmp * u(4,i-1,j,k) - u51im1 = tmp * u(5,i-1,j,k) - - flux(2,i,j,k) = (4.0d+00/3.0d+00) * tx3 * (u21i-u21im1) - flux(3,i,j,k) = tx3 * ( u31i - u31im1 ) - flux(4,i,j,k) = tx3 * ( u41i - u41im1 ) - flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) - > * tx3 * ( ( u21i **2 + u31i **2 + u41i **2 ) - > - ( u21im1**2 + u31im1**2 + u41im1**2 ) ) - > + (1.0d+00/6.0d+00) - > * tx3 * ( u21i**2 - u21im1**2 ) - > + c1 * c5 * tx3 * ( u51i - u51im1 ) - end do - - do i = ist, iend - rsd(1,i,j,k) = rsd(1,i,j,k) - > + dx1 * tx1 * ( u(1,i-1,j,k) - > - 2.0d+00 * u(1,i,j,k) - > + u(1,i+1,j,k) ) - rsd(2,i,j,k) = rsd(2,i,j,k) - > + tx3 * c3 * c4 * ( flux(2,i+1,j,k) - flux(2,i,j,k) ) - > + dx2 * tx1 * ( u(2,i-1,j,k) - > - 2.0d+00 * u(2,i,j,k) - > + u(2,i+1,j,k) ) - rsd(3,i,j,k) = rsd(3,i,j,k) - > + tx3 * c3 * c4 * ( flux(3,i+1,j,k) - flux(3,i,j,k) ) - > + dx3 * tx1 * ( u(3,i-1,j,k) - > - 2.0d+00 * u(3,i,j,k) - > + u(3,i+1,j,k) ) - rsd(4,i,j,k) = rsd(4,i,j,k) - > + tx3 * c3 * c4 * ( flux(4,i+1,j,k) - flux(4,i,j,k) ) - > + dx4 * tx1 * ( u(4,i-1,j,k) - > - 2.0d+00 * u(4,i,j,k) - > + u(4,i+1,j,k) ) - rsd(5,i,j,k) = rsd(5,i,j,k) - > + tx3 * c3 * c4 * ( flux(5,i+1,j,k) - flux(5,i,j,k) ) - > + dx5 * tx1 * ( u(5,i-1,j,k) - > - 2.0d+00 * u(5,i,j,k) - > + u(5,i+1,j,k) ) - end do - -c--------------------------------------------------------------------- -c Fourth-order dissipation -c--------------------------------------------------------------------- - IF (north.eq.-1) then - do m = 1, 5 - rsd(m,2,j,k) = rsd(m,2,j,k) - > - dssp * ( + 5.0d+00 * u(m,2,j,k) - > - 4.0d+00 * u(m,3,j,k) - > + u(m,4,j,k) ) - rsd(m,3,j,k) = rsd(m,3,j,k) - > - dssp * ( - 4.0d+00 * u(m,2,j,k) - > + 6.0d+00 * u(m,3,j,k) - > - 4.0d+00 * u(m,4,j,k) - > + u(m,5,j,k) ) - end do - END IF - - do i = ist1,iend1 - do m = 1, 5 - rsd(m,i,j,k) = rsd(m,i,j,k) - > - dssp * ( u(m,i-2,j,k) - > - 4.0d+00 * u(m,i-1,j,k) - > + 6.0d+00 * u(m,i,j,k) - > - 4.0d+00 * u(m,i+1,j,k) - > + u(m,i+2,j,k) ) - end do - end do - - IF (south.eq.-1) then - do m = 1, 5 - rsd(m,nx-2,j,k) = rsd(m,nx-2,j,k) - > - dssp * ( u(m,nx-4,j,k) - > - 4.0d+00 * u(m,nx-3,j,k) - > + 6.0d+00 * u(m,nx-2,j,k) - > - 4.0d+00 * u(m,nx-1,j,k) ) - rsd(m,nx-1,j,k) = rsd(m,nx-1,j,k) - > - dssp * ( u(m,nx-3,j,k) - > - 4.0d+00 * u(m,nx-2,j,k) - > + 5.0d+00 * u(m,nx-1,j,k) ) - end do - END IF - - end do - end do - -c--------------------------------------------------------------------- -c eta-direction flux differences -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c iex = flag : iex = 0 north/south communication -c--------------------------------------------------------------------- - iex = 1 - -c--------------------------------------------------------------------- -c communicate and receive/send two rows of data -c--------------------------------------------------------------------- - call exchange_3(u,iex) - - L1 = 0 - if (west.eq.-1) L1 = 1 - L2 = ny + 1 - if (east.eq.-1) L2 = ny - - jst1 = 1 - jend1 = ny - if (west.eq.-1) jst1 = 4 - if (east.eq.-1) jend1 = ny - 3 - - do k = 2, nz - 1 - do j = L1, L2 - do i = ist, iend - flux(1,i,j,k) = u(3,i,j,k) - u31 = u(3,i,j,k) / u(1,i,j,k) - - q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k) - > + u(3,i,j,k) * u(3,i,j,k) - > + u(4,i,j,k) * u(4,i,j,k) ) - > / u(1,i,j,k) - - flux(2,i,j,k) = u(2,i,j,k) * u31 - flux(3,i,j,k) = u(3,i,j,k) * u31 + c2 * (u(5,i,j,k)-q) - flux(4,i,j,k) = u(4,i,j,k) * u31 - flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u31 - end do - end do - - do j = jst, jend - do i = ist, iend - do m = 1, 5 - rsd(m,i,j,k) = rsd(m,i,j,k) - > - ty2 * ( flux(m,i,j+1,k) - flux(m,i,j-1,k) ) - end do - end do - end do - - do j = jst, L2 - do i = ist, iend - tmp = 1.0d+00 / u(1,i,j,k) - - u21j = tmp * u(2,i,j,k) - u31j = tmp * u(3,i,j,k) - u41j = tmp * u(4,i,j,k) - u51j = tmp * u(5,i,j,k) - - tmp = 1.0d+00 / u(1,i,j-1,k) - u21jm1 = tmp * u(2,i,j-1,k) - u31jm1 = tmp * u(3,i,j-1,k) - u41jm1 = tmp * u(4,i,j-1,k) - u51jm1 = tmp * u(5,i,j-1,k) - - flux(2,i,j,k) = ty3 * ( u21j - u21jm1 ) - flux(3,i,j,k) = (4.0d+00/3.0d+00) * ty3 * (u31j-u31jm1) - flux(4,i,j,k) = ty3 * ( u41j - u41jm1 ) - flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) - > * ty3 * ( ( u21j **2 + u31j **2 + u41j **2 ) - > - ( u21jm1**2 + u31jm1**2 + u41jm1**2 ) ) - > + (1.0d+00/6.0d+00) - > * ty3 * ( u31j**2 - u31jm1**2 ) - > + c1 * c5 * ty3 * ( u51j - u51jm1 ) - end do - end do - - do j = jst, jend - do i = ist, iend - - rsd(1,i,j,k) = rsd(1,i,j,k) - > + dy1 * ty1 * ( u(1,i,j-1,k) - > - 2.0d+00 * u(1,i,j,k) - > + u(1,i,j+1,k) ) - - rsd(2,i,j,k) = rsd(2,i,j,k) - > + ty3 * c3 * c4 * ( flux(2,i,j+1,k) - flux(2,i,j,k) ) - > + dy2 * ty1 * ( u(2,i,j-1,k) - > - 2.0d+00 * u(2,i,j,k) - > + u(2,i,j+1,k) ) - - rsd(3,i,j,k) = rsd(3,i,j,k) - > + ty3 * c3 * c4 * ( flux(3,i,j+1,k) - flux(3,i,j,k) ) - > + dy3 * ty1 * ( u(3,i,j-1,k) - > - 2.0d+00 * u(3,i,j,k) - > + u(3,i,j+1,k) ) - - rsd(4,i,j,k) = rsd(4,i,j,k) - > + ty3 * c3 * c4 * ( flux(4,i,j+1,k) - flux(4,i,j,k) ) - > + dy4 * ty1 * ( u(4,i,j-1,k) - > - 2.0d+00 * u(4,i,j,k) - > + u(4,i,j+1,k) ) - - rsd(5,i,j,k) = rsd(5,i,j,k) - > + ty3 * c3 * c4 * ( flux(5,i,j+1,k) - flux(5,i,j,k) ) - > + dy5 * ty1 * ( u(5,i,j-1,k) - > - 2.0d+00 * u(5,i,j,k) - > + u(5,i,j+1,k) ) - - end do - end do - -c--------------------------------------------------------------------- -c fourth-order dissipation -c--------------------------------------------------------------------- - IF (west.eq.-1) then - do i = ist, iend - do m = 1, 5 - rsd(m,i,2,k) = rsd(m,i,2,k) - > - dssp * ( + 5.0d+00 * u(m,i,2,k) - > - 4.0d+00 * u(m,i,3,k) - > + u(m,i,4,k) ) - rsd(m,i,3,k) = rsd(m,i,3,k) - > - dssp * ( - 4.0d+00 * u(m,i,2,k) - > + 6.0d+00 * u(m,i,3,k) - > - 4.0d+00 * u(m,i,4,k) - > + u(m,i,5,k) ) - end do - end do - END IF - - do j = jst1, jend1 - do i = ist, iend - do m = 1, 5 - rsd(m,i,j,k) = rsd(m,i,j,k) - > - dssp * ( u(m,i,j-2,k) - > - 4.0d+00 * u(m,i,j-1,k) - > + 6.0d+00 * u(m,i,j,k) - > - 4.0d+00 * u(m,i,j+1,k) - > + u(m,i,j+2,k) ) - end do - end do - end do - - IF (east.eq.-1) then - do i = ist, iend - do m = 1, 5 - rsd(m,i,ny-2,k) = rsd(m,i,ny-2,k) - > - dssp * ( u(m,i,ny-4,k) - > - 4.0d+00 * u(m,i,ny-3,k) - > + 6.0d+00 * u(m,i,ny-2,k) - > - 4.0d+00 * u(m,i,ny-1,k) ) - rsd(m,i,ny-1,k) = rsd(m,i,ny-1,k) - > - dssp * ( u(m,i,ny-3,k) - > - 4.0d+00 * u(m,i,ny-2,k) - > + 5.0d+00 * u(m,i,ny-1,k) ) - end do - end do - END IF - - end do - -c--------------------------------------------------------------------- -c zeta-direction flux differences -c--------------------------------------------------------------------- - do k = 1, nz - do j = jst, jend - do i = ist, iend - flux(1,i,j,k) = u(4,i,j,k) - u41 = u(4,i,j,k) / u(1,i,j,k) - - q = 0.50d+00 * ( u(2,i,j,k) * u(2,i,j,k) - > + u(3,i,j,k) * u(3,i,j,k) - > + u(4,i,j,k) * u(4,i,j,k) ) - > / u(1,i,j,k) - - flux(2,i,j,k) = u(2,i,j,k) * u41 - flux(3,i,j,k) = u(3,i,j,k) * u41 - flux(4,i,j,k) = u(4,i,j,k) * u41 + c2 * (u(5,i,j,k)-q) - flux(5,i,j,k) = ( c1 * u(5,i,j,k) - c2 * q ) * u41 - end do - end do - end do - - do k = 2, nz - 1 - do j = jst, jend - do i = ist, iend - do m = 1, 5 - rsd(m,i,j,k) = rsd(m,i,j,k) - > - tz2 * ( flux(m,i,j,k+1) - flux(m,i,j,k-1) ) - end do - end do - end do - end do - - do k = 2, nz - do j = jst, jend - do i = ist, iend - tmp = 1.0d+00 / u(1,i,j,k) - - u21k = tmp * u(2,i,j,k) - u31k = tmp * u(3,i,j,k) - u41k = tmp * u(4,i,j,k) - u51k = tmp * u(5,i,j,k) - - tmp = 1.0d+00 / u(1,i,j,k-1) - - u21km1 = tmp * u(2,i,j,k-1) - u31km1 = tmp * u(3,i,j,k-1) - u41km1 = tmp * u(4,i,j,k-1) - u51km1 = tmp * u(5,i,j,k-1) - - flux(2,i,j,k) = tz3 * ( u21k - u21km1 ) - flux(3,i,j,k) = tz3 * ( u31k - u31km1 ) - flux(4,i,j,k) = (4.0d+00/3.0d+00) * tz3 * (u41k-u41km1) - flux(5,i,j,k) = 0.50d+00 * ( 1.0d+00 - c1*c5 ) - > * tz3 * ( ( u21k **2 + u31k **2 + u41k **2 ) - > - ( u21km1**2 + u31km1**2 + u41km1**2 ) ) - > + (1.0d+00/6.0d+00) - > * tz3 * ( u41k**2 - u41km1**2 ) - > + c1 * c5 * tz3 * ( u51k - u51km1 ) - end do - end do - end do - - do k = 2, nz - 1 - do j = jst, jend - do i = ist, iend - rsd(1,i,j,k) = rsd(1,i,j,k) - > + dz1 * tz1 * ( u(1,i,j,k-1) - > - 2.0d+00 * u(1,i,j,k) - > + u(1,i,j,k+1) ) - rsd(2,i,j,k) = rsd(2,i,j,k) - > + tz3 * c3 * c4 * ( flux(2,i,j,k+1) - flux(2,i,j,k) ) - > + dz2 * tz1 * ( u(2,i,j,k-1) - > - 2.0d+00 * u(2,i,j,k) - > + u(2,i,j,k+1) ) - rsd(3,i,j,k) = rsd(3,i,j,k) - > + tz3 * c3 * c4 * ( flux(3,i,j,k+1) - flux(3,i,j,k) ) - > + dz3 * tz1 * ( u(3,i,j,k-1) - > - 2.0d+00 * u(3,i,j,k) - > + u(3,i,j,k+1) ) - rsd(4,i,j,k) = rsd(4,i,j,k) - > + tz3 * c3 * c4 * ( flux(4,i,j,k+1) - flux(4,i,j,k) ) - > + dz4 * tz1 * ( u(4,i,j,k-1) - > - 2.0d+00 * u(4,i,j,k) - > + u(4,i,j,k+1) ) - rsd(5,i,j,k) = rsd(5,i,j,k) - > + tz3 * c3 * c4 * ( flux(5,i,j,k+1) - flux(5,i,j,k) ) - > + dz5 * tz1 * ( u(5,i,j,k-1) - > - 2.0d+00 * u(5,i,j,k) - > + u(5,i,j,k+1) ) - end do - end do - end do - -c--------------------------------------------------------------------- -c fourth-order dissipation -c--------------------------------------------------------------------- - do j = jst, jend - do i = ist, iend - do m = 1, 5 - rsd(m,i,j,2) = rsd(m,i,j,2) - > - dssp * ( + 5.0d+00 * u(m,i,j,2) - > - 4.0d+00 * u(m,i,j,3) - > + u(m,i,j,4) ) - rsd(m,i,j,3) = rsd(m,i,j,3) - > - dssp * ( - 4.0d+00 * u(m,i,j,2) - > + 6.0d+00 * u(m,i,j,3) - > - 4.0d+00 * u(m,i,j,4) - > + u(m,i,j,5) ) - end do - end do - end do - - do k = 4, nz - 3 - do j = jst, jend - do i = ist, iend - do m = 1, 5 - rsd(m,i,j,k) = rsd(m,i,j,k) - > - dssp * ( u(m,i,j,k-2) - > - 4.0d+00 * u(m,i,j,k-1) - > + 6.0d+00 * u(m,i,j,k) - > - 4.0d+00 * u(m,i,j,k+1) - > + u(m,i,j,k+2) ) - end do - end do - end do - end do - - do j = jst, jend - do i = ist, iend - do m = 1, 5 - rsd(m,i,j,nz-2) = rsd(m,i,j,nz-2) - > - dssp * ( u(m,i,j,nz-4) - > - 4.0d+00 * u(m,i,j,nz-3) - > + 6.0d+00 * u(m,i,j,nz-2) - > - 4.0d+00 * u(m,i,j,nz-1) ) - rsd(m,i,j,nz-1) = rsd(m,i,j,nz-1) - > - dssp * ( u(m,i,j,nz-3) - > - 4.0d+00 * u(m,i,j,nz-2) - > + 5.0d+00 * u(m,i,j,nz-1) ) - end do - end do - end do - - return - end diff --git a/examples/smpi/NAS/LU/setbv.f b/examples/smpi/NAS/LU/setbv.f deleted file mode 100644 index 56b0edf967..0000000000 --- a/examples/smpi/NAS/LU/setbv.f +++ /dev/null @@ -1,79 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine setbv - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c set the boundary values of dependent variables -c--------------------------------------------------------------------- - - implicit none - - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, k - integer iglob, jglob - -c--------------------------------------------------------------------- -c set the dependent variable values along the top and bottom faces -c--------------------------------------------------------------------- - do j = 1, ny - jglob = jpt + j - do i = 1, nx - iglob = ipt + i - call exact( iglob, jglob, 1, u( 1, i, j, 1 ) ) - call exact( iglob, jglob, nz, u( 1, i, j, nz ) ) - end do - end do - -c--------------------------------------------------------------------- -c set the dependent variable values along north and south faces -c--------------------------------------------------------------------- - IF (west.eq.-1) then - do k = 1, nz - do i = 1, nx - iglob = ipt + i - call exact( iglob, 1, k, u( 1, i, 1, k ) ) - end do - end do - END IF - - IF (east.eq.-1) then - do k = 1, nz - do i = 1, nx - iglob = ipt + i - call exact( iglob, ny0, k, u( 1, i, ny, k ) ) - end do - end do - END IF - -c--------------------------------------------------------------------- -c set the dependent variable values along east and west faces -c--------------------------------------------------------------------- - IF (north.eq.-1) then - do k = 1, nz - do j = 1, ny - jglob = jpt + j - call exact( 1, jglob, k, u( 1, 1, j, k ) ) - end do - end do - END IF - - IF (south.eq.-1) then - do k = 1, nz - do j = 1, ny - jglob = jpt + j - call exact( nx0, jglob, k, u( 1, nx, j, k ) ) - end do - end do - END IF - - return - end diff --git a/examples/smpi/NAS/LU/setcoeff.f b/examples/smpi/NAS/LU/setcoeff.f deleted file mode 100644 index 8fc5c1840c..0000000000 --- a/examples/smpi/NAS/LU/setcoeff.f +++ /dev/null @@ -1,159 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine setcoeff - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - - -c--------------------------------------------------------------------- -c set up coefficients -c--------------------------------------------------------------------- - dxi = 1.0d+00 / ( nx0 - 1 ) - deta = 1.0d+00 / ( ny0 - 1 ) - dzeta = 1.0d+00 / ( nz0 - 1 ) - - tx1 = 1.0d+00 / ( dxi * dxi ) - tx2 = 1.0d+00 / ( 2.0d+00 * dxi ) - tx3 = 1.0d+00 / dxi - - ty1 = 1.0d+00 / ( deta * deta ) - ty2 = 1.0d+00 / ( 2.0d+00 * deta ) - ty3 = 1.0d+00 / deta - - tz1 = 1.0d+00 / ( dzeta * dzeta ) - tz2 = 1.0d+00 / ( 2.0d+00 * dzeta ) - tz3 = 1.0d+00 / dzeta - - ii1 = 2 - ii2 = nx0 - 1 - ji1 = 2 - ji2 = ny0 - 2 - ki1 = 3 - ki2 = nz0 - 1 - -c--------------------------------------------------------------------- -c diffusion coefficients -c--------------------------------------------------------------------- - dx1 = 0.75d+00 - dx2 = dx1 - dx3 = dx1 - dx4 = dx1 - dx5 = dx1 - - dy1 = 0.75d+00 - dy2 = dy1 - dy3 = dy1 - dy4 = dy1 - dy5 = dy1 - - dz1 = 1.00d+00 - dz2 = dz1 - dz3 = dz1 - dz4 = dz1 - dz5 = dz1 - -c--------------------------------------------------------------------- -c fourth difference dissipation -c--------------------------------------------------------------------- - dssp = ( max (dx1, dy1, dz1 ) ) / 4.0d+00 - -c--------------------------------------------------------------------- -c coefficients of the exact solution to the first pde -c--------------------------------------------------------------------- - ce(1,1) = 2.0d+00 - ce(1,2) = 0.0d+00 - ce(1,3) = 0.0d+00 - ce(1,4) = 4.0d+00 - ce(1,5) = 5.0d+00 - ce(1,6) = 3.0d+00 - ce(1,7) = 5.0d-01 - ce(1,8) = 2.0d-02 - ce(1,9) = 1.0d-02 - ce(1,10) = 3.0d-02 - ce(1,11) = 5.0d-01 - ce(1,12) = 4.0d-01 - ce(1,13) = 3.0d-01 - -c--------------------------------------------------------------------- -c coefficients of the exact solution to the second pde -c--------------------------------------------------------------------- - ce(2,1) = 1.0d+00 - ce(2,2) = 0.0d+00 - ce(2,3) = 0.0d+00 - ce(2,4) = 0.0d+00 - ce(2,5) = 1.0d+00 - ce(2,6) = 2.0d+00 - ce(2,7) = 3.0d+00 - ce(2,8) = 1.0d-02 - ce(2,9) = 3.0d-02 - ce(2,10) = 2.0d-02 - ce(2,11) = 4.0d-01 - ce(2,12) = 3.0d-01 - ce(2,13) = 5.0d-01 - -c--------------------------------------------------------------------- -c coefficients of the exact solution to the third pde -c--------------------------------------------------------------------- - ce(3,1) = 2.0d+00 - ce(3,2) = 2.0d+00 - ce(3,3) = 0.0d+00 - ce(3,4) = 0.0d+00 - ce(3,5) = 0.0d+00 - ce(3,6) = 2.0d+00 - ce(3,7) = 3.0d+00 - ce(3,8) = 4.0d-02 - ce(3,9) = 3.0d-02 - ce(3,10) = 5.0d-02 - ce(3,11) = 3.0d-01 - ce(3,12) = 5.0d-01 - ce(3,13) = 4.0d-01 - -c--------------------------------------------------------------------- -c coefficients of the exact solution to the fourth pde -c--------------------------------------------------------------------- - ce(4,1) = 2.0d+00 - ce(4,2) = 2.0d+00 - ce(4,3) = 0.0d+00 - ce(4,4) = 0.0d+00 - ce(4,5) = 0.0d+00 - ce(4,6) = 2.0d+00 - ce(4,7) = 3.0d+00 - ce(4,8) = 3.0d-02 - ce(4,9) = 5.0d-02 - ce(4,10) = 4.0d-02 - ce(4,11) = 2.0d-01 - ce(4,12) = 1.0d-01 - ce(4,13) = 3.0d-01 - -c--------------------------------------------------------------------- -c coefficients of the exact solution to the fifth pde -c--------------------------------------------------------------------- - ce(5,1) = 5.0d+00 - ce(5,2) = 4.0d+00 - ce(5,3) = 3.0d+00 - ce(5,4) = 2.0d+00 - ce(5,5) = 1.0d-01 - ce(5,6) = 4.0d-01 - ce(5,7) = 3.0d-01 - ce(5,8) = 5.0d-02 - ce(5,9) = 4.0d-02 - ce(5,10) = 3.0d-02 - ce(5,11) = 1.0d-01 - ce(5,12) = 3.0d-01 - ce(5,13) = 2.0d-01 - - return - end - - diff --git a/examples/smpi/NAS/LU/sethyper.f b/examples/smpi/NAS/LU/sethyper.f deleted file mode 100644 index 15245d4c21..0000000000 --- a/examples/smpi/NAS/LU/sethyper.f +++ /dev/null @@ -1,94 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine sethyper - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c for each column in a hyperplane, istart = first row, -c--------------------------------------------------------------------- - - implicit none - - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j - integer iglob, jglob - integer kp - -c--------------------------------------------------------------------- -c compute the pointers for hyperplanes -c--------------------------------------------------------------------- - do kp = 2,nx0+ny0 - icomms(kp) = .false. - icommn(kp) = .false. - icomme(kp) = .false. - icommw(kp) = .false. - -c--------------------------------------------------------------------- -c check to see if comm. to south is required -c--------------------------------------------------------------------- - if (south.ne.-1) then - i = iend - iglob = ipt + i - jglob = kp - iglob - j = jglob - jpt - if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and. - > j.le.jend) icomms(kp) = .true. - end if - -c--------------------------------------------------------------------- -c check to see if comm. to north is required -c--------------------------------------------------------------------- - if (north.ne.-1) then - i = ist - iglob = ipt + i - jglob = kp - iglob - j = jglob - jpt - if (jglob.ge.2.and.jglob.le.ny0-1.and.j.ge.jst.and. - > j.le.jend) icommn(kp) = .true. - end if - -c--------------------------------------------------------------------- -c check to see if comm. to east is required -c--------------------------------------------------------------------- - if (east.ne.-1) then - j = jend - jglob = jpt + j - iglob = kp - jglob - i = iglob - ipt - if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and. - > i.le.iend) icomme(kp) = .true. - end if - -c--------------------------------------------------------------------- -c check to see if comm. to west is required -c--------------------------------------------------------------------- - if (west.ne.-1) then - j = jst - jglob = jpt + j - iglob = kp - jglob - i = iglob - ipt - if (iglob.ge.2.and.iglob.le.nx0-1.and.i.ge.ist.and. - > i.le.iend) icommw(kp) = .true. - end if - - end do - - icomms(1) = .false. - icommn(1) = .false. - icomme(1) = .false. - icommw(1) = .false. - icomms(nx0+ny0+1) = .false. - icommn(nx0+ny0+1) = .false. - icomme(nx0+ny0+1) = .false. - icommw(nx0+ny0+1) = .false. - - return - end diff --git a/examples/smpi/NAS/LU/setiv.f b/examples/smpi/NAS/LU/setiv.f deleted file mode 100644 index 73725cbe5c..0000000000 --- a/examples/smpi/NAS/LU/setiv.f +++ /dev/null @@ -1,67 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - subroutine setiv - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c -c set the initial values of independent variables based on tri-linear -c interpolation of boundary values in the computational space. -c -c--------------------------------------------------------------------- - - implicit none - - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, k, m - integer iglob, jglob - double precision xi, eta, zeta - double precision pxi, peta, pzeta - double precision ue_1jk(5),ue_nx0jk(5),ue_i1k(5), - > ue_iny0k(5),ue_ij1(5),ue_ijnz(5) - - - do k = 2, nz - 1 - zeta = ( dble (k-1) ) / (nz-1) - do j = 1, ny - jglob = jpt + j - IF (jglob.ne.1.and.jglob.ne.ny0) then - eta = ( dble (jglob-1) ) / (ny0-1) - do i = 1, nx - iglob = ipt + i - IF (iglob.ne.1.and.iglob.ne.nx0) then - xi = ( dble (iglob-1) ) / (nx0-1) - call exact (1,jglob,k,ue_1jk) - call exact (nx0,jglob,k,ue_nx0jk) - call exact (iglob,1,k,ue_i1k) - call exact (iglob,ny0,k,ue_iny0k) - call exact (iglob,jglob,1,ue_ij1) - call exact (iglob,jglob,nz,ue_ijnz) - do m = 1, 5 - pxi = ( 1.0d+00 - xi ) * ue_1jk(m) - > + xi * ue_nx0jk(m) - peta = ( 1.0d+00 - eta ) * ue_i1k(m) - > + eta * ue_iny0k(m) - pzeta = ( 1.0d+00 - zeta ) * ue_ij1(m) - > + zeta * ue_ijnz(m) - - u( m, i, j, k ) = pxi + peta + pzeta - > - pxi * peta - peta * pzeta - pzeta * pxi - > + pxi * peta * pzeta - - end do - END IF - end do - END IF - end do - end do - - return - end diff --git a/examples/smpi/NAS/LU/ssor.f b/examples/smpi/NAS/LU/ssor.f deleted file mode 100644 index cf4eed0eb7..0000000000 --- a/examples/smpi/NAS/LU/ssor.f +++ /dev/null @@ -1,241 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine ssor(niter) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c to perform pseudo-time stepping SSOR iterations -c for five nonlinear pde's. -c--------------------------------------------------------------------- - - implicit none - integer niter - - include 'mpinpb.h' - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer i, j, k, m - integer istep - double precision tmp - double precision delunm(5), tv(5,isiz1,isiz2) - - external timer_read - double precision wtime, timer_read - - integer IERROR - - - ROOT = 0 - -c--------------------------------------------------------------------- -c begin pseudo-time stepping iterations -c--------------------------------------------------------------------- - tmp = 1.0d+00 / ( omega * ( 2.0d+00 - omega ) ) - -c--------------------------------------------------------------------- -c initialize a,b,c,d to zero (guarantees that page tables have been -c formed, if applicable on given architecture, before timestepping). -c--------------------------------------------------------------------- - do m=1,isiz2 - do k=1,isiz1 - do j=1,5 - do i=1,5 - a(i,j,k,m) = 0.d0 - b(i,j,k,m) = 0.d0 - c(i,j,k,m) = 0.d0 - d(i,j,k,m) = 0.d0 - enddo - enddo - enddo - enddo - -c--------------------------------------------------------------------- -c compute the steady-state residuals -c--------------------------------------------------------------------- - call rhs - -c--------------------------------------------------------------------- -c compute the L2 norms of newton iteration residuals -c--------------------------------------------------------------------- - call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0, - > ist, iend, jst, jend, - > rsd, rsdnm ) - - call MPI_BARRIER( MPI_COMM_WORLD, IERROR ) - - call timer_clear(1) - call timer_start(1) - -c--------------------------------------------------------------------- -c the timestep loop -c--------------------------------------------------------------------- - do istep = 1, niter - - if (id .eq. 0) then - if (mod ( istep, 20) .eq. 0 .or. - > istep .eq. itmax .or. - > istep .eq. 1) then - if (niter .gt. 1) write( *, 200) istep - 200 format(' Time step ', i4) - endif - endif - -c--------------------------------------------------------------------- -c perform SSOR iteration -c--------------------------------------------------------------------- - do k = 2, nz - 1 - do j = jst, jend - do i = ist, iend - do m = 1, 5 - rsd(m,i,j,k) = dt * rsd(m,i,j,k) - end do - end do - end do - end do - - DO k = 2, nz -1 -c--------------------------------------------------------------------- -c form the lower triangular part of the jacobian matrix -c--------------------------------------------------------------------- - call jacld(k) - -c--------------------------------------------------------------------- -c perform the lower triangular solution -c--------------------------------------------------------------------- - call blts( isiz1, isiz2, isiz3, - > nx, ny, nz, k, - > omega, - > rsd, - > a, b, c, d, - > ist, iend, jst, jend, - > nx0, ny0, ipt, jpt) - END DO - - DO k = nz - 1, 2, -1 -c--------------------------------------------------------------------- -c form the strictly upper triangular part of the jacobian matrix -c--------------------------------------------------------------------- - call jacu(k) - -c--------------------------------------------------------------------- -c perform the upper triangular solution -c--------------------------------------------------------------------- - call buts( isiz1, isiz2, isiz3, - > nx, ny, nz, k, - > omega, - > rsd, tv, - > d, a, b, c, - > ist, iend, jst, jend, - > nx0, ny0, ipt, jpt) - END DO - -c--------------------------------------------------------------------- -c update the variables -c--------------------------------------------------------------------- - - do k = 2, nz-1 - do j = jst, jend - do i = ist, iend - do m = 1, 5 - u( m, i, j, k ) = u( m, i, j, k ) - > + tmp * rsd( m, i, j, k ) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c compute the max-norms of newton iteration corrections -c--------------------------------------------------------------------- - if ( mod ( istep, inorm ) .eq. 0 ) then - call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0, - > ist, iend, jst, jend, - > rsd, delunm ) -c if ( ipr .eq. 1 .and. id .eq. 0 ) then -c write (*,1006) ( delunm(m), m = 1, 5 ) -c else if ( ipr .eq. 2 .and. id .eq. 0 ) then -c write (*,'(i5,f15.6)') istep,delunm(5) -c end if - end if - -c--------------------------------------------------------------------- -c compute the steady-state residuals -c--------------------------------------------------------------------- - call rhs - -c--------------------------------------------------------------------- -c compute the max-norms of newton iteration residuals -c--------------------------------------------------------------------- - if ( ( mod ( istep, inorm ) .eq. 0 ) .or. - > ( istep .eq. itmax ) ) then - call l2norm( isiz1, isiz2, isiz3, nx0, ny0, nz0, - > ist, iend, jst, jend, - > rsd, rsdnm ) -c if ( ipr .eq. 1.and.id.eq.0 ) then -c write (*,1007) ( rsdnm(m), m = 1, 5 ) -c end if - end if - -c--------------------------------------------------------------------- -c check the newton-iteration residuals against the tolerance levels -c--------------------------------------------------------------------- - if ( ( rsdnm(1) .lt. tolrsd(1) ) .and. - > ( rsdnm(2) .lt. tolrsd(2) ) .and. - > ( rsdnm(3) .lt. tolrsd(3) ) .and. - > ( rsdnm(4) .lt. tolrsd(4) ) .and. - > ( rsdnm(5) .lt. tolrsd(5) ) ) then -c if (ipr .eq. 1 .and. id.eq.0) then -c write (*,1004) istep -c end if - return - end if - - end do - - call timer_stop(1) - wtime = timer_read(1) - - - call MPI_ALLREDUCE( wtime, - > maxtime, - > 1, - > MPI_DOUBLE_PRECISION, - > MPI_MAX, - > MPI_COMM_WORLD, - > IERROR ) - - - - return - - 1001 format (1x/5x,'pseudo-time SSOR iteration no.=',i4/) - 1004 format (1x/1x,'convergence was achieved after ',i4, - > ' pseudo-time steps' ) - 1006 format (1x/1x,'RMS-norm of SSOR-iteration correction ', - > 'for first pde = ',1pe12.5/, - > 1x,'RMS-norm of SSOR-iteration correction ', - > 'for second pde = ',1pe12.5/, - > 1x,'RMS-norm of SSOR-iteration correction ', - > 'for third pde = ',1pe12.5/, - > 1x,'RMS-norm of SSOR-iteration correction ', - > 'for fourth pde = ',1pe12.5/, - > 1x,'RMS-norm of SSOR-iteration correction ', - > 'for fifth pde = ',1pe12.5) - 1007 format (1x/1x,'RMS-norm of steady-state residual for ', - > 'first pde = ',1pe12.5/, - > 1x,'RMS-norm of steady-state residual for ', - > 'second pde = ',1pe12.5/, - > 1x,'RMS-norm of steady-state residual for ', - > 'third pde = ',1pe12.5/, - > 1x,'RMS-norm of steady-state residual for ', - > 'fourth pde = ',1pe12.5/, - > 1x,'RMS-norm of steady-state residual for ', - > 'fifth pde = ',1pe12.5) - - end diff --git a/examples/smpi/NAS/LU/subdomain.f b/examples/smpi/NAS/LU/subdomain.f deleted file mode 100644 index 388bbf4494..0000000000 --- a/examples/smpi/NAS/LU/subdomain.f +++ /dev/null @@ -1,103 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine subdomain - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'applu.incl' - -c--------------------------------------------------------------------- -c local variables -c--------------------------------------------------------------------- - integer mm, ierror, errorcode - - -c--------------------------------------------------------------------- -c -c set up the sub-domain sizes -c -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c x dimension -c--------------------------------------------------------------------- - mm = mod(nx0,xdim) - if (row.le.mm) then - nx = nx0/xdim + 1 - ipt = (row-1)*nx - else - nx = nx0/xdim - ipt = (row-1)*nx + mm - end if - -c--------------------------------------------------------------------- -c y dimension -c--------------------------------------------------------------------- - mm = mod(ny0,ydim) - if (col.le.mm) then - ny = ny0/ydim + 1 - jpt = (col-1)*ny - else - ny = ny0/ydim - jpt = (col-1)*ny + mm - end if - -c--------------------------------------------------------------------- -c z dimension -c--------------------------------------------------------------------- - nz = nz0 - -c--------------------------------------------------------------------- -c check the sub-domain size -c--------------------------------------------------------------------- - if ( ( nx .lt. 4 ) .or. - > ( ny .lt. 4 ) .or. - > ( nz .lt. 4 ) ) then - write (*,2001) nx, ny, nz - 2001 format (5x,'SUBDOMAIN SIZE IS TOO SMALL - ', - > /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS', - > /5x,'SO THAT NX, NY AND NZ ARE GREATER THAN OR EQUAL', - > /5x,'TO 4 THEY ARE CURRENTLY', 3I3) - CALL MPI_ABORT( MPI_COMM_WORLD, - > ERRORCODE, - > IERROR ) - end if - - if ( ( nx .gt. isiz1 ) .or. - > ( ny .gt. isiz2 ) .or. - > ( nz .gt. isiz3 ) ) then - write (*,2002) nx, ny, nz - 2002 format (5x,'SUBDOMAIN SIZE IS TOO LARGE - ', - > /5x,'ADJUST PROBLEM SIZE OR NUMBER OF PROCESSORS', - > /5x,'SO THAT NX, NY AND NZ ARE LESS THAN OR EQUAL TO ', - > /5x,'ISIZ1, ISIZ2 AND ISIZ3 RESPECTIVELY. THEY ARE', - > /5x,'CURRENTLY', 3I4) - CALL MPI_ABORT( MPI_COMM_WORLD, - > ERRORCODE, - > IERROR ) - end if - - -c--------------------------------------------------------------------- -c set up the start and end in i and j extents for all processors -c--------------------------------------------------------------------- - ist = 1 - iend = nx - if (north.eq.-1) ist = 2 - if (south.eq.-1) iend = nx - 1 - - jst = 1 - jend = ny - if (west.eq.-1) jst = 2 - if (east.eq.-1) jend = ny - 1 - - return - end - - diff --git a/examples/smpi/NAS/LU/verify.f b/examples/smpi/NAS/LU/verify.f deleted file mode 100644 index 2572441a44..0000000000 --- a/examples/smpi/NAS/LU/verify.f +++ /dev/null @@ -1,403 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine verify(xcr, xce, xci, class, verified) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c verification routine -c--------------------------------------------------------------------- - - implicit none - include 'mpinpb.h' - include 'applu.incl' - - double precision xcr(5), xce(5), xci - double precision xcrref(5),xceref(5),xciref, - > xcrdif(5),xcedif(5),xcidif, - > epsilon, dtref - integer m - character class - logical verified - -c--------------------------------------------------------------------- -c tolerance level -c--------------------------------------------------------------------- - epsilon = 1.0d-08 - - class = 'U' - verified = .true. - - do m = 1,5 - xcrref(m) = 1.0 - xceref(m) = 1.0 - end do - xciref = 1.0 - - if ( (nx0 .eq. 12 ) .and. - > (ny0 .eq. 12 ) .and. - > (nz0 .eq. 12 ) .and. - > (itmax .eq. 50 )) then - - class = 'S' - dtref = 5.0d-1 -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual, for the (12X12X12) grid, -c after 50 time steps, with DT = 5.0d-01 -c--------------------------------------------------------------------- - xcrref(1) = 1.6196343210976702d-02 - xcrref(2) = 2.1976745164821318d-03 - xcrref(3) = 1.5179927653399185d-03 - xcrref(4) = 1.5029584435994323d-03 - xcrref(5) = 3.4264073155896461d-02 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error, for the (12X12X12) grid, -c after 50 time steps, with DT = 5.0d-01 -c--------------------------------------------------------------------- - xceref(1) = 6.4223319957960924d-04 - xceref(2) = 8.4144342047347926d-05 - xceref(3) = 5.8588269616485186d-05 - xceref(4) = 5.8474222595157350d-05 - xceref(5) = 1.3103347914111294d-03 - -c--------------------------------------------------------------------- -c Reference value of surface integral, for the (12X12X12) grid, -c after 50 time steps, with DT = 5.0d-01 -c--------------------------------------------------------------------- - xciref = 7.8418928865937083d+00 - - - elseif ( (nx0 .eq. 33) .and. - > (ny0 .eq. 33) .and. - > (nz0 .eq. 33) .and. - > (itmax . eq. 300) ) then - - class = 'W' !SPEC95fp size - dtref = 1.5d-3 -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual, for the (33x33x33) grid, -c after 300 time steps, with DT = 1.5d-3 -c--------------------------------------------------------------------- - xcrref(1) = 0.1236511638192d+02 - xcrref(2) = 0.1317228477799d+01 - xcrref(3) = 0.2550120713095d+01 - xcrref(4) = 0.2326187750252d+01 - xcrref(5) = 0.2826799444189d+02 - - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error, for the (33X33X33) grid, -c--------------------------------------------------------------------- - xceref(1) = 0.4867877144216d+00 - xceref(2) = 0.5064652880982d-01 - xceref(3) = 0.9281818101960d-01 - xceref(4) = 0.8570126542733d-01 - xceref(5) = 0.1084277417792d+01 - - -c--------------------------------------------------------------------- -c Reference value of surface integral, for the (33X33X33) grid, -c after 300 time steps, with DT = 1.5d-3 -c--------------------------------------------------------------------- - xciref = 0.1161399311023d+02 - - elseif ( (nx0 .eq. 64) .and. - > (ny0 .eq. 64) .and. - > (nz0 .eq. 64) .and. - > (itmax . eq. 250) ) then - - class = 'A' - dtref = 2.0d+0 -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual, for the (64X64X64) grid, -c after 250 time steps, with DT = 2.0d+00 -c--------------------------------------------------------------------- - xcrref(1) = 7.7902107606689367d+02 - xcrref(2) = 6.3402765259692870d+01 - xcrref(3) = 1.9499249727292479d+02 - xcrref(4) = 1.7845301160418537d+02 - xcrref(5) = 1.8384760349464247d+03 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error, for the (64X64X64) grid, -c after 250 time steps, with DT = 2.0d+00 -c--------------------------------------------------------------------- - xceref(1) = 2.9964085685471943d+01 - xceref(2) = 2.8194576365003349d+00 - xceref(3) = 7.3473412698774742d+00 - xceref(4) = 6.7139225687777051d+00 - xceref(5) = 7.0715315688392578d+01 - -c--------------------------------------------------------------------- -c Reference value of surface integral, for the (64X64X64) grid, -c after 250 time steps, with DT = 2.0d+00 -c--------------------------------------------------------------------- - xciref = 2.6030925604886277d+01 - - - elseif ( (nx0 .eq. 102) .and. - > (ny0 .eq. 102) .and. - > (nz0 .eq. 102) .and. - > (itmax . eq. 250) ) then - - class = 'B' - dtref = 2.0d+0 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual, for the (102X102X102) grid, -c after 250 time steps, with DT = 2.0d+00 -c--------------------------------------------------------------------- - xcrref(1) = 3.5532672969982736d+03 - xcrref(2) = 2.6214750795310692d+02 - xcrref(3) = 8.8333721850952190d+02 - xcrref(4) = 7.7812774739425265d+02 - xcrref(5) = 7.3087969592545314d+03 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error, for the (102X102X102) -c grid, after 250 time steps, with DT = 2.0d+00 -c--------------------------------------------------------------------- - xceref(1) = 1.1401176380212709d+02 - xceref(2) = 8.1098963655421574d+00 - xceref(3) = 2.8480597317698308d+01 - xceref(4) = 2.5905394567832939d+01 - xceref(5) = 2.6054907504857413d+02 - -c--------------------------------------------------------------------- -c Reference value of surface integral, for the (102X102X102) grid, -c after 250 time steps, with DT = 2.0d+00 -c--------------------------------------------------------------------- - xciref = 4.7887162703308227d+01 - - elseif ( (nx0 .eq. 162) .and. - > (ny0 .eq. 162) .and. - > (nz0 .eq. 162) .and. - > (itmax . eq. 250) ) then - - class = 'C' - dtref = 2.0d+0 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual, for the (162X162X162) grid, -c after 250 time steps, with DT = 2.0d+00 -c--------------------------------------------------------------------- - xcrref(1) = 1.03766980323537846d+04 - xcrref(2) = 8.92212458801008552d+02 - xcrref(3) = 2.56238814582660871d+03 - xcrref(4) = 2.19194343857831427d+03 - xcrref(5) = 1.78078057261061185d+04 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error, for the (162X162X162) -c grid, after 250 time steps, with DT = 2.0d+00 -c--------------------------------------------------------------------- - xceref(1) = 2.15986399716949279d+02 - xceref(2) = 1.55789559239863600d+01 - xceref(3) = 5.41318863077207766d+01 - xceref(4) = 4.82262643154045421d+01 - xceref(5) = 4.55902910043250358d+02 - -c--------------------------------------------------------------------- -c Reference value of surface integral, for the (162X162X162) grid, -c after 250 time steps, with DT = 2.0d+00 -c--------------------------------------------------------------------- - xciref = 6.66404553572181300d+01 - - elseif ( (nx0 .eq. 408) .and. - > (ny0 .eq. 408) .and. - > (nz0 .eq. 408) .and. - > (itmax . eq. 300) ) then - - class = 'D' - dtref = 1.0d+0 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual, for the (408X408X408) grid, -c after 300 time steps, with DT = 1.0d+00 -c--------------------------------------------------------------------- - xcrref(1) = 0.4868417937025d+05 - xcrref(2) = 0.4696371050071d+04 - xcrref(3) = 0.1218114549776d+05 - xcrref(4) = 0.1033801493461d+05 - xcrref(5) = 0.7142398413817d+05 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error, for the (408X408X408) -c grid, after 300 time steps, with DT = 1.0d+00 -c--------------------------------------------------------------------- - xceref(1) = 0.3752393004482d+03 - xceref(2) = 0.3084128893659d+02 - xceref(3) = 0.9434276905469d+02 - xceref(4) = 0.8230686681928d+02 - xceref(5) = 0.7002620636210d+03 - -c--------------------------------------------------------------------- -c Reference value of surface integral, for the (408X408X408) grid, -c after 300 time steps, with DT = 1.0d+00 -c--------------------------------------------------------------------- - xciref = 0.8334101392503d+02 - - elseif ( (nx0 .eq. 1020) .and. - > (ny0 .eq. 1020) .and. - > (nz0 .eq. 1020) .and. - > (itmax . eq. 300) ) then - - class = 'E' - dtref = 0.5d+0 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual, for the (1020X1020X1020) grid, -c after 300 time steps, with DT = 0.5d+00 -c--------------------------------------------------------------------- - xcrref(1) = 0.2099641687874d+06 - xcrref(2) = 0.2130403143165d+05 - xcrref(3) = 0.5319228789371d+05 - xcrref(4) = 0.4509761639833d+05 - xcrref(5) = 0.2932360006590d+06 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error, for the (1020X1020X1020) -c grid, after 300 time steps, with DT = 0.5d+00 -c--------------------------------------------------------------------- - xceref(1) = 0.4800572578333d+03 - xceref(2) = 0.4221993400184d+02 - xceref(3) = 0.1210851906824d+03 - xceref(4) = 0.1047888986770d+03 - xceref(5) = 0.8363028257389d+03 - -c--------------------------------------------------------------------- -c Reference value of surface integral, for the (1020X1020X1020) grid, -c after 300 time steps, with DT = 0.5d+00 -c--------------------------------------------------------------------- - xciref = 0.9512163272273d+02 - - else - verified = .FALSE. - endif - -c--------------------------------------------------------------------- -c verification test for residuals if gridsize is one of -c the defined grid sizes above (class .ne. 'U') -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Compute the difference of solution values and the known reference values. -c--------------------------------------------------------------------- - do m = 1, 5 - - xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) - xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m)) - - enddo - xcidif = dabs((xci - xciref)/xciref) - - -c--------------------------------------------------------------------- -c Output the comparison of computed results to known cases. -c--------------------------------------------------------------------- - - if (class .ne. 'U') then - write(*, 1990) class - 1990 format(/, ' Verification being performed for class ', a) - write (*,2000) epsilon - 2000 format(' Accuracy setting for epsilon = ', E20.13) - verified = (dabs(dt-dtref) .le. epsilon) - if (.not.verified) then - class = 'U' - write (*,1000) dtref - 1000 format(' DT does not match the reference value of ', - > E15.8) - endif - else - write(*, 1995) - 1995 format(' Unknown class') - endif - - - if (class .ne. 'U') then - write (*,2001) - else - write (*, 2005) - endif - - 2001 format(' Comparison of RMS-norms of residual') - 2005 format(' RMS-norms of residual') - do m = 1, 5 - if (class .eq. 'U') then - write(*, 2015) m, xcr(m) - else if (xcrdif(m) .le. epsilon) then - write (*,2011) m,xcr(m),xcrref(m),xcrdif(m) - else - verified = .false. - write (*,2010) m,xcr(m),xcrref(m),xcrdif(m) - endif - enddo - - if (class .ne. 'U') then - write (*,2002) - else - write (*,2006) - endif - 2002 format(' Comparison of RMS-norms of solution error') - 2006 format(' RMS-norms of solution error') - - do m = 1, 5 - if (class .eq. 'U') then - write(*, 2015) m, xce(m) - else if (xcedif(m) .le. epsilon) then - write (*,2011) m,xce(m),xceref(m),xcedif(m) - else - verified = .false. - write (*,2010) m,xce(m),xceref(m),xcedif(m) - endif - enddo - - 2010 format(' FAILURE: ', i2, 2x, E20.13, E20.13, E20.13) - 2011 format(' ', i2, 2x, E20.13, E20.13, E20.13) - 2015 format(' ', i2, 2x, E20.13) - - if (class .ne. 'U') then - write (*,2025) - else - write (*,2026) - endif - 2025 format(' Comparison of surface integral') - 2026 format(' Surface integral') - - - if (class .eq. 'U') then - write(*, 2030) xci - else if (xcidif .le. epsilon) then - write(*, 2032) xci, xciref, xcidif - else - verified = .false. - write(*, 2031) xci, xciref, xcidif - endif - - 2030 format(' ', 4x, E20.13) - 2031 format(' FAILURE: ', 4x, E20.13, E20.13, E20.13) - 2032 format(' ', 4x, E20.13, E20.13, E20.13) - - - - if (class .eq. 'U') then - write(*, 2022) - write(*, 2023) - 2022 format(' No reference values provided') - 2023 format(' No verification performed') - else if (verified) then - write(*, 2020) - 2020 format(' Verification Successful') - else - write(*, 2021) - 2021 format(' Verification failed') - endif - - return - - - end diff --git a/examples/smpi/NAS/MG/Makefile b/examples/smpi/NAS/MG/Makefile deleted file mode 100644 index 1554bedeea..0000000000 --- a/examples/smpi/NAS/MG/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -SHELL=/bin/sh -BENCHMARK=mg -BENCHMARKU=MG - -include ../config/make.def - -OBJS = mg.o ${COMMON}/print_results.o \ - ${COMMON}/${RAND}.o ${COMMON}/timers.o - -include ../sys/make.common - -${PROGRAM}: config ${OBJS} - ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} - -mg.o: mg.f globals.h mpinpb.h npbparams.h - ${FCOMPILE} mg.f - -clean: - - rm -f *.o *~ - - rm -f npbparams.h core - - - diff --git a/examples/smpi/NAS/MG/README b/examples/smpi/NAS/MG/README deleted file mode 100644 index 6c03f78527..0000000000 --- a/examples/smpi/NAS/MG/README +++ /dev/null @@ -1,138 +0,0 @@ -Some info about the MG benchmark -================================ - -'mg_demo' demonstrates the capabilities of a very simple multigrid -solver in computing a three dimensional potential field. This is -a simplified multigrid solver in two important respects: - - (1) it solves only a constant coefficient equation, - and that only on a uniform cubical grid, - - (2) it solves only a single equation, representing - a scalar field rather than a vector field. - -We chose it for its portability and simplicity, and expect that a -supercomputer which can run it effectively will also be able to -run more complex multigrid programs at least as well. - - Eric Barszcz Paul Frederickson - RIACS - NASA Ames Research Center NASA Ames Research Center - -======================================================================== -Running the program: (Note: also see parameter lm information in the - two sections immediately below this section) - -The program may be run with or without an input deck (called "mg.input"). -The following describes a few things about the input deck if you want to -use one. - -The four lines below are the "mg.input" file required to run a -problem of total size 256x256x256, for 4 iterations (Class "A"), -and presumes the use of 8 processors: - - 8 = top level - 256 256 256 = nx ny nz - 4 = nit - 0 0 0 0 0 0 0 0 = debug_vec - -The first line of input indicates how many levels of multi-grid -cycle will be applied to a particular subpartition. Presuming that -8 processors are solving this problem (recall that the number of -processors is specified to MPI as a run parameter, and MPI subsequently -determines this for the code via an MPI subroutine call), a 2x2x2 -processor grid is formed, and thus each partition on a processor is -of size 128x128x128. Therefore, a maximum of 8 multi-grid levels may -be used. These are of size 128,64,32,16,8,4,2,1, with the coarsest -level being a single point on a given processor. - - -Next, consider the same size problem but running on 1 processor. The -following "mg.input" file is appropriate: - - 9 = top level - 256 256 256 = nx ny nz - 4 = nit - 0 0 0 0 0 0 0 0 = debug_vec - -Since this processor must solve the full 256x256x256 problem, this -permits 9 multi-grid levels (256,128,64,32,16,8,4,2,1), resulting in -a coarsest multi-grid level of a single point on the processor - - -Next, consider the same size problem but running on 2 processors. The -following "mg.input" file is required: - - 8 = top level - 256 256 256 = nx ny nz - 4 = nit - 0 0 0 0 0 0 0 0 = debug_vec - -The algorithm for partitioning the full grid onto some power of 2 number -of processors is to start by splitting the last dimension of the grid -(z dimension) in 2: the problem is now partitioned onto 2 processors. -Next the middle dimension (y dimension) is split in 2: the problem is now -partitioned onto 4 processors. Next, first dimension (x dimension) is -split in 2: the problem is now partitioned onto 8 processors. Next, the -last dimension (z dimension) is split again in 2: the problem is now -partitioned onto 16 processors. This partitioning is repeated until all -of the power of 2 processors have been allocated. - -Thus to run the above problem on 2 processors, the grid partitioning -algorithm will allocate the two processors across the last dimension, -creating two partitions each of size 256x256x128. The coarsest level of -multi-grid must be a single point surrounded by a cubic number of grid -points. Therefore, each of the two processor partitions will contain 4 -coarsest multi-grid level points, each surrounded by a cube of grid points -of size 128x128x128, indicated by a top level of 8. - - -Next, consider the same size problem but running on 4 processors. The -following "mg.input" file is required: - - 8 = top level - 256 256 256 = nx ny nz - 4 = nit - 0 0 0 0 0 0 0 0 = debug_vec - -The partitioning algorithm will create 4 partitions, each of size -256x128x128. Each partition will contain 2 coarsest multi-grid level -points each surrounded by a cube of grid points of size 128x128x128, -indicated by a top level of 8. - - -Next, consider the same size problem but running on 16 processors. The -following "mg.input" file is required: - - 7 = top level - 256 256 256 = nx ny nz - 4 = nit - 0 0 0 0 0 0 0 0 = debug_vec - -On each node a partition of size 128x128x64 will be created. A maximum -of 7 multi-grid levels (64,32,16,8,4,2,1) may be used, resulting in each -partions containing 4 coarsest multi-grid level points, each surrounded -by a cube of grid points of size 64x64x64, indicated by a top level of 7. - - - - -Note that non-cubic problem sizes may also be considered: - -The four lines below are the "mg.input" file appropriate for running a -problem of total size 256x512x512, for 20 iterations and presumes the -use of 32 processors (note: this is NOT a class C problem): - - 8 = top level - 256 512 512 = nx ny nz - 20 = nit - 0 0 0 0 0 0 0 0 = debug_vec - -The first line of input indicates how many levels of multi-grid -cycle will be applied to a particular subpartition. Presuming that -32 processors are solving this problem, a 2x4x4 processor grid is -formed, and thus each partition on a processor is of size 128x128x128. -Therefore, a maximum of 8 multi-grid levels may be used. These are of -size 128,64,32,16,8,4,2,1, with the coarsest level being a single -point on a given processor. - diff --git a/examples/smpi/NAS/MG/globals.h b/examples/smpi/NAS/MG/globals.h deleted file mode 100644 index 99573e3c20..0000000000 --- a/examples/smpi/NAS/MG/globals.h +++ /dev/null @@ -1,55 +0,0 @@ -c--------------------------------------------------------------------- -c Parameter lm (declared and set in "npbparams.h") is the log-base2 of -c the edge size max for the partition on a given node, so must be changed -c either to save space (if running a small case) or made bigger for larger -c cases, for example, 512^3. Thus lm=7 means that the largest dimension -c of a partition that can be solved on a node is 2^7 = 128. lm is set -c automatically in npbparams.h -c Parameters ndim1, ndim2, ndim3 are the local problem dimensions. -c--------------------------------------------------------------------- - - include 'npbparams.h' - - integer nm ! actual dimension including ghost cells for communications - > , nv ! size of rhs array - > , nr ! size of residual array - > , nm2 ! size of communication buffer - > , maxlevel! maximum number of levels - - parameter( nm=2+2**lm, nv=(2+2**ndim1)*(2+2**ndim2)*(2+2**ndim3) ) - parameter( nm2=2*nm*nm, maxlevel=(lt_default+1) ) - parameter( nr = (8*(nv+nm**2+5*nm+14*lt_default-7*lm))/7 ) - integer maxprocs - parameter( maxprocs = 131072 ) ! this is the upper proc limit that - ! the current "nr" parameter can handle -c--------------------------------------------------------------------- - integer nbr(3,-1:1,maxlevel), msg_type(3,-1:1) - integer msg_id(3,-1:1,2),nx(maxlevel),ny(maxlevel),nz(maxlevel) - common /mg3/ nbr,msg_type,msg_id,nx,ny,nz - - character class - common /ClassType/class - - integer debug_vec(0:7) - common /my_debug/ debug_vec - - integer ir(maxlevel), m1(maxlevel), m2(maxlevel), m3(maxlevel) - integer lt, lb - common /fap/ ir,m1,m2,m3,lt,lb - - logical dead(maxlevel), give_ex(3,maxlevel), take_ex(3,maxlevel) - common /comm_ex/ dead, give_ex, take_ex - -c--------------------------------------------------------------------- -c Set at m=1024, can handle cases up to 1024^3 case -c--------------------------------------------------------------------- - integer m -c parameter( m=1037 ) - parameter( m=nm+1 ) - - double precision buff(nm2,4) - common /buffer/ buff - - - - diff --git a/examples/smpi/NAS/MG/mg.f b/examples/smpi/NAS/MG/mg.f deleted file mode 100644 index b0352ae2df..0000000000 --- a/examples/smpi/NAS/MG/mg.f +++ /dev/null @@ -1,2479 +0,0 @@ -!-------------------------------------------------------------------------! -! ! -! N A S P A R A L L E L B E N C H M A R K S 3.3 ! -! ! -! M G ! -! ! -!-------------------------------------------------------------------------! -! ! -! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! -! It is described in NAS Technical Reports 95-020 and 02-007 ! -! ! -! Permission to use, copy, distribute and modify this software ! -! for any purpose with or without fee is hereby granted. We ! -! request, however, that all derived work reference the NAS ! -! Parallel Benchmarks 3.3. This software is provided "as is" ! -! without express or implied warranty. ! -! ! -! Information on NPB 3.3, including the technical report, the ! -! original specifications, source code, results and information ! -! on how to submit new results, is available at: ! -! ! -! http://www.nas.nasa.gov/Software/NPB/ ! -! ! -! Send comments or suggestions to npb@nas.nasa.gov ! -! ! -! NAS Parallel Benchmarks Group ! -! NASA Ames Research Center ! -! Mail Stop: T27A-1 ! -! Moffett Field, CA 94035-1000 ! -! ! -! E-mail: npb@nas.nasa.gov ! -! Fax: (650) 604-3957 ! -! ! -!-------------------------------------------------------------------------! - - -c--------------------------------------------------------------------- -c -c Authors: E. Barszcz -c P. Frederickson -c A. Woo -c M. Yarrow -c R. F. Van der Wijngaart -c -c--------------------------------------------------------------------- - - -c--------------------------------------------------------------------- - program mg_mpi -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'globals.h' - -c---------------------------------------------------------------------------c -c k is the current level. It is passed down through subroutine args -c and is NOT global. it is the current iteration -c---------------------------------------------------------------------------c - - integer k, it - - external timer_read - double precision t, t0, tinit, mflops, timer_read - -c---------------------------------------------------------------------------c -c These arrays are in common because they are quite large -c and probably shouldn't be allocated on the stack. They -c are always passed as subroutine args. -c---------------------------------------------------------------------------c - - double precision u(nr),v(nv),r(nr),a(0:3),c(0:3) - common /noautom/ u,v,r - - double precision rnm2, rnmu, old2, oldu, epsilon - integer n1, n2, n3, nit - double precision nn, verify_value, err - logical verified - - integer ierr,i, fstatus - integer T_bench, T_init - parameter (T_bench=1, T_init=2) - - call mpi_init(ierr) - call mpi_comm_rank(mpi_comm_world, me, ierr) - call mpi_comm_size(mpi_comm_world, nprocs, ierr) - - root = 0 - if (nprocs_compiled .gt. maxprocs) then - if (me .eq. root) write(*,20) nprocs_compiled, maxprocs - 20 format(' ERROR: compiled for ',i8,' processes'// - & ' The maximum size allowed for this benchmark is ',i6) - call mpi_abort(MPI_COMM_WORLD, ierr) - stop - endif - - if (.not. convertdouble) then - dp_type = MPI_DOUBLE_PRECISION - else - dp_type = MPI_REAL - endif - - - call timer_clear(T_bench) - call timer_clear(T_init) - - call mpi_barrier(MPI_COMM_WORLD, ierr) - - call timer_start(T_init) - - -c--------------------------------------------------------------------- -c Read in and broadcast input data -c--------------------------------------------------------------------- - - if( me .eq. root )then - write (*, 1000) - - open(unit=7,file="mg.input", status="old", iostat=fstatus) - if (fstatus .eq. 0) then - write(*,50) - 50 format(' Reading from input file mg.input') - read(7,*) lt - read(7,*) nx(lt), ny(lt), nz(lt) - read(7,*) nit - read(7,*) (debug_vec(i),i=0,7) - else - write(*,51) - 51 format(' No input file. Using compiled defaults ') - lt = lt_default - nit = nit_default - nx(lt) = nx_default - ny(lt) = ny_default - nz(lt) = nz_default - do i = 0,7 - debug_vec(i) = debug_default - end do - endif - endif - - call mpi_bcast(lt, 1, MPI_INTEGER, 0, mpi_comm_world, ierr) - call mpi_bcast(nit, 1, MPI_INTEGER, 0, mpi_comm_world, ierr) - call mpi_bcast(nx(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr) - call mpi_bcast(ny(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr) - call mpi_bcast(nz(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr) - call mpi_bcast(debug_vec(0), 8, MPI_INTEGER, 0, - > mpi_comm_world, ierr) - - if ( (nx(lt) .ne. ny(lt)) .or. (nx(lt) .ne. nz(lt)) ) then - Class = 'U' - else if( nx(lt) .eq. 32 .and. nit .eq. 4 ) then - Class = 'S' - else if( nx(lt) .eq. 128 .and. nit .eq. 4 ) then - Class = 'W' - else if( nx(lt) .eq. 256 .and. nit .eq. 4 ) then - Class = 'A' - else if( nx(lt) .eq. 256 .and. nit .eq. 20 ) then - Class = 'B' - else if( nx(lt) .eq. 512 .and. nit .eq. 20 ) then - Class = 'C' - else if( nx(lt) .eq. 1024 .and. nit .eq. 50 ) then - Class = 'D' - else if( nx(lt) .eq. 2048 .and. nit .eq. 50 ) then - Class = 'E' - else - Class = 'U' - endif - -c--------------------------------------------------------------------- -c Use these for debug info: -c--------------------------------------------------------------------- -c debug_vec(0) = 1 !=> report all norms -c debug_vec(1) = 1 !=> some setup information -c debug_vec(1) = 2 !=> more setup information -c debug_vec(2) = k => at level k or below, show result of resid -c debug_vec(3) = k => at level k or below, show result of psinv -c debug_vec(4) = k => at level k or below, show result of rprj -c debug_vec(5) = k => at level k or below, show result of interp -c debug_vec(6) = 1 => (unused) -c debug_vec(7) = 1 => (unused) -c--------------------------------------------------------------------- - a(0) = -8.0D0/3.0D0 - a(1) = 0.0D0 - a(2) = 1.0D0/6.0D0 - a(3) = 1.0D0/12.0D0 - - if(Class .eq. 'A' .or. Class .eq. 'S'.or. Class .eq.'W') then -c--------------------------------------------------------------------- -c Coefficients for the S(a) smoother -c--------------------------------------------------------------------- - c(0) = -3.0D0/8.0D0 - c(1) = +1.0D0/32.0D0 - c(2) = -1.0D0/64.0D0 - c(3) = 0.0D0 - else -c--------------------------------------------------------------------- -c Coefficients for the S(b) smoother -c--------------------------------------------------------------------- - c(0) = -3.0D0/17.0D0 - c(1) = +1.0D0/33.0D0 - c(2) = -1.0D0/61.0D0 - c(3) = 0.0D0 - endif - lb = 1 - k = lt - - call setup(n1,n2,n3,k) - call zero3(u,n1,n2,n3) - call zran3(v,n1,n2,n3,nx(lt),ny(lt),k) - - call norm2u3(v,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) - - if( me .eq. root )then - write (*, 1001) nx(lt),ny(lt),nz(lt), Class - write (*, 1002) nit - - 1000 format(//,' NAS Parallel Benchmarks 3.3 -- MG Benchmark', /) - 1001 format(' Size: ', i4, 'x', i4, 'x', i4, ' (class ', A, ')' ) - 1002 format(' Iterations: ', i4) - 1003 format(' Number of processes: ', i6) - if (nprocs .ne. nprocs_compiled) then - write (*, 1004) nprocs_compiled - write (*, 1005) nprocs - 1004 format(' WARNING: compiled for ', i6, ' processes ') - 1005 format(' Number of active processes: ', i6, /) - else - write (*, 1003) nprocs - endif - endif - - call resid(u,v,r,n1,n2,n3,a,k) - call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) - old2 = rnm2 - oldu = rnmu - -c--------------------------------------------------------------------- -c One iteration for startup -c--------------------------------------------------------------------- - call mg3P(u,v,r,a,c,n1,n2,n3,k) - call resid(u,v,r,n1,n2,n3,a,k) - call setup(n1,n2,n3,k) - call zero3(u,n1,n2,n3) - call zran3(v,n1,n2,n3,nx(lt),ny(lt),k) - - call timer_stop(T_init) - if( me .eq. root )then - tinit = timer_read(T_init) - write( *,'(/A,F15.3,A/)' ) - > ' Initialization time: ',tinit, ' seconds' - endif - - call mpi_barrier(mpi_comm_world,ierr) - - call timer_start(T_bench) - - call resid(u,v,r,n1,n2,n3,a,k) - call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) - old2 = rnm2 - oldu = rnmu - - do it=1,nit - if (it.eq.1 .or. it.eq.nit .or. mod(it,5).eq.0) then - if (me .eq. root) write(*,80) it - 80 format(' iter ',i4) - endif - call mg3P(u,v,r,a,c,n1,n2,n3,k) - call resid(u,v,r,n1,n2,n3,a,k) - enddo - - - call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) - - call timer_stop(T_bench) - - t0 = timer_read(T_bench) - - call mpi_reduce(t0,t,1,dp_type, - > mpi_max,root,mpi_comm_world,ierr) - verified = .FALSE. - verify_value = 0.0 - if( me .eq. root )then - write(*,100) - 100 format(/' Benchmark completed ') - - epsilon = 1.d-8 - if (Class .ne. 'U') then - if(Class.eq.'S') then - verify_value = 0.5307707005734d-04 - elseif(Class.eq.'W') then - verify_value = 0.6467329375339d-05 - elseif(Class.eq.'A') then - verify_value = 0.2433365309069d-05 - elseif(Class.eq.'B') then - verify_value = 0.1800564401355d-05 - elseif(Class.eq.'C') then - verify_value = 0.5706732285740d-06 - elseif(Class.eq.'D') then - verify_value = 0.1583275060440d-09 - elseif(Class.eq.'E') then - verify_value = 0.5630442584711d-10 - endif - - err = abs( rnm2 - verify_value ) / verify_value - if( err .le. epsilon ) then - verified = .TRUE. - write(*, 200) - write(*, 201) rnm2 - write(*, 202) err - 200 format(' VERIFICATION SUCCESSFUL ') - 201 format(' L2 Norm is ', E20.13) - 202 format(' Error is ', E20.13) - else - verified = .FALSE. - write(*, 300) - write(*, 301) rnm2 - write(*, 302) verify_value - 300 format(' VERIFICATION FAILED') - 301 format(' L2 Norm is ', E20.13) - 302 format(' The correct L2 Norm is ', E20.13) - endif - else - verified = .FALSE. - write (*, 400) - write (*, 401) - write (*, 201) rnm2 - 400 format(' Problem size unknown') - 401 format(' NO VERIFICATION PERFORMED') - endif - - nn = 1.0d0*nx(lt)*ny(lt)*nz(lt) - - if( t .ne. 0. ) then - mflops = 58.*1.0D-6*nit*nn / t - else - mflops = 0.0 - endif - - call print_results('MG', class, nx(lt), ny(lt), nz(lt), - > nit, nprocs_compiled, nprocs, t, - > mflops, ' floating point', - > verified, npbversion, compiletime, - > cs1, cs2, cs3, cs4, cs5, cs6, cs7) - - - endif - - - call mpi_finalize(ierr) - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine setup(n1,n2,n3,k) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer is1, is2, is3, ie1, ie2, ie3 - common /grid/ is1,is2,is3,ie1,ie2,ie3 - - integer n1,n2,n3,k - integer dx, dy, log_p, d, i, j - - integer ax, next(3),mi(3,maxlevel),mip(3,maxlevel) - integer ng(3,maxlevel) - integer idi(3), pi(3), idin(3,-1:1) - integer s, dir,ierr - - do j=-1,1,1 - do d=1,3 - msg_type(d,j) = 100*(j+2+10*d) - enddo - enddo - - ng(1,lt) = nx(lt) - ng(2,lt) = ny(lt) - ng(3,lt) = nz(lt) - do ax=1,3 - next(ax) = 1 - do k=lt-1,1,-1 - ng(ax,k) = ng(ax,k+1)/2 - enddo - enddo - 61 format(10i4) - do k=lt,1,-1 - nx(k) = ng(1,k) - ny(k) = ng(2,k) - nz(k) = ng(3,k) - enddo - - log_p = log(float(nprocs)+0.0001)/log(2.0) - dx = log_p/3 - pi(1) = 2**dx - idi(1) = mod(me,pi(1)) - - dy = (log_p-dx)/2 - pi(2) = 2**dy - idi(2) = mod((me/pi(1)),pi(2)) - - pi(3) = nprocs/(pi(1)*pi(2)) - idi(3) = me/(pi(1)*pi(2)) - - do k = lt,1,-1 - dead(k) = .false. - do ax = 1,3 - take_ex(ax,k) = .false. - give_ex(ax,k) = .false. - - mi(ax,k) = 2 + - > ((idi(ax)+1)*ng(ax,k))/pi(ax) - - > ((idi(ax)+0)*ng(ax,k))/pi(ax) - mip(ax,k) = 2 + - > ((next(ax)+idi(ax)+1)*ng(ax,k))/pi(ax) - - > ((next(ax)+idi(ax)+0)*ng(ax,k))/pi(ax) - - if(mip(ax,k).eq.2.or.mi(ax,k).eq.2)then - next(ax) = 2*next(ax) - endif - - if( k+1 .le. lt )then - if((mip(ax,k).eq.2).and.(mi(ax,k).eq.3))then - give_ex(ax,k+1) = .true. - endif - if((mip(ax,k).eq.3).and.(mi(ax,k).eq.2))then - take_ex(ax,k+1) = .true. - endif - endif - enddo - - if( mi(1,k).eq.2 .or. - > mi(2,k).eq.2 .or. - > mi(3,k).eq.2 )then - dead(k) = .true. - endif - m1(k) = mi(1,k) - m2(k) = mi(2,k) - m3(k) = mi(3,k) - - do ax=1,3 - idin(ax,+1) = mod( idi(ax) + next(ax) + pi(ax) , pi(ax) ) - idin(ax,-1) = mod( idi(ax) - next(ax) + pi(ax) , pi(ax) ) - enddo - do dir = 1,-1,-2 - nbr(1,dir,k) = idin(1,dir) + pi(1) - > *(idi(2) + pi(2) - > * idi(3)) - nbr(2,dir,k) = idi(1) + pi(1) - > *(idin(2,dir) + pi(2) - > * idi(3)) - nbr(3,dir,k) = idi(1) + pi(1) - > *(idi(2) + pi(2) - > * idin(3,dir)) - enddo - enddo - - k = lt - is1 = 2 + ng(1,k) - ((pi(1) -idi(1))*ng(1,lt))/pi(1) - ie1 = 1 + ng(1,k) - ((pi(1)-1-idi(1))*ng(1,lt))/pi(1) - n1 = 3 + ie1 - is1 - is2 = 2 + ng(2,k) - ((pi(2) -idi(2))*ng(2,lt))/pi(2) - ie2 = 1 + ng(2,k) - ((pi(2)-1-idi(2))*ng(2,lt))/pi(2) - n2 = 3 + ie2 - is2 - is3 = 2 + ng(3,k) - ((pi(3) -idi(3))*ng(3,lt))/pi(3) - ie3 = 1 + ng(3,k) - ((pi(3)-1-idi(3))*ng(3,lt))/pi(3) - n3 = 3 + ie3 - is3 - - - ir(lt)=1 - do j = lt-1, 1, -1 - ir(j)=ir(j+1)+m1(j+1)*m2(j+1)*m3(j+1) - enddo - - - if( debug_vec(1) .ge. 1 )then - if( me .eq. root )write(*,*)' in setup, ' - if( me .eq. root )write(*,*)' me k lt nx ny nz ', - > ' n1 n2 n3 is1 is2 is3 ie1 ie2 ie3' - do i=0,nprocs-1 - if( me .eq. i )then - write(*,9) me,k,lt,ng(1,k),ng(2,k),ng(3,k), - > n1,n2,n3,is1,is2,is3,ie1,ie2,ie3 - 9 format(15i4) - endif - call mpi_barrier(mpi_comm_world,ierr) - enddo - endif - if( debug_vec(1) .ge. 2 )then - do i=0,nprocs-1 - if( me .eq. i )then - write(*,*)' ' - write(*,*)' processor =',me - do k=lt,1,-1 - write(*,7)k,idi(1),idi(2),idi(3), - > ((nbr(d,j,k),j=-1,1,2),d=1,3), - > (mi(d,k),d=1,3) - enddo - 7 format(i4,'idi=',3i4,'nbr=',3(2i4,' '),'mi=',3i4,' ') - write(*,*)'idi(s) = ',(idi(s),s=1,3) - write(*,*)'dead(2), dead(1) = ',dead(2),dead(1) - do ax=1,3 - write(*,*)'give_ex(ax,2)= ',give_ex(ax,2) - write(*,*)'take_ex(ax,2)= ',take_ex(ax,2) - enddo - endif - call mpi_barrier(mpi_comm_world,ierr) - enddo - endif - - k = lt - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine mg3P(u,v,r,a,c,n1,n2,n3,k) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c multigrid V-cycle routine -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer n1, n2, n3, k - double precision u(nr),v(nv),r(nr) - double precision a(0:3),c(0:3) - - integer j - -c--------------------------------------------------------------------- -c down cycle. -c restrict the residual from the find grid to the coarse -c--------------------------------------------------------------------- - - do k= lt, lb+1 , -1 - j = k-1 - call rprj3(r(ir(k)),m1(k),m2(k),m3(k), - > r(ir(j)),m1(j),m2(j),m3(j),k) - enddo - - k = lb -c--------------------------------------------------------------------- -c compute an approximate solution on the coarsest grid -c--------------------------------------------------------------------- - call zero3(u(ir(k)),m1(k),m2(k),m3(k)) - call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k) - - do k = lb+1, lt-1 - j = k-1 -c--------------------------------------------------------------------- -c prolongate from level k-1 to k -c--------------------------------------------------------------------- - call zero3(u(ir(k)),m1(k),m2(k),m3(k)) - call interp(u(ir(j)),m1(j),m2(j),m3(j), - > u(ir(k)),m1(k),m2(k),m3(k),k) -c--------------------------------------------------------------------- -c compute residual for level k -c--------------------------------------------------------------------- - call resid(u(ir(k)),r(ir(k)),r(ir(k)),m1(k),m2(k),m3(k),a,k) -c--------------------------------------------------------------------- -c apply smoother -c--------------------------------------------------------------------- - call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k) - enddo - 200 continue - j = lt - 1 - k = lt - call interp(u(ir(j)),m1(j),m2(j),m3(j),u,n1,n2,n3,k) - call resid(u,v,r,n1,n2,n3,a,k) - call psinv(r,u,n1,n2,n3,c,k) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine psinv( r,u,n1,n2,n3,c,k) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c psinv applies an approximate inverse as smoother: u = u + Cr -c -c This implementation costs 15A + 4M per result, where -c A and M denote the costs of Addition and Multiplication. -c Presuming coefficient c(3) is zero (the NPB assumes this, -c but it is thus not a general case), 2A + 1M may be eliminated, -c resulting in 13A + 3M. -c Note that this vectorizes, and is also fine for cache -c based machines. -c--------------------------------------------------------------------- - implicit none - - include 'globals.h' - - integer n1,n2,n3,k - double precision u(n1,n2,n3),r(n1,n2,n3),c(0:3) - integer i3, i2, i1 - - double precision r1(m), r2(m) - - do i3=2,n3-1 - do i2=2,n2-1 - do i1=1,n1 - r1(i1) = r(i1,i2-1,i3) + r(i1,i2+1,i3) - > + r(i1,i2,i3-1) + r(i1,i2,i3+1) - r2(i1) = r(i1,i2-1,i3-1) + r(i1,i2+1,i3-1) - > + r(i1,i2-1,i3+1) + r(i1,i2+1,i3+1) - enddo - do i1=2,n1-1 - u(i1,i2,i3) = u(i1,i2,i3) - > + c(0) * r(i1,i2,i3) - > + c(1) * ( r(i1-1,i2,i3) + r(i1+1,i2,i3) - > + r1(i1) ) - > + c(2) * ( r2(i1) + r1(i1-1) + r1(i1+1) ) -c--------------------------------------------------------------------- -c Assume c(3) = 0 (Enable line below if c(3) not= 0) -c--------------------------------------------------------------------- -c > + c(3) * ( r2(i1-1) + r2(i1+1) ) -c--------------------------------------------------------------------- - enddo - enddo - enddo - -c--------------------------------------------------------------------- -c exchange boundary points -c--------------------------------------------------------------------- - call comm3(u,n1,n2,n3,k) - - if( debug_vec(0) .ge. 1 )then - call rep_nrm(u,n1,n2,n3,' psinv',k) - endif - - if( debug_vec(3) .ge. k )then - call showall(u,n1,n2,n3) - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine resid( u,v,r,n1,n2,n3,a,k ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c resid computes the residual: r = v - Au -c -c This implementation costs 15A + 4M per result, where -c A and M denote the costs of Addition (or Subtraction) and -c Multiplication, respectively. -c Presuming coefficient a(1) is zero (the NPB assumes this, -c but it is thus not a general case), 3A + 1M may be eliminated, -c resulting in 12A + 3M. -c Note that this vectorizes, and is also fine for cache -c based machines. -c--------------------------------------------------------------------- - implicit none - - include 'globals.h' - - integer n1,n2,n3,k - double precision u(n1,n2,n3),v(n1,n2,n3),r(n1,n2,n3),a(0:3) - integer i3, i2, i1 - double precision u1(m), u2(m) - - do i3=2,n3-1 - do i2=2,n2-1 - do i1=1,n1 - u1(i1) = u(i1,i2-1,i3) + u(i1,i2+1,i3) - > + u(i1,i2,i3-1) + u(i1,i2,i3+1) - u2(i1) = u(i1,i2-1,i3-1) + u(i1,i2+1,i3-1) - > + u(i1,i2-1,i3+1) + u(i1,i2+1,i3+1) - enddo - do i1=2,n1-1 - r(i1,i2,i3) = v(i1,i2,i3) - > - a(0) * u(i1,i2,i3) -c--------------------------------------------------------------------- -c Assume a(1) = 0 (Enable 2 lines below if a(1) not= 0) -c--------------------------------------------------------------------- -c > - a(1) * ( u(i1-1,i2,i3) + u(i1+1,i2,i3) -c > + u1(i1) ) -c--------------------------------------------------------------------- - > - a(2) * ( u2(i1) + u1(i1-1) + u1(i1+1) ) - > - a(3) * ( u2(i1-1) + u2(i1+1) ) - enddo - enddo - enddo - -c--------------------------------------------------------------------- -c exchange boundary data -c--------------------------------------------------------------------- - call comm3(r,n1,n2,n3,k) - - if( debug_vec(0) .ge. 1 )then - call rep_nrm(r,n1,n2,n3,' resid',k) - endif - - if( debug_vec(2) .ge. k )then - call showall(r,n1,n2,n3) - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine rprj3( r,m1k,m2k,m3k,s,m1j,m2j,m3j,k ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c rprj3 projects onto the next coarser grid, -c using a trilinear Finite Element projection: s = r' = P r -c -c This implementation costs 20A + 4M per result, where -c A and M denote the costs of Addition and Multiplication. -c Note that this vectorizes, and is also fine for cache -c based machines. -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer m1k, m2k, m3k, m1j, m2j, m3j,k - double precision r(m1k,m2k,m3k), s(m1j,m2j,m3j) - integer j3, j2, j1, i3, i2, i1, d1, d2, d3, j - - double precision x1(m), y1(m), x2,y2 - - - if(m1k.eq.3)then - d1 = 2 - else - d1 = 1 - endif - - if(m2k.eq.3)then - d2 = 2 - else - d2 = 1 - endif - - if(m3k.eq.3)then - d3 = 2 - else - d3 = 1 - endif - - do j3=2,m3j-1 - i3 = 2*j3-d3 -C i3 = 2*j3-1 - do j2=2,m2j-1 - i2 = 2*j2-d2 -C i2 = 2*j2-1 - - do j1=2,m1j - i1 = 2*j1-d1 -C i1 = 2*j1-1 - x1(i1-1) = r(i1-1,i2-1,i3 ) + r(i1-1,i2+1,i3 ) - > + r(i1-1,i2, i3-1) + r(i1-1,i2, i3+1) - y1(i1-1) = r(i1-1,i2-1,i3-1) + r(i1-1,i2-1,i3+1) - > + r(i1-1,i2+1,i3-1) + r(i1-1,i2+1,i3+1) - enddo - - do j1=2,m1j-1 - i1 = 2*j1-d1 -C i1 = 2*j1-1 - y2 = r(i1, i2-1,i3-1) + r(i1, i2-1,i3+1) - > + r(i1, i2+1,i3-1) + r(i1, i2+1,i3+1) - x2 = r(i1, i2-1,i3 ) + r(i1, i2+1,i3 ) - > + r(i1, i2, i3-1) + r(i1, i2, i3+1) - s(j1,j2,j3) = - > 0.5D0 * r(i1,i2,i3) - > + 0.25D0 * ( r(i1-1,i2,i3) + r(i1+1,i2,i3) + x2) - > + 0.125D0 * ( x1(i1-1) + x1(i1+1) + y2) - > + 0.0625D0 * ( y1(i1-1) + y1(i1+1) ) - enddo - - enddo - enddo - - - j = k-1 - call comm3(s,m1j,m2j,m3j,j) - - if( debug_vec(0) .ge. 1 )then - call rep_nrm(s,m1j,m2j,m3j,' rprj3',k-1) - endif - - if( debug_vec(4) .ge. k )then - call showall(s,m1j,m2j,m3j) - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine interp( z,mm1,mm2,mm3,u,n1,n2,n3,k ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c interp adds the trilinear interpolation of the correction -c from the coarser grid to the current approximation: u = u + Qu' -c -c Observe that this implementation costs 16A + 4M, where -c A and M denote the costs of Addition and Multiplication. -c Note that this vectorizes, and is also fine for cache -c based machines. Vector machines may get slightly better -c performance however, with 8 separate "do i1" loops, rather than 4. -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer mm1, mm2, mm3, n1, n2, n3,k - double precision z(mm1,mm2,mm3),u(n1,n2,n3) - integer i3, i2, i1, d1, d2, d3, t1, t2, t3 - -c note that m = 1037 in globals.h but for this only need to be -c 535 to handle up to 1024^3 -c integer m -c parameter( m=535 ) - double precision z1(m),z2(m),z3(m) - - - if( n1 .ne. 3 .and. n2 .ne. 3 .and. n3 .ne. 3 ) then - - do i3=1,mm3-1 - do i2=1,mm2-1 - - do i1=1,mm1 - z1(i1) = z(i1,i2+1,i3) + z(i1,i2,i3) - z2(i1) = z(i1,i2,i3+1) + z(i1,i2,i3) - z3(i1) = z(i1,i2+1,i3+1) + z(i1,i2,i3+1) + z1(i1) - enddo - - do i1=1,mm1-1 - u(2*i1-1,2*i2-1,2*i3-1)=u(2*i1-1,2*i2-1,2*i3-1) - > +z(i1,i2,i3) - u(2*i1,2*i2-1,2*i3-1)=u(2*i1,2*i2-1,2*i3-1) - > +0.5d0*(z(i1+1,i2,i3)+z(i1,i2,i3)) - enddo - do i1=1,mm1-1 - u(2*i1-1,2*i2,2*i3-1)=u(2*i1-1,2*i2,2*i3-1) - > +0.5d0 * z1(i1) - u(2*i1,2*i2,2*i3-1)=u(2*i1,2*i2,2*i3-1) - > +0.25d0*( z1(i1) + z1(i1+1) ) - enddo - do i1=1,mm1-1 - u(2*i1-1,2*i2-1,2*i3)=u(2*i1-1,2*i2-1,2*i3) - > +0.5d0 * z2(i1) - u(2*i1,2*i2-1,2*i3)=u(2*i1,2*i2-1,2*i3) - > +0.25d0*( z2(i1) + z2(i1+1) ) - enddo - do i1=1,mm1-1 - u(2*i1-1,2*i2,2*i3)=u(2*i1-1,2*i2,2*i3) - > +0.25d0* z3(i1) - u(2*i1,2*i2,2*i3)=u(2*i1,2*i2,2*i3) - > +0.125d0*( z3(i1) + z3(i1+1) ) - enddo - enddo - enddo - - else - - if(n1.eq.3)then - d1 = 2 - t1 = 1 - else - d1 = 1 - t1 = 0 - endif - - if(n2.eq.3)then - d2 = 2 - t2 = 1 - else - d2 = 1 - t2 = 0 - endif - - if(n3.eq.3)then - d3 = 2 - t3 = 1 - else - d3 = 1 - t3 = 0 - endif - - do i3=d3,mm3-1 - do i2=d2,mm2-1 - do i1=d1,mm1-1 - u(2*i1-d1,2*i2-d2,2*i3-d3)=u(2*i1-d1,2*i2-d2,2*i3-d3) - > +z(i1,i2,i3) - enddo - do i1=1,mm1-1 - u(2*i1-t1,2*i2-d2,2*i3-d3)=u(2*i1-t1,2*i2-d2,2*i3-d3) - > +0.5D0*(z(i1+1,i2,i3)+z(i1,i2,i3)) - enddo - enddo - do i2=1,mm2-1 - do i1=d1,mm1-1 - u(2*i1-d1,2*i2-t2,2*i3-d3)=u(2*i1-d1,2*i2-t2,2*i3-d3) - > +0.5D0*(z(i1,i2+1,i3)+z(i1,i2,i3)) - enddo - do i1=1,mm1-1 - u(2*i1-t1,2*i2-t2,2*i3-d3)=u(2*i1-t1,2*i2-t2,2*i3-d3) - > +0.25D0*(z(i1+1,i2+1,i3)+z(i1+1,i2,i3) - > +z(i1, i2+1,i3)+z(i1, i2,i3)) - enddo - enddo - enddo - - do i3=1,mm3-1 - do i2=d2,mm2-1 - do i1=d1,mm1-1 - u(2*i1-d1,2*i2-d2,2*i3-t3)=u(2*i1-d1,2*i2-d2,2*i3-t3) - > +0.5D0*(z(i1,i2,i3+1)+z(i1,i2,i3)) - enddo - do i1=1,mm1-1 - u(2*i1-t1,2*i2-d2,2*i3-t3)=u(2*i1-t1,2*i2-d2,2*i3-t3) - > +0.25D0*(z(i1+1,i2,i3+1)+z(i1,i2,i3+1) - > +z(i1+1,i2,i3 )+z(i1,i2,i3 )) - enddo - enddo - do i2=1,mm2-1 - do i1=d1,mm1-1 - u(2*i1-d1,2*i2-t2,2*i3-t3)=u(2*i1-d1,2*i2-t2,2*i3-t3) - > +0.25D0*(z(i1,i2+1,i3+1)+z(i1,i2,i3+1) - > +z(i1,i2+1,i3 )+z(i1,i2,i3 )) - enddo - do i1=1,mm1-1 - u(2*i1-t1,2*i2-t2,2*i3-t3)=u(2*i1-t1,2*i2-t2,2*i3-t3) - > +0.125D0*(z(i1+1,i2+1,i3+1)+z(i1+1,i2,i3+1) - > +z(i1 ,i2+1,i3+1)+z(i1 ,i2,i3+1) - > +z(i1+1,i2+1,i3 )+z(i1+1,i2,i3 ) - > +z(i1 ,i2+1,i3 )+z(i1 ,i2,i3 )) - enddo - enddo - enddo - - endif - - call comm3_ex(u,n1,n2,n3,k) - - if( debug_vec(0) .ge. 1 )then - call rep_nrm(z,mm1,mm2,mm3,'z: inter',k-1) - call rep_nrm(u,n1,n2,n3,'u: inter',k) - endif - - if( debug_vec(5) .ge. k )then - call showall(z,mm1,mm2,mm3) - call showall(u,n1,n2,n3) - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine norm2u3(r,n1,n2,n3,rnm2,rnmu,nx,ny,nz) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c norm2u3 evaluates approximations to the L2 norm and the -c uniform (or L-infinity or Chebyshev) norm, under the -c assumption that the boundaries are periodic or zero. Add the -c boundaries in with half weight (quarter weight on the edges -c and eighth weight at the corners) for inhomogeneous boundaries. -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - - integer n1, n2, n3, nx, ny, nz - double precision rnm2, rnmu, r(n1,n2,n3) - double precision s, a, ss - integer i3, i2, i1, ierr - - double precision dn - - dn = 1.0d0*nx*ny*nz - - s=0.0D0 - rnmu = 0.0D0 - do i3=2,n3-1 - do i2=2,n2-1 - do i1=2,n1-1 - s=s+r(i1,i2,i3)**2 - a=abs(r(i1,i2,i3)) - if(a.gt.rnmu)rnmu=a - enddo - enddo - enddo - - call mpi_allreduce(rnmu,ss,1,dp_type, - > mpi_max,mpi_comm_world,ierr) - rnmu = ss - call mpi_allreduce(s, ss, 1, dp_type, - > mpi_sum,mpi_comm_world,ierr) - s = ss - rnm2=sqrt( s / dn ) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine rep_nrm(u,n1,n2,n3,title,kk) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c report on norm -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer n1, n2, n3, kk - double precision u(n1,n2,n3) - character*8 title - - double precision rnm2, rnmu - - - call norm2u3(u,n1,n2,n3,rnm2,rnmu,nx(kk),ny(kk),nz(kk)) - if( me .eq. root )then - write(*,7)kk,title,rnm2,rnmu - 7 format(' Level',i2,' in ',a8,': norms =',D21.14,D21.14) - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine comm3(u,n1,n2,n3,kk) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c comm3 organizes the communication on all borders -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer n1, n2, n3, kk - double precision u(n1,n2,n3) - integer axis - - if( .not. dead(kk) )then - do axis = 1, 3 - if( nprocs .ne. 1) then - - call ready( axis, -1, kk ) - call ready( axis, +1, kk ) - - call give3( axis, +1, u, n1, n2, n3, kk ) - call give3( axis, -1, u, n1, n2, n3, kk ) - - call take3( axis, -1, u, n1, n2, n3 ) - call take3( axis, +1, u, n1, n2, n3 ) - - else - call comm1p( axis, u, n1, n2, n3, kk ) - endif - enddo - else - call zero3(u,n1,n2,n3) - endif - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine comm3_ex(u,n1,n2,n3,kk) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c comm3_ex communicates to expand the number of processors -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer n1, n2, n3, kk - double precision u(n1,n2,n3) - integer axis - - do axis = 1, 3 - if( nprocs .ne. 1 ) then - if( take_ex( axis, kk ) )then - call ready( axis, -1, kk ) - call ready( axis, +1, kk ) - call take3_ex( axis, -1, u, n1, n2, n3 ) - call take3_ex( axis, +1, u, n1, n2, n3 ) - endif - - if( give_ex( axis, kk ) )then - call give3_ex( axis, +1, u, n1, n2, n3, kk ) - call give3_ex( axis, -1, u, n1, n2, n3, kk ) - endif - else - call comm1p_ex( axis, u, n1, n2, n3, kk ) - endif - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine ready( axis, dir, k ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c ready allocates a buffer to take in a message -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer axis, dir, k - integer buff_id,buff_len,i,ierr - - buff_id = 3 + dir - buff_len = nm2 - - do i=1,nm2 - buff(i,buff_id) = 0.0D0 - enddo - - -c--------------------------------------------------------------------- -c fake message request type -c--------------------------------------------------------------------- - msg_id(axis,dir,1) = msg_type(axis,dir) +1000*me - - call mpi_irecv( buff(1,buff_id), buff_len, - > dp_type, nbr(axis,-dir,k), msg_type(axis,dir), - > mpi_comm_world, msg_id(axis,dir,1), ierr) - return - end - - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine give3( axis, dir, u, n1, n2, n3, k ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c give3 sends border data out in the requested direction -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer axis, dir, n1, n2, n3, k, ierr - double precision u( n1, n2, n3 ) - - integer i3, i2, i1, buff_len,buff_id - - buff_id = 2 + dir - buff_len = 0 - - if( axis .eq. 1 )then - if( dir .eq. -1 )then - - do i3=2,n3-1 - do i2=2,n2-1 - buff_len = buff_len + 1 - buff(buff_len,buff_id ) = u( 2, i2,i3) - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - else if( dir .eq. +1 ) then - - do i3=2,n3-1 - do i2=2,n2-1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( n1-1, i2,i3) - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - endif - endif - - if( axis .eq. 2 )then - if( dir .eq. -1 )then - - do i3=2,n3-1 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1, 2,i3) - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - else if( dir .eq. +1 ) then - - do i3=2,n3-1 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id )= u( i1,n2-1,i3) - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - endif - endif - - if( axis .eq. 3 )then - if( dir .eq. -1 )then - - do i2=1,n2 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1,i2,2) - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - else if( dir .eq. +1 ) then - - do i2=1,n2 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1,i2,n3-1) - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - endif - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine take3( axis, dir, u, n1, n2, n3 ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c take3 copies in border data from the requested direction -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer axis, dir, n1, n2, n3 - double precision u( n1, n2, n3 ) - - integer buff_id, indx - - integer status(mpi_status_size), ierr - - integer i3, i2, i1 - - call mpi_wait( msg_id( axis, dir, 1 ),status,ierr) - buff_id = 3 + dir - indx = 0 - - if( axis .eq. 1 )then - if( dir .eq. -1 )then - - do i3=2,n3-1 - do i2=2,n2-1 - indx = indx + 1 - u(n1,i2,i3) = buff(indx, buff_id ) - enddo - enddo - - else if( dir .eq. +1 ) then - - do i3=2,n3-1 - do i2=2,n2-1 - indx = indx + 1 - u(1,i2,i3) = buff(indx, buff_id ) - enddo - enddo - - endif - endif - - if( axis .eq. 2 )then - if( dir .eq. -1 )then - - do i3=2,n3-1 - do i1=1,n1 - indx = indx + 1 - u(i1,n2,i3) = buff(indx, buff_id ) - enddo - enddo - - else if( dir .eq. +1 ) then - - do i3=2,n3-1 - do i1=1,n1 - indx = indx + 1 - u(i1,1,i3) = buff(indx, buff_id ) - enddo - enddo - - endif - endif - - if( axis .eq. 3 )then - if( dir .eq. -1 )then - - do i2=1,n2 - do i1=1,n1 - indx = indx + 1 - u(i1,i2,n3) = buff(indx, buff_id ) - enddo - enddo - - else if( dir .eq. +1 ) then - - do i2=1,n2 - do i1=1,n1 - indx = indx + 1 - u(i1,i2,1) = buff(indx, buff_id ) - enddo - enddo - - endif - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine give3_ex( axis, dir, u, n1, n2, n3, k ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c give3_ex sends border data out to expand number of processors -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer axis, dir, n1, n2, n3, k, ierr - double precision u( n1, n2, n3 ) - - integer i3, i2, i1, buff_len, buff_id - - buff_id = 2 + dir - buff_len = 0 - - if( axis .eq. 1 )then - if( dir .eq. -1 )then - - do i3=1,n3 - do i2=1,n2 - buff_len = buff_len + 1 - buff(buff_len,buff_id ) = u( 2, i2,i3) - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - else if( dir .eq. +1 ) then - - do i3=1,n3 - do i2=1,n2 - do i1=n1-1,n1 - buff_len = buff_len + 1 - buff(buff_len,buff_id)= u(i1,i2,i3) - enddo - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - endif - endif - - if( axis .eq. 2 )then - if( dir .eq. -1 )then - - do i3=1,n3 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1, 2,i3) - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - else if( dir .eq. +1 ) then - - do i3=1,n3 - do i2=n2-1,n2 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len,buff_id )= u(i1,i2,i3) - enddo - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - endif - endif - - if( axis .eq. 3 )then - if( dir .eq. -1 )then - - do i2=1,n2 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1,i2,2) - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - else if( dir .eq. +1 ) then - - do i3=n3-1,n3 - do i2=1,n2 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1,i2,i3) - enddo - enddo - enddo - - call mpi_send( - > buff(1, buff_id ), buff_len,dp_type, - > nbr( axis, dir, k ), msg_type(axis,dir), - > mpi_comm_world, ierr) - - endif - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine take3_ex( axis, dir, u, n1, n2, n3 ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c take3_ex copies in border data to expand number of processors -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer axis, dir, n1, n2, n3 - double precision u( n1, n2, n3 ) - - integer buff_id, indx - - integer status(mpi_status_size) , ierr - - integer i3, i2, i1 - - call mpi_wait( msg_id( axis, dir, 1 ),status,ierr) - buff_id = 3 + dir - indx = 0 - - if( axis .eq. 1 )then - if( dir .eq. -1 )then - - do i3=1,n3 - do i2=1,n2 - indx = indx + 1 - u(n1,i2,i3) = buff(indx, buff_id ) - enddo - enddo - - else if( dir .eq. +1 ) then - - do i3=1,n3 - do i2=1,n2 - do i1=1,2 - indx = indx + 1 - u(i1,i2,i3) = buff(indx,buff_id) - enddo - enddo - enddo - - endif - endif - - if( axis .eq. 2 )then - if( dir .eq. -1 )then - - do i3=1,n3 - do i1=1,n1 - indx = indx + 1 - u(i1,n2,i3) = buff(indx, buff_id ) - enddo - enddo - - else if( dir .eq. +1 ) then - - do i3=1,n3 - do i2=1,2 - do i1=1,n1 - indx = indx + 1 - u(i1,i2,i3) = buff(indx,buff_id) - enddo - enddo - enddo - - endif - endif - - if( axis .eq. 3 )then - if( dir .eq. -1 )then - - do i2=1,n2 - do i1=1,n1 - indx = indx + 1 - u(i1,i2,n3) = buff(indx, buff_id ) - enddo - enddo - - else if( dir .eq. +1 ) then - - do i3=1,2 - do i2=1,n2 - do i1=1,n1 - indx = indx + 1 - u(i1,i2,i3) = buff(indx,buff_id) - enddo - enddo - enddo - - endif - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine comm1p( axis, u, n1, n2, n3, kk ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer axis, dir, n1, n2, n3 - double precision u( n1, n2, n3 ) - - integer i3, i2, i1, buff_len,buff_id - integer i, kk, indx - - dir = -1 - - buff_id = 3 + dir - buff_len = nm2 - - do i=1,nm2 - buff(i,buff_id) = 0.0D0 - enddo - - - dir = +1 - - buff_id = 3 + dir - buff_len = nm2 - - do i=1,nm2 - buff(i,buff_id) = 0.0D0 - enddo - - dir = +1 - - buff_id = 2 + dir - buff_len = 0 - - if( axis .eq. 1 )then - do i3=2,n3-1 - do i2=2,n2-1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( n1-1, i2,i3) - enddo - enddo - endif - - if( axis .eq. 2 )then - do i3=2,n3-1 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id )= u( i1,n2-1,i3) - enddo - enddo - endif - - if( axis .eq. 3 )then - do i2=1,n2 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1,i2,n3-1) - enddo - enddo - endif - - dir = -1 - - buff_id = 2 + dir - buff_len = 0 - - if( axis .eq. 1 )then - do i3=2,n3-1 - do i2=2,n2-1 - buff_len = buff_len + 1 - buff(buff_len,buff_id ) = u( 2, i2,i3) - enddo - enddo - endif - - if( axis .eq. 2 )then - do i3=2,n3-1 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1, 2,i3) - enddo - enddo - endif - - if( axis .eq. 3 )then - do i2=1,n2 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1,i2,2) - enddo - enddo - endif - - do i=1,nm2 - buff(i,4) = buff(i,3) - buff(i,2) = buff(i,1) - enddo - - dir = -1 - - buff_id = 3 + dir - indx = 0 - - if( axis .eq. 1 )then - do i3=2,n3-1 - do i2=2,n2-1 - indx = indx + 1 - u(n1,i2,i3) = buff(indx, buff_id ) - enddo - enddo - endif - - if( axis .eq. 2 )then - do i3=2,n3-1 - do i1=1,n1 - indx = indx + 1 - u(i1,n2,i3) = buff(indx, buff_id ) - enddo - enddo - endif - - if( axis .eq. 3 )then - do i2=1,n2 - do i1=1,n1 - indx = indx + 1 - u(i1,i2,n3) = buff(indx, buff_id ) - enddo - enddo - endif - - - dir = +1 - - buff_id = 3 + dir - indx = 0 - - if( axis .eq. 1 )then - do i3=2,n3-1 - do i2=2,n2-1 - indx = indx + 1 - u(1,i2,i3) = buff(indx, buff_id ) - enddo - enddo - endif - - if( axis .eq. 2 )then - do i3=2,n3-1 - do i1=1,n1 - indx = indx + 1 - u(i1,1,i3) = buff(indx, buff_id ) - enddo - enddo - endif - - if( axis .eq. 3 )then - do i2=1,n2 - do i1=1,n1 - indx = indx + 1 - u(i1,i2,1) = buff(indx, buff_id ) - enddo - enddo - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine comm1p_ex( axis, u, n1, n2, n3, kk ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - include 'globals.h' - - integer axis, dir, n1, n2, n3 - double precision u( n1, n2, n3 ) - - integer i3, i2, i1, buff_len,buff_id - integer i, kk, indx - - if( take_ex( axis, kk ) ) then - - dir = -1 - - buff_id = 3 + dir - buff_len = nm2 - - do i=1,nm2 - buff(i,buff_id) = 0.0D0 - enddo - - - dir = +1 - - buff_id = 3 + dir - buff_len = nm2 - - do i=1,nm2 - buff(i,buff_id) = 0.0D0 - enddo - - - dir = -1 - - buff_id = 3 + dir - indx = 0 - - if( axis .eq. 1 )then - do i3=1,n3 - do i2=1,n2 - indx = indx + 1 - u(n1,i2,i3) = buff(indx, buff_id ) - enddo - enddo - endif - - if( axis .eq. 2 )then - do i3=1,n3 - do i1=1,n1 - indx = indx + 1 - u(i1,n2,i3) = buff(indx, buff_id ) - enddo - enddo - endif - - if( axis .eq. 3 )then - do i2=1,n2 - do i1=1,n1 - indx = indx + 1 - u(i1,i2,n3) = buff(indx, buff_id ) - enddo - enddo - endif - - dir = +1 - - buff_id = 3 + dir - indx = 0 - - if( axis .eq. 1 )then - do i3=1,n3 - do i2=1,n2 - do i1=1,2 - indx = indx + 1 - u(i1,i2,i3) = buff(indx,buff_id) - enddo - enddo - enddo - endif - - if( axis .eq. 2 )then - do i3=1,n3 - do i2=1,2 - do i1=1,n1 - indx = indx + 1 - u(i1,i2,i3) = buff(indx,buff_id) - enddo - enddo - enddo - endif - - if( axis .eq. 3 )then - do i3=1,2 - do i2=1,n2 - do i1=1,n1 - indx = indx + 1 - u(i1,i2,i3) = buff(indx,buff_id) - enddo - enddo - enddo - endif - - endif - - if( give_ex( axis, kk ) )then - - dir = +1 - - buff_id = 2 + dir - buff_len = 0 - - if( axis .eq. 1 )then - do i3=1,n3 - do i2=1,n2 - do i1=n1-1,n1 - buff_len = buff_len + 1 - buff(buff_len,buff_id)= u(i1,i2,i3) - enddo - enddo - enddo - endif - - if( axis .eq. 2 )then - do i3=1,n3 - do i2=n2-1,n2 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len,buff_id )= u(i1,i2,i3) - enddo - enddo - enddo - endif - - if( axis .eq. 3 )then - do i3=n3-1,n3 - do i2=1,n2 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1,i2,i3) - enddo - enddo - enddo - endif - - dir = -1 - - buff_id = 2 + dir - buff_len = 0 - - if( axis .eq. 1 )then - do i3=1,n3 - do i2=1,n2 - buff_len = buff_len + 1 - buff(buff_len,buff_id ) = u( 2, i2,i3) - enddo - enddo - endif - - if( axis .eq. 2 )then - do i3=1,n3 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1, 2,i3) - enddo - enddo - endif - - if( axis .eq. 3 )then - do i2=1,n2 - do i1=1,n1 - buff_len = buff_len + 1 - buff(buff_len, buff_id ) = u( i1,i2,2) - enddo - enddo - endif - - endif - - do i=1,nm2 - buff(i,4) = buff(i,3) - buff(i,2) = buff(i,1) - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine zran3(z,n1,n2,n3,nx,ny,k) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c zran3 loads +1 at ten randomly chosen points, -c loads -1 at a different ten random points, -c and zero elsewhere. -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - - integer is1, is2, is3, ie1, ie2, ie3 - common /grid/ is1,is2,is3,ie1,ie2,ie3 - - integer n1, n2, n3, k, nx, ny, ierr, i0, m0, m1 - double precision z(n1,n2,n3) - - integer mm, i1, i2, i3, d1, e1, e2, e3 - double precision x, a - double precision xx, x0, x1, a1, a2, ai, power - parameter( mm = 10, a = 5.D0 ** 13, x = 314159265.D0) - double precision ten( mm, 0:1 ), temp, best - integer i, j1( mm, 0:1 ), j2( mm, 0:1 ), j3( mm, 0:1 ) - integer jg( 0:3, mm, 0:1 ), jg_temp(4) - - external randlc - double precision randlc, rdummy - - a1 = power( a, nx, 1, 0 ) - a2 = power( a, nx, ny, 0 ) - - call zero3(z,n1,n2,n3) - -c i = is1-2+nx*(is2-2+ny*(is3-2)) - - ai = power( a, nx, is2-2+ny*(is3-2), is1-2 ) - d1 = ie1 - is1 + 1 - e1 = ie1 - is1 + 2 - e2 = ie2 - is2 + 2 - e3 = ie3 - is3 + 2 - x0 = x - rdummy = randlc( x0, ai ) - do i3 = 2, e3 - x1 = x0 - do i2 = 2, e2 - xx = x1 - call vranlc( d1, xx, a, z( 2, i2, i3 )) - rdummy = randlc( x1, a1 ) - enddo - rdummy = randlc( x0, a2 ) - enddo - -c--------------------------------------------------------------------- -c call comm3(z,n1,n2,n3) -c call showall(z,n1,n2,n3) -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c each processor looks for twenty candidates -c--------------------------------------------------------------------- - do i=1,mm - ten( i, 1 ) = 0.0D0 - j1( i, 1 ) = 0 - j2( i, 1 ) = 0 - j3( i, 1 ) = 0 - ten( i, 0 ) = 1.0D0 - j1( i, 0 ) = 0 - j2( i, 0 ) = 0 - j3( i, 0 ) = 0 - enddo - - do i3=2,n3-1 - do i2=2,n2-1 - do i1=2,n1-1 - if( z(i1,i2,i3) .gt. ten( 1, 1 ) )then - ten(1,1) = z(i1,i2,i3) - j1(1,1) = i1 - j2(1,1) = i2 - j3(1,1) = i3 - call bubble( ten, j1, j2, j3, mm, 1 ) - endif - if( z(i1,i2,i3) .lt. ten( 1, 0 ) )then - ten(1,0) = z(i1,i2,i3) - j1(1,0) = i1 - j2(1,0) = i2 - j3(1,0) = i3 - call bubble( ten, j1, j2, j3, mm, 0 ) - endif - enddo - enddo - enddo - - call mpi_barrier(mpi_comm_world,ierr) - -c--------------------------------------------------------------------- -c Now which of these are globally best? -c--------------------------------------------------------------------- - i1 = mm - i0 = mm - do i=mm,1,-1 - - best = z( j1(i1,1), j2(i1,1), j3(i1,1) ) - call mpi_allreduce(best,temp,1,dp_type, - > mpi_max,mpi_comm_world,ierr) - best = temp - if(best.eq.z(j1(i1,1),j2(i1,1),j3(i1,1)))then - jg( 0, i, 1) = me - jg( 1, i, 1) = is1 - 2 + j1( i1, 1 ) - jg( 2, i, 1) = is2 - 2 + j2( i1, 1 ) - jg( 3, i, 1) = is3 - 2 + j3( i1, 1 ) - i1 = i1-1 - else - jg( 0, i, 1) = 0 - jg( 1, i, 1) = 0 - jg( 2, i, 1) = 0 - jg( 3, i, 1) = 0 - endif - ten( i, 1 ) = best - call mpi_allreduce(jg(0,i,1), jg_temp,4,MPI_INTEGER, - > mpi_max,mpi_comm_world,ierr) - jg( 0, i, 1) = jg_temp(1) - jg( 1, i, 1) = jg_temp(2) - jg( 2, i, 1) = jg_temp(3) - jg( 3, i, 1) = jg_temp(4) - - best = z( j1(i0,0), j2(i0,0), j3(i0,0) ) - call mpi_allreduce(best,temp,1,dp_type, - > mpi_min,mpi_comm_world,ierr) - best = temp - if(best.eq.z(j1(i0,0),j2(i0,0),j3(i0,0)))then - jg( 0, i, 0) = me - jg( 1, i, 0) = is1 - 2 + j1( i0, 0 ) - jg( 2, i, 0) = is2 - 2 + j2( i0, 0 ) - jg( 3, i, 0) = is3 - 2 + j3( i0, 0 ) - i0 = i0-1 - else - jg( 0, i, 0) = 0 - jg( 1, i, 0) = 0 - jg( 2, i, 0) = 0 - jg( 3, i, 0) = 0 - endif - ten( i, 0 ) = best - call mpi_allreduce(jg(0,i,0), jg_temp,4,MPI_INTEGER, - > mpi_max,mpi_comm_world,ierr) - jg( 0, i, 0) = jg_temp(1) - jg( 1, i, 0) = jg_temp(2) - jg( 2, i, 0) = jg_temp(3) - jg( 3, i, 0) = jg_temp(4) - - enddo - m1 = i1+1 - m0 = i0+1 - -c if( me .eq. root) then -c write(*,*)' ' -c write(*,*)' negative charges at' -c write(*,9)(jg(1,i,0),jg(2,i,0),jg(3,i,0),i=1,mm) -c write(*,*)' positive charges at' -c write(*,9)(jg(1,i,1),jg(2,i,1),jg(3,i,1),i=1,mm) -c write(*,*)' small random numbers were' -c write(*,8)(ten( i,0),i=mm,1,-1) -c write(*,*)' and they were found on processor number' -c write(*,7)(jg(0,i,0),i=mm,1,-1) -c write(*,*)' large random numbers were' -c write(*,8)(ten( i,1),i=mm,1,-1) -c write(*,*)' and they were found on processor number' -c write(*,7)(jg(0,i,1),i=mm,1,-1) -c endif -c 9 format(5(' (',i3,2(',',i3),')')) -c 8 format(5D15.8) -c 7 format(10i4) - call mpi_barrier(mpi_comm_world,ierr) - do i3=1,n3 - do i2=1,n2 - do i1=1,n1 - z(i1,i2,i3) = 0.0D0 - enddo - enddo - enddo - do i=mm,m0,-1 - z( j1(i,0), j2(i,0), j3(i,0) ) = -1.0D0 - enddo - do i=mm,m1,-1 - z( j1(i,1), j2(i,1), j3(i,1) ) = +1.0D0 - enddo - call comm3(z,n1,n2,n3,k) - -c--------------------------------------------------------------------- -c call showall(z,n1,n2,n3) -c--------------------------------------------------------------------- - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine show_l(z,n1,n2,n3) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - - integer n1,n2,n3,i1,i2,i3,ierr - double precision z(n1,n2,n3) - integer m1, m2, m3,i - - m1 = min(n1,18) - m2 = min(n2,14) - m3 = min(n3,18) - - write(*,*)' ' - do i=0,nprocs-1 - if( me .eq. i )then - write(*,*)' id = ', me - do i3=1,m3 - do i1=1,m1 - write(*,6)(z(i1,i2,i3),i2=1,m2) - enddo - write(*,*)' - - - - - - - ' - enddo - write(*,*)' ' - 6 format(6f15.11) - endif - call mpi_barrier(mpi_comm_world,ierr) - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine showall(z,n1,n2,n3) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - - integer n1,n2,n3,i1,i2,i3,i,ierr - double precision z(n1,n2,n3) - integer m1, m2, m3 - - m1 = min(n1,18) - m2 = min(n2,14) - m3 = min(n3,18) - - write(*,*)' ' - do i=0,nprocs-1 - if( me .eq. i )then - write(*,*)' id = ', me - do i3=1,m3 - do i1=1,m1 - write(*,6)(z(i1,i2,i3),i2=1,m2) - enddo - write(*,*)' - - - - - - - ' - enddo - write(*,*)' ' - 6 format(15f6.3) - endif - call mpi_barrier(mpi_comm_world,ierr) - enddo - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine show(z,n1,n2,n3) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - integer n1,n2,n3,i1,i2,i3,ierr,i - double precision z(n1,n2,n3) - - write(*,*)' ' - do i=0,nprocs-1 - if( me .eq. i )then - write(*,*)' id = ', me - do i3=2,n3-1 - do i1=2,n1-1 - write(*,6)(z(i1,i2,i3),i2=2,n1-1) - enddo - write(*,*)' - - - - - - - ' - enddo - write(*,*)' ' - 6 format(8D10.3) - endif - call mpi_barrier(mpi_comm_world,ierr) - enddo - -c call comm3(z,n1,n2,n3) - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - double precision function power( a, n1, n2, n3 ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c power raises an integer, disguised as a double -c precision real, to an integer power. -c This version tries to avoid integer overflow by treating -c it as expressed in a form of "n1*n2+n3". -c--------------------------------------------------------------------- - implicit none - - double precision a, aj - integer n1, n2, n3 - - integer n1j, n2j, nj - external randlc - double precision randlc, rdummy - - power = 1.0d0 - aj = a - nj = n3 - n1j = n1 - n2j = n2 - 100 continue - - if( n2j .gt. 0 ) then - if( mod(n2j,2) .eq. 1 ) nj = nj + n1j - n2j = n2j/2 - else if( nj .eq. 0 ) then - go to 200 - endif - if( mod(nj,2) .eq. 1 ) rdummy = randlc( power, aj ) - rdummy = randlc( aj, aj ) - nj = nj/2 - go to 100 - - 200 continue - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine bubble( ten, j1, j2, j3, m, ind ) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c bubble does a bubble sort in direction dir -c--------------------------------------------------------------------- - implicit none - - include 'mpinpb.h' - - integer m, ind, j1( m, 0:1 ), j2( m, 0:1 ), j3( m, 0:1 ) - double precision ten( m, 0:1 ) - double precision temp - integer i, j_temp - - if( ind .eq. 1 )then - - do i=1,m-1 - if( ten(i,ind) .gt. ten(i+1,ind) )then - - temp = ten( i+1, ind ) - ten( i+1, ind ) = ten( i, ind ) - ten( i, ind ) = temp - - j_temp = j1( i+1, ind ) - j1( i+1, ind ) = j1( i, ind ) - j1( i, ind ) = j_temp - - j_temp = j2( i+1, ind ) - j2( i+1, ind ) = j2( i, ind ) - j2( i, ind ) = j_temp - - j_temp = j3( i+1, ind ) - j3( i+1, ind ) = j3( i, ind ) - j3( i, ind ) = j_temp - - else - return - endif - enddo - - else - - do i=1,m-1 - if( ten(i,ind) .lt. ten(i+1,ind) )then - - temp = ten( i+1, ind ) - ten( i+1, ind ) = ten( i, ind ) - ten( i, ind ) = temp - - j_temp = j1( i+1, ind ) - j1( i+1, ind ) = j1( i, ind ) - j1( i, ind ) = j_temp - - j_temp = j2( i+1, ind ) - j2( i+1, ind ) = j2( i, ind ) - j2( i, ind ) = j_temp - - j_temp = j3( i+1, ind ) - j3( i+1, ind ) = j3( i, ind ) - j3( i, ind ) = j_temp - - else - return - endif - enddo - - endif - - return - end - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine zero3(z,n1,n2,n3) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - - include 'mpinpb.h' - - integer n1, n2, n3 - double precision z(n1,n2,n3) - integer i1, i2, i3 - - do i3=1,n3 - do i2=1,n2 - do i1=1,n1 - z(i1,i2,i3)=0.0D0 - enddo - enddo - enddo - - return - end - - -c----- end of program ------------------------------------------------ diff --git a/examples/smpi/NAS/MG/mg.input.sample b/examples/smpi/NAS/MG/mg.input.sample deleted file mode 100644 index a4dcf81275..0000000000 --- a/examples/smpi/NAS/MG/mg.input.sample +++ /dev/null @@ -1,4 +0,0 @@ - 8 = top level - 256 256 256 = nx ny nz - 20 = nit - 0 0 0 0 0 0 0 0 = debug_vec diff --git a/examples/smpi/NAS/MG/mpinpb.h b/examples/smpi/NAS/MG/mpinpb.h deleted file mode 100644 index 1f0368c0b7..0000000000 --- a/examples/smpi/NAS/MG/mpinpb.h +++ /dev/null @@ -1,9 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'mpif.h' - - integer me, nprocs, root, dp_type - common /mpistuff/ me, nprocs, root, dp_type - diff --git a/examples/smpi/NAS/Makefile b/examples/smpi/NAS/Makefile index f40f6b1226..7f1bee88a4 100644 --- a/examples/smpi/NAS/Makefile +++ b/examples/smpi/NAS/Makefile @@ -8,26 +8,6 @@ SFILE=config/suite.def default: header @ sys/print_instructions -BT: bt -bt: header - cd BT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) SUBTYPE=$(SUBTYPE) VERSION=$(VERSION) - -SP: sp -sp: header - cd SP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) - -LU: lu -lu: header - cd LU; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) VERSION=$(VERSION) - -MG: mg -mg: header - cd MG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) - -FT: ft -ft: header - cd FT; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) - IS: is is: header cd IS; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) @@ -36,10 +16,6 @@ IS-trace: is-trace is-trace: header cd IS-trace; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) -CG: cg -cg: header - cd CG; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) - EP: ep ep: header cd EP; $(MAKE) NPROCS=$(NPROCS) CLASS=$(CLASS) diff --git a/examples/smpi/NAS/SP/Makefile b/examples/smpi/NAS/SP/Makefile deleted file mode 100644 index 01508aa935..0000000000 --- a/examples/smpi/NAS/SP/Makefile +++ /dev/null @@ -1,60 +0,0 @@ -SHELL=/bin/sh -BENCHMARK=sp -BENCHMARKU=SP - -include ../config/make.def - - -OBJS = sp.o make_set.o initialize.o exact_solution.o exact_rhs.o \ - set_constants.o adi.o define.o copy_faces.o rhs.o \ - lhsx.o lhsy.o lhsz.o x_solve.o ninvr.o y_solve.o pinvr.o \ - z_solve.o tzetar.o add.o txinvr.o error.o verify.o setup_mpi.o \ - ${COMMON}/print_results.o ${COMMON}/timers.o - -include ../sys/make.common - -# npbparams.h is included by header.h -# The following rule should do the trick but many make programs (not gmake) -# will do the wrong thing and rebuild the world every time (because the -# mod time on header.h is not changed. One solution would be to -# touch header.h but this might cause confusion if someone has -# accidentally deleted it. Instead, make the dependency on npbparams.h -# explicit in all the lines below (even though dependence is indirect). - -# header.h: npbparams.h - -${PROGRAM}: config ${OBJS} - ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} ${FMPI_LIB} - -.f.o: - ${FCOMPILE} $< - -sp.o: sp.f header.h npbparams.h mpinpb.h -make_set.o: make_set.f header.h npbparams.h mpinpb.h -initialize.o: initialize.f header.h npbparams.h -exact_solution.o: exact_solution.f header.h npbparams.h -exact_rhs.o: exact_rhs.f header.h npbparams.h -set_constants.o: set_constants.f header.h npbparams.h -adi.o: adi.f header.h npbparams.h -define.o: define.f header.h npbparams.h -copy_faces.o: copy_faces.f header.h npbparams.h mpinpb.h -rhs.o: rhs.f header.h npbparams.h -lhsx.o: lhsx.f header.h npbparams.h -lhsy.o: lhsy.f header.h npbparams.h -lhsz.o: lhsz.f header.h npbparams.h -x_solve.o: x_solve.f header.h npbparams.h mpinpb.h -ninvr.o: ninvr.f header.h npbparams.h -y_solve.o: y_solve.f header.h npbparams.h mpinpb.h -pinvr.o: pinvr.f header.h npbparams.h -z_solve.o: z_solve.f header.h npbparams.h mpinpb.h -tzetar.o: tzetar.f header.h npbparams.h -add.o: add.f header.h npbparams.h -txinvr.o: txinvr.f header.h npbparams.h -error.o: error.f header.h npbparams.h mpinpb.h -verify.o: verify.f header.h npbparams.h mpinpb.h -setup_mpi.o: setup_mpi.f mpinpb.h npbparams.h - - -clean: - - rm -f *.o *~ mputil* - - rm -f npbparams.h core diff --git a/examples/smpi/NAS/SP/README b/examples/smpi/NAS/SP/README deleted file mode 100644 index fe423db43f..0000000000 --- a/examples/smpi/NAS/SP/README +++ /dev/null @@ -1,17 +0,0 @@ - -This code implements a 3D Multi-partition algorithm for the solution -of the uncoupled systems of linear equations resulting from -Beam-Warming approximate factorization. Consequently, the program -must be run on a square number of processors. The included file -"npbparams.h" contains a parameter statement which sets "maxcells" -and "problem_size". The parameter maxcells must be set to the -square root of the number of processors. For example, if running -on 25 processors, then set max_cells=5. The standard problem sizes -are problem_size=64 for class A, 102 for class B, and 162 for class C. - -The number of time steps and the time step size dt are set in the -npbparams.h but may be overridden in the input deck "inputsp.data". -The number of time steps is 400 for all three -standard problems, and the appropriate time step sizes "dt" are -0.0015d0 for class A, 0.001d0 for class B, and 0.00067 for class C. - diff --git a/examples/smpi/NAS/SP/add.f b/examples/smpi/NAS/SP/add.f deleted file mode 100644 index cdc4765cbf..0000000000 --- a/examples/smpi/NAS/SP/add.f +++ /dev/null @@ -1,31 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine add - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c addition of update to the vector u -c--------------------------------------------------------------------- - - include 'header.h' - - integer c, i, j, k, m - - do c = 1, ncells - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - u(i,j,k,m,c) = u(i,j,k,m,c) + rhs(i,j,k,m,c) - end do - end do - end do - end do - end do - - return - end diff --git a/examples/smpi/NAS/SP/adi.f b/examples/smpi/NAS/SP/adi.f deleted file mode 100644 index e55cfd60da..0000000000 --- a/examples/smpi/NAS/SP/adi.f +++ /dev/null @@ -1,24 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine adi - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - call copy_faces - - call txinvr - - call x_solve - - call y_solve - - call z_solve - - call add - - return - end - diff --git a/examples/smpi/NAS/SP/copy_faces.f b/examples/smpi/NAS/SP/copy_faces.f deleted file mode 100644 index 41824d2198..0000000000 --- a/examples/smpi/NAS/SP/copy_faces.f +++ /dev/null @@ -1,306 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine copy_faces - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c this function copies the face values of a variable defined on a set -c of cells to the overlap locations of the adjacent sets of cells. -c Because a set of cells interfaces in each direction with exactly one -c other set, we only need to fill six different buffers. We could try to -c overlap communication with computation, by computing -c some internal values while communicating boundary values, but this -c adds so much overhead that it's not clearly useful. -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i, j, k, c, m, requests(0:11), p0, p1, - > p2, p3, p4, p5, b_size(0:5), ss(0:5), - > sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11) - -c--------------------------------------------------------------------- -c exit immediately if there are no faces to be copied -c--------------------------------------------------------------------- - if (no_nodes .eq. 1) then - call compute_rhs - return - endif - - - ss(0) = start_send_east - ss(1) = start_send_west - ss(2) = start_send_north - ss(3) = start_send_south - ss(4) = start_send_top - ss(5) = start_send_bottom - - sr(0) = start_recv_east - sr(1) = start_recv_west - sr(2) = start_recv_north - sr(3) = start_recv_south - sr(4) = start_recv_top - sr(5) = start_recv_bottom - - b_size(0) = east_size - b_size(1) = west_size - b_size(2) = north_size - b_size(3) = south_size - b_size(4) = top_size - b_size(5) = bottom_size - -c--------------------------------------------------------------------- -c because the difference stencil for the diagonalized scheme is -c orthogonal, we do not have to perform the staged copying of faces, -c but can send all face information simultaneously to the neighboring -c cells in all directions -c--------------------------------------------------------------------- - p0 = 0 - p1 = 0 - p2 = 0 - p3 = 0 - p4 = 0 - p5 = 0 - - do c = 1, ncells - do m = 1, 5 - -c--------------------------------------------------------------------- -c fill the buffer to be sent to eastern neighbors (i-dir) -c--------------------------------------------------------------------- - if (cell_coord(1,c) .ne. ncells) then - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = cell_size(1,c)-2, cell_size(1,c)-1 - out_buffer(ss(0)+p0) = u(i,j,k,m,c) - p0 = p0 + 1 - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c fill the buffer to be sent to western neighbors -c--------------------------------------------------------------------- - if (cell_coord(1,c) .ne. 1) then - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = 0, 1 - out_buffer(ss(1)+p1) = u(i,j,k,m,c) - p1 = p1 + 1 - end do - end do - end do - - - endif - -c--------------------------------------------------------------------- -c fill the buffer to be sent to northern neighbors (j_dir) -c--------------------------------------------------------------------- - if (cell_coord(2,c) .ne. ncells) then - do k = 0, cell_size(3,c)-1 - do j = cell_size(2,c)-2, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - out_buffer(ss(2)+p2) = u(i,j,k,m,c) - p2 = p2 + 1 - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c fill the buffer to be sent to southern neighbors -c--------------------------------------------------------------------- - if (cell_coord(2,c).ne. 1) then - do k = 0, cell_size(3,c)-1 - do j = 0, 1 - do i = 0, cell_size(1,c)-1 - out_buffer(ss(3)+p3) = u(i,j,k,m,c) - p3 = p3 + 1 - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c fill the buffer to be sent to top neighbors (k-dir) -c--------------------------------------------------------------------- - if (cell_coord(3,c) .ne. ncells) then - do k = cell_size(3,c)-2, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - out_buffer(ss(4)+p4) = u(i,j,k,m,c) - p4 = p4 + 1 - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c fill the buffer to be sent to bottom neighbors -c--------------------------------------------------------------------- - if (cell_coord(3,c).ne. 1) then - do k=0, 1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - out_buffer(ss(5)+p5) = u(i,j,k,m,c) - p5 = p5 + 1 - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c m loop -c--------------------------------------------------------------------- - end do - -c--------------------------------------------------------------------- -c cell loop -c--------------------------------------------------------------------- - end do - - call mpi_irecv(in_buffer(sr(0)), b_size(0), - > dp_type, successor(1), WEST, - > comm_rhs, requests(0), error) - call mpi_irecv(in_buffer(sr(1)), b_size(1), - > dp_type, predecessor(1), EAST, - > comm_rhs, requests(1), error) - call mpi_irecv(in_buffer(sr(2)), b_size(2), - > dp_type, successor(2), SOUTH, - > comm_rhs, requests(2), error) - call mpi_irecv(in_buffer(sr(3)), b_size(3), - > dp_type, predecessor(2), NORTH, - > comm_rhs, requests(3), error) - call mpi_irecv(in_buffer(sr(4)), b_size(4), - > dp_type, successor(3), BOTTOM, - > comm_rhs, requests(4), error) - call mpi_irecv(in_buffer(sr(5)), b_size(5), - > dp_type, predecessor(3), TOP, - > comm_rhs, requests(5), error) - - call mpi_isend(out_buffer(ss(0)), b_size(0), - > dp_type, successor(1), EAST, - > comm_rhs, requests(6), error) - call mpi_isend(out_buffer(ss(1)), b_size(1), - > dp_type, predecessor(1), WEST, - > comm_rhs, requests(7), error) - call mpi_isend(out_buffer(ss(2)), b_size(2), - > dp_type,successor(2), NORTH, - > comm_rhs, requests(8), error) - call mpi_isend(out_buffer(ss(3)), b_size(3), - > dp_type,predecessor(2), SOUTH, - > comm_rhs, requests(9), error) - call mpi_isend(out_buffer(ss(4)), b_size(4), - > dp_type,successor(3), TOP, - > comm_rhs, requests(10), error) - call mpi_isend(out_buffer(ss(5)), b_size(5), - > dp_type,predecessor(3), BOTTOM, - > comm_rhs,requests(11), error) - - - call mpi_waitall(12, requests, statuses, error) - -c--------------------------------------------------------------------- -c unpack the data that has just been received; -c--------------------------------------------------------------------- - p0 = 0 - p1 = 0 - p2 = 0 - p3 = 0 - p4 = 0 - p5 = 0 - - do c = 1, ncells - do m = 1, 5 - - if (cell_coord(1,c) .ne. 1) then - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = -2, -1 - u(i,j,k,m,c) = in_buffer(sr(1)+p0) - p0 = p0 + 1 - end do - end do - end do - endif - - if (cell_coord(1,c) .ne. ncells) then - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = cell_size(1,c), cell_size(1,c)+1 - u(i,j,k,m,c) = in_buffer(sr(0)+p1) - p1 = p1 + 1 - end do - end do - end do - end if - - if (cell_coord(2,c) .ne. 1) then - do k = 0, cell_size(3,c)-1 - do j = -2, -1 - do i = 0, cell_size(1,c)-1 - u(i,j,k,m,c) = in_buffer(sr(3)+p2) - p2 = p2 + 1 - end do - end do - end do - - endif - - if (cell_coord(2,c) .ne. ncells) then - do k = 0, cell_size(3,c)-1 - do j = cell_size(2,c), cell_size(2,c)+1 - do i = 0, cell_size(1,c)-1 - u(i,j,k,m,c) = in_buffer(sr(2)+p3) - p3 = p3 + 1 - end do - end do - end do - endif - - if (cell_coord(3,c) .ne. 1) then - do k = -2, -1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - u(i,j,k,m,c) = in_buffer(sr(5)+p4) - p4 = p4 + 1 - end do - end do - end do - endif - - if (cell_coord(3,c) .ne. ncells) then - do k = cell_size(3,c), cell_size(3,c)+1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - u(i,j,k,m,c) = in_buffer(sr(4)+p5) - p5 = p5 + 1 - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c m loop -c--------------------------------------------------------------------- - end do - -c--------------------------------------------------------------------- -c cells loop -c--------------------------------------------------------------------- - end do - -c--------------------------------------------------------------------- -c now that we have all the data, compute the rhs -c--------------------------------------------------------------------- - call compute_rhs - - return - end diff --git a/examples/smpi/NAS/SP/define.f b/examples/smpi/NAS/SP/define.f deleted file mode 100644 index c465533f9a..0000000000 --- a/examples/smpi/NAS/SP/define.f +++ /dev/null @@ -1,66 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine compute_buffer_size(dim) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - - integer c, dim, face_size - - if (ncells .eq. 1) return - -c--------------------------------------------------------------------- -c compute the actual sizes of the buffers; note that there is -c always one cell face that doesn't need buffer space, because it -c is at the boundary of the grid -c--------------------------------------------------------------------- - - west_size = 0 - east_size = 0 - - do c = 1, ncells - face_size = cell_size(2,c) * cell_size(3,c) * dim * 2 - if (cell_coord(1,c).ne.1) west_size = west_size + face_size - if (cell_coord(1,c).ne.ncells) east_size = east_size + - > face_size - end do - - north_size = 0 - south_size = 0 - do c = 1, ncells - face_size = cell_size(1,c)*cell_size(3,c) * dim * 2 - if (cell_coord(2,c).ne.1) south_size = south_size + face_size - if (cell_coord(2,c).ne.ncells) north_size = north_size + - > face_size - end do - - top_size = 0 - bottom_size = 0 - do c = 1, ncells - face_size = cell_size(1,c) * cell_size(2,c) * dim * 2 - if (cell_coord(3,c).ne.1) bottom_size = bottom_size + - > face_size - if (cell_coord(3,c).ne.ncells) top_size = top_size + - > face_size - end do - - start_send_west = 1 - start_send_east = start_send_west + west_size - start_send_south = start_send_east + east_size - start_send_north = start_send_south + south_size - start_send_bottom = start_send_north + north_size - start_send_top = start_send_bottom + bottom_size - start_recv_west = 1 - start_recv_east = start_recv_west + west_size - start_recv_south = start_recv_east + east_size - start_recv_north = start_recv_south + south_size - start_recv_bottom = start_recv_north + north_size - start_recv_top = start_recv_bottom + bottom_size - - return - end - diff --git a/examples/smpi/NAS/SP/error.f b/examples/smpi/NAS/SP/error.f deleted file mode 100644 index fd9aab37b3..0000000000 --- a/examples/smpi/NAS/SP/error.f +++ /dev/null @@ -1,105 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine error_norm(rms) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c this function computes the norm of the difference between the -c computed solution and the exact solution -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer c, i, j, k, m, ii, jj, kk, d, error - double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5), - > add - - do m = 1, 5 - rms_work(m) = 0.0d0 - end do - - do c = 1, ncells - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - ii = 0 - do i = cell_low(1,c), cell_high(1,c) - xi = dble(i) * dnxm1 - call exact_solution(xi, eta, zeta, u_exact) - - do m = 1, 5 - add = u(ii,jj,kk,m,c)-u_exact(m) - rms_work(m) = rms_work(m) + add*add - end do - ii = ii + 1 - end do - jj = jj + 1 - end do - kk = kk + 1 - end do - end do - - call mpi_allreduce(rms_work, rms, 5, dp_type, - > MPI_SUM, comm_setup, error) - - do m = 1, 5 - do d = 1, 3 - rms(m) = rms(m) / dble(grid_points(d)-2) - end do - rms(m) = dsqrt(rms(m)) - end do - - return - end - - - - subroutine rhs_norm(rms) - - include 'header.h' - include 'mpinpb.h' - - integer c, i, j, k, d, m, error - double precision rms(5), rms_work(5), add - - do m = 1, 5 - rms_work(m) = 0.0d0 - end do - - do c = 1, ncells - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - do m = 1, 5 - add = rhs(i,j,k,m,c) - rms_work(m) = rms_work(m) + add*add - end do - end do - end do - end do - end do - - - - call mpi_allreduce(rms_work, rms, 5, dp_type, - > MPI_SUM, comm_setup, error) - - do m = 1, 5 - do d = 1, 3 - rms(m) = rms(m) / dble(grid_points(d)-2) - end do - rms(m) = dsqrt(rms(m)) - end do - - return - end - - diff --git a/examples/smpi/NAS/SP/exact_rhs.f b/examples/smpi/NAS/SP/exact_rhs.f deleted file mode 100644 index b589668126..0000000000 --- a/examples/smpi/NAS/SP/exact_rhs.f +++ /dev/null @@ -1,363 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine exact_rhs - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c compute the right hand side based on exact solution -c--------------------------------------------------------------------- - - include 'header.h' - - double precision dtemp(5), xi, eta, zeta, dtpp - integer c, m, i, j, k, ip1, im1, jp1, - > jm1, km1, kp1 - -c--------------------------------------------------------------------- -c loop over all cells owned by this node -c--------------------------------------------------------------------- - do c = 1, ncells - -c--------------------------------------------------------------------- -c initialize -c--------------------------------------------------------------------- - do m = 1, 5 - do k= 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - forcing(i,j,k,m,c) = 0.0d0 - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c xi-direction flux differences -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - zeta = dble(k+cell_low(3,c)) * dnzm1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - eta = dble(j+cell_low(2,c)) * dnym1 - - do i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c) - xi = dble(i+cell_low(1,c)) * dnxm1 - - call exact_solution(xi, eta, zeta, dtemp) - do m = 1, 5 - ue(i,m) = dtemp(m) - end do - - dtpp = 1.0d0 / dtemp(1) - - do m = 2, 5 - buf(i,m) = dtpp * dtemp(m) - end do - - cuf(i) = buf(i,2) * buf(i,2) - buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) + - > buf(i,4) * buf(i,4) - q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) + - > buf(i,4)*ue(i,4)) - - end do - - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - im1 = i-1 - ip1 = i+1 - - forcing(i,j,k,1,c) = forcing(i,j,k,1,c) - - > tx2*( ue(ip1,2)-ue(im1,2) )+ - > dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1)) - - forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tx2 * ( - > (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))- - > (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+ - > xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+ - > dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2)) - - forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tx2 * ( - > ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+ - > xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+ - > dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3)) - - forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tx2*( - > ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+ - > xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+ - > dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4)) - - forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tx2*( - > buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))- - > buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+ - > 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+ - > buf(im1,1))+ - > xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+ - > xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+ - > dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5)) - end do - -c--------------------------------------------------------------------- -c Fourth-order dissipation -c--------------------------------------------------------------------- - if (start(1,c) .gt. 0) then - do m = 1, 5 - i = 1 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m)) - i = 2 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) - - > 4.0d0*ue(i+1,m) + ue(i+2,m)) - end do - endif - - do m = 1, 5 - do i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp* - > (ue(i-2,m) - 4.0d0*ue(i-1,m) + - > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m)) - end do - end do - - if (end(1,c) .gt. 0) then - do m = 1, 5 - i = cell_size(1,c)-3 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (ue(i-2,m) - 4.0d0*ue(i-1,m) + - > 6.0d0*ue(i,m) - 4.0d0*ue(i+1,m)) - i = cell_size(1,c)-2 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m)) - end do - endif - - end do - end do -c--------------------------------------------------------------------- -c eta-direction flux differences -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - zeta = dble(k+cell_low(3,c)) * dnzm1 - do i=start(1,c), cell_size(1,c)-end(1,c)-1 - xi = dble(i+cell_low(1,c)) * dnxm1 - - do j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c) - eta = dble(j+cell_low(2,c)) * dnym1 - - call exact_solution(xi, eta, zeta, dtemp) - do m = 1, 5 - ue(j,m) = dtemp(m) - end do - dtpp = 1.0d0/dtemp(1) - - do m = 2, 5 - buf(j,m) = dtpp * dtemp(m) - end do - - cuf(j) = buf(j,3) * buf(j,3) - buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) + - > buf(j,4) * buf(j,4) - q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) + - > buf(j,4)*ue(j,4)) - end do - - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - jm1 = j-1 - jp1 = j+1 - - forcing(i,j,k,1,c) = forcing(i,j,k,1,c) - - > ty2*( ue(jp1,3)-ue(jm1,3) )+ - > dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1)) - - forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - ty2*( - > ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+ - > yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+ - > dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2)) - - forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - ty2*( - > (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))- - > (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+ - > yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+ - > dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3)) - - forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - ty2*( - > ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+ - > yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+ - > dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4)) - - forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - ty2*( - > buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))- - > buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+ - > 0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+ - > buf(jm1,1))+ - > yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+ - > yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+ - > dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5)) - end do - -c--------------------------------------------------------------------- -c Fourth-order dissipation -c--------------------------------------------------------------------- - if (start(2,c) .gt. 0) then - do m = 1, 5 - j = 1 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m)) - j = 2 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) - - > 4.0d0*ue(j+1,m) + ue(j+2,m)) - end do - endif - - do m = 1, 5 - do j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp* - > (ue(j-2,m) - 4.0d0*ue(j-1,m) + - > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m)) - end do - end do - if (end(2,c) .gt. 0) then - do m = 1, 5 - j = cell_size(2,c)-3 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (ue(j-2,m) - 4.0d0*ue(j-1,m) + - > 6.0d0*ue(j,m) - 4.0d0*ue(j+1,m)) - j = cell_size(2,c)-2 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m)) - - end do - endif - - end do - end do - -c--------------------------------------------------------------------- -c zeta-direction flux differences -c--------------------------------------------------------------------- - do j=start(2,c), cell_size(2,c)-end(2,c)-1 - eta = dble(j+cell_low(2,c)) * dnym1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - xi = dble(i+cell_low(1,c)) * dnxm1 - - do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c) - zeta = dble(k+cell_low(3,c)) * dnzm1 - - call exact_solution(xi, eta, zeta, dtemp) - do m = 1, 5 - ue(k,m) = dtemp(m) - end do - - dtpp = 1.0d0/dtemp(1) - - do m = 2, 5 - buf(k,m) = dtpp * dtemp(m) - end do - - cuf(k) = buf(k,4) * buf(k,4) - buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) + - > buf(k,3) * buf(k,3) - q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) + - > buf(k,4)*ue(k,4)) - end do - - do k=start(3,c), cell_size(3,c)-end(3,c)-1 - km1 = k-1 - kp1 = k+1 - - forcing(i,j,k,1,c) = forcing(i,j,k,1,c) - - > tz2*( ue(kp1,4)-ue(km1,4) )+ - > dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1)) - - forcing(i,j,k,2,c) = forcing(i,j,k,2,c) - tz2 * ( - > ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+ - > zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+ - > dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2)) - - forcing(i,j,k,3,c) = forcing(i,j,k,3,c) - tz2 * ( - > ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+ - > zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+ - > dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3)) - - forcing(i,j,k,4,c) = forcing(i,j,k,4,c) - tz2 * ( - > (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))- - > (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+ - > zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+ - > dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4)) - - forcing(i,j,k,5,c) = forcing(i,j,k,5,c) - tz2 * ( - > buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))- - > buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+ - > 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1) - > +buf(km1,1))+ - > zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+ - > zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+ - > dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5)) - end do - -c--------------------------------------------------------------------- -c Fourth-order dissipation -c--------------------------------------------------------------------- - if (start(3,c) .gt. 0) then - do m = 1, 5 - k = 1 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m)) - k = 2 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) - - > 4.0d0*ue(k+1,m) + ue(k+2,m)) - end do - endif - - do m = 1, 5 - do k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp* - > (ue(k-2,m) - 4.0d0*ue(k-1,m) + - > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m)) - end do - end do - - if (end(3,c) .gt. 0) then - do m = 1, 5 - k = cell_size(3,c)-3 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (ue(k-2,m) - 4.0d0*ue(k-1,m) + - > 6.0d0*ue(k,m) - 4.0d0*ue(k+1,m)) - k = cell_size(3,c)-2 - forcing(i,j,k,m,c) = forcing(i,j,k,m,c) - dssp * - > (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m)) - end do - endif - - end do - end do -c--------------------------------------------------------------------- -c now change the sign of the forcing function, -c--------------------------------------------------------------------- - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - forcing(i,j,k,m,c) = -1.d0 * forcing(i,j,k,m,c) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c cell loop -c--------------------------------------------------------------------- - end do - - return - end - - - - - diff --git a/examples/smpi/NAS/SP/exact_solution.f b/examples/smpi/NAS/SP/exact_solution.f deleted file mode 100644 index 2644f0b8f9..0000000000 --- a/examples/smpi/NAS/SP/exact_solution.f +++ /dev/null @@ -1,30 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine exact_solution(xi,eta,zeta,dtemp) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c this function returns the exact solution at point xi, eta, zeta -c--------------------------------------------------------------------- - - include 'header.h' - - double precision xi, eta, zeta, dtemp(5) - integer m - - do m = 1, 5 - dtemp(m) = ce(m,1) + - > xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) + - > eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+ - > zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) + - > zeta*ce(m,13)))) - end do - - return - end - - diff --git a/examples/smpi/NAS/SP/header.h b/examples/smpi/NAS/SP/header.h deleted file mode 100644 index 663515a104..0000000000 --- a/examples/smpi/NAS/SP/header.h +++ /dev/null @@ -1,113 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - implicit none - -c--------------------------------------------------------------------- -c The following include file is generated automatically by the -c "setparams" utility. It defines -c maxcells: the square root of the maximum number of processors -c problem_size: 12, 64, 102, 162 (for class T, A, B, C) -c dt_default: default time step for this problem size if no -c config file -c niter_default: default number of iterations for this problem size -c--------------------------------------------------------------------- - - include 'npbparams.h' - - integer ncells, grid_points(3) - common /global/ ncells, grid_points - - double precision tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, - > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, - > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, - > ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2, - > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1, - > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4, - > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1, - > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, - > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, - > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, - > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt, - > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, - > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, - > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16 - - common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, - > dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, - > dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, - > ce, dxmax, dymax, dzmax, xxcon1, xxcon2, - > xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1, - > dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4, - > yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1, - > zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, - > dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, - > dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, - > c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt, - > dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, - > c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, - > c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16 - - integer EAST, WEST, NORTH, SOUTH, - > BOTTOM, TOP - - parameter (EAST=2000, WEST=3000, NORTH=4000, SOUTH=5000, - > BOTTOM=6000, TOP=7000) - - integer cell_coord (3,maxcells), cell_low (3,maxcells), - > cell_high (3,maxcells), cell_size(3,maxcells), - > predecessor(3), slice (3,maxcells), - > grid_size (3), successor(3), - > start (3,maxcells), end (3,maxcells) - common /partition/ cell_coord, cell_low, cell_high, cell_size, - > grid_size, successor, predecessor, slice, - > start, end - - integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE, IMAXP, JMAXP - - parameter (MAX_CELL_DIM = (problem_size/maxcells)+1) - - parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM) - parameter (IMAXP=IMAX/2*2+1,JMAXP=JMAX/2*2+1) - -c--------------------------------------------------------------------- -c +1 at end to avoid zero length arrays for 1 node -c--------------------------------------------------------------------- - parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60*2+1) - - double precision - > u (-2:IMAXP+1,-2:JMAXP+1,-2:KMAX+1, 5,maxcells), - > us (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > vs (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > ws (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > qs (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > ainv (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > rho_i (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > speed (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > square (-1:IMAX, -1:JMAX, -1:KMAX, maxcells), - > rhs ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells), - > forcing ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1, 5,maxcells), - > lhs ( 0:IMAXP-1, 0:JMAXP-1, 0:KMAX-1,15,maxcells), - > in_buffer(BUF_SIZE), out_buffer(BUF_SIZE) - common /fields/ u, us, vs, ws, qs, ainv, rho_i, speed, square, - > rhs, forcing, lhs, in_buffer, out_buffer - - double precision cv(-2:MAX_CELL_DIM+1), rhon(-2:MAX_CELL_DIM+1), - > rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1), - > cuf(-2:MAX_CELL_DIM+1), q(-2:MAX_CELL_DIM+1), - > ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5) - common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf - - integer west_size, east_size, bottom_size, top_size, - > north_size, south_size, start_send_west, - > start_send_east, start_send_south, start_send_north, - > start_send_bottom, start_send_top, start_recv_west, - > start_recv_east, start_recv_south, start_recv_north, - > start_recv_bottom, start_recv_top - common /box/ west_size, east_size, bottom_size, - > top_size, north_size, south_size, - > start_send_west, start_send_east, start_send_south, - > start_send_north, start_send_bottom, start_send_top, - > start_recv_west, start_recv_east, start_recv_south, - > start_recv_north, start_recv_bottom, start_recv_top diff --git a/examples/smpi/NAS/SP/initialize.f b/examples/smpi/NAS/SP/initialize.f deleted file mode 100644 index 655c8d9369..0000000000 --- a/examples/smpi/NAS/SP/initialize.f +++ /dev/null @@ -1,286 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine initialize - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c This subroutine initializes the field variable u using -c tri-linear transfinite interpolation of the boundary values -c--------------------------------------------------------------------- - - include 'header.h' - - integer c, i, j, k, m, ii, jj, kk, ix, iy, iz - double precision xi, eta, zeta, Pface(5,3,2), Pxi, Peta, - > Pzeta, temp(5) - - -c--------------------------------------------------------------------- -c Later (in compute_rhs) we compute 1/u for every element. A few of -c the corner elements are not used, but it convenient (and faster) -c to compute the whole thing with a simple loop. Make sure those -c values are nonzero by initializing the whole thing here. -c--------------------------------------------------------------------- - do c = 1, ncells - do kk = -1, IMAX - do jj = -1, IMAX - do ii = -1, IMAX - u(ii, jj, kk, 1, c) = 1.0 - u(ii, jj, kk, 2, c) = 0.0 - u(ii, jj, kk, 3, c) = 0.0 - u(ii, jj, kk, 4, c) = 0.0 - u(ii, jj, kk, 5, c) = 1.0 - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c first store the "interpolated" values everywhere on the grid -c--------------------------------------------------------------------- - do c=1, ncells - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - ii = 0 - do i = cell_low(1,c), cell_high(1,c) - xi = dble(i) * dnxm1 - - do ix = 1, 2 - call exact_solution(dble(ix-1), eta, zeta, - > Pface(1,1,ix)) - end do - - do iy = 1, 2 - call exact_solution(xi, dble(iy-1) , zeta, - > Pface(1,2,iy)) - end do - - do iz = 1, 2 - call exact_solution(xi, eta, dble(iz-1), - > Pface(1,3,iz)) - end do - - do m = 1, 5 - Pxi = xi * Pface(m,1,2) + - > (1.0d0-xi) * Pface(m,1,1) - Peta = eta * Pface(m,2,2) + - > (1.0d0-eta) * Pface(m,2,1) - Pzeta = zeta * Pface(m,3,2) + - > (1.0d0-zeta) * Pface(m,3,1) - - u(ii,jj,kk,m,c) = Pxi + Peta + Pzeta - - > Pxi*Peta - Pxi*Pzeta - Peta*Pzeta + - > Pxi*Peta*Pzeta - - end do - ii = ii + 1 - end do - jj = jj + 1 - end do - kk = kk+1 - end do - end do - -c--------------------------------------------------------------------- -c now store the exact values on the boundaries -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c west face -c--------------------------------------------------------------------- - c = slice(1,1) - ii = 0 - xi = 0.0d0 - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(ii,jj,kk,m,c) = temp(m) - end do - jj = jj + 1 - end do - kk = kk + 1 - end do - -c--------------------------------------------------------------------- -c east face -c--------------------------------------------------------------------- - c = slice(1,ncells) - ii = cell_size(1,c)-1 - xi = 1.0d0 - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(ii,jj,kk,m,c) = temp(m) - end do - jj = jj + 1 - end do - kk = kk + 1 - end do - -c--------------------------------------------------------------------- -c south face -c--------------------------------------------------------------------- - c = slice(2,1) - jj = 0 - eta = 0.0d0 - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - ii = 0 - do i = cell_low(1,c), cell_high(1,c) - xi = dble(i) * dnxm1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(ii,jj,kk,m,c) = temp(m) - end do - ii = ii + 1 - end do - kk = kk + 1 - end do - - -c--------------------------------------------------------------------- -c north face -c--------------------------------------------------------------------- - c = slice(2,ncells) - jj = cell_size(2,c)-1 - eta = 1.0d0 - kk = 0 - do k = cell_low(3,c), cell_high(3,c) - zeta = dble(k) * dnzm1 - ii = 0 - do i = cell_low(1,c), cell_high(1,c) - xi = dble(i) * dnxm1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(ii,jj,kk,m,c) = temp(m) - end do - ii = ii + 1 - end do - kk = kk + 1 - end do - -c--------------------------------------------------------------------- -c bottom face -c--------------------------------------------------------------------- - c = slice(3,1) - kk = 0 - zeta = 0.0d0 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - ii = 0 - do i =cell_low(1,c), cell_high(1,c) - xi = dble(i) *dnxm1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(ii,jj,kk,m,c) = temp(m) - end do - ii = ii + 1 - end do - jj = jj + 1 - end do - -c--------------------------------------------------------------------- -c top face -c--------------------------------------------------------------------- - c = slice(3,ncells) - kk = cell_size(3,c)-1 - zeta = 1.0d0 - jj = 0 - do j = cell_low(2,c), cell_high(2,c) - eta = dble(j) * dnym1 - ii = 0 - do i =cell_low(1,c), cell_high(1,c) - xi = dble(i) * dnxm1 - call exact_solution(xi, eta, zeta, temp) - do m = 1, 5 - u(ii,jj,kk,m,c) = temp(m) - end do - ii = ii + 1 - end do - jj = jj + 1 - end do - - return - end - - - subroutine lhsinit - - include 'header.h' - - integer i, j, k, d, c, n - -c--------------------------------------------------------------------- -c loop over all cells -c--------------------------------------------------------------------- - do c = 1, ncells - -c--------------------------------------------------------------------- -c first, initialize the start and end arrays -c--------------------------------------------------------------------- - do d = 1, 3 - if (cell_coord(d,c) .eq. 1) then - start(d,c) = 1 - else - start(d,c) = 0 - endif - if (cell_coord(d,c) .eq. ncells) then - end(d,c) = 1 - else - end(d,c) = 0 - endif - end do - -c--------------------------------------------------------------------- -c zap the whole left hand side for starters -c--------------------------------------------------------------------- - do n = 1, 15 - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - lhs(i,j,k,n,c) = 0.0d0 - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c next, set all diagonal values to 1. This is overkill, but convenient -c--------------------------------------------------------------------- - do n = 1, 3 - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - lhs(i,j,k,5*n-2,c) = 1.0d0 - end do - end do - end do - end do - - end do - - return - end - - - diff --git a/examples/smpi/NAS/SP/inputsp.data.sample b/examples/smpi/NAS/SP/inputsp.data.sample deleted file mode 100644 index ae3801fdb7..0000000000 --- a/examples/smpi/NAS/SP/inputsp.data.sample +++ /dev/null @@ -1,3 +0,0 @@ -400 number of time steps -0.0015d0 dt for class A = 0.0015d0. class B = 0.001d0 class C = 0.00067d0 -64 64 64 diff --git a/examples/smpi/NAS/SP/lhsx.f b/examples/smpi/NAS/SP/lhsx.f deleted file mode 100644 index cae7779122..0000000000 --- a/examples/smpi/NAS/SP/lhsx.f +++ /dev/null @@ -1,124 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine lhsx(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c This function computes the left hand side for the three x-factors -c--------------------------------------------------------------------- - - include 'header.h' - - double precision ru1 - integer i, j, k, c - - -c--------------------------------------------------------------------- -c treat only cell c -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c first fill the lhs for the u-eigenvalue -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c)-1, cell_size(1,c)-end(1,c) - ru1 = c3c4*rho_i(i,j,k,c) - cv(i) = us(i,j,k,c) - rhon(i) = dmax1(dx2+con43*ru1, - > dx5+c1c5*ru1, - > dxmax+ru1, - > dx1) - end do - - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - lhs(i,j,k,1,c) = 0.0d0 - lhs(i,j,k,2,c) = - dttx2 * cv(i-1) - dttx1 * rhon(i-1) - lhs(i,j,k,3,c) = 1.0d0 + c2dttx1 * rhon(i) - lhs(i,j,k,4,c) = dttx2 * cv(i+1) - dttx1 * rhon(i+1) - lhs(i,j,k,5,c) = 0.0d0 - end do - end do - end do - -c--------------------------------------------------------------------- -c add fourth order dissipation -c--------------------------------------------------------------------- - if (start(1,c) .gt. 0) then - i = 1 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5 - lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 - lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 - - lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4 - lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz6 - lhs(i+1,j,k,4,c) = lhs(i+1,j,k,4,c) - comz4 - lhs(i+1,j,k,5,c) = lhs(i+1,j,k,5,c) + comz1 - end do - end do - endif - - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i=3*start(1,c), cell_size(1,c)-3*end(1,c)-1 - lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 - lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 - lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 - lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 - lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 - end do - end do - end do - - if (end(1,c) .gt. 0) then - i = cell_size(1,c)-3 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 - lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 - lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 - lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 - - lhs(i+1,j,k,1,c) = lhs(i+1,j,k,1,c) + comz1 - lhs(i+1,j,k,2,c) = lhs(i+1,j,k,2,c) - comz4 - lhs(i+1,j,k,3,c) = lhs(i+1,j,k,3,c) + comz5 - end do - end do - endif - -c--------------------------------------------------------------------- -c subsequently, fill the other factors (u+c), (u-c) by a4ing to -c the first -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c) - lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) - - > dttx2 * speed(i-1,j,k,c) - lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c) - lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) + - > dttx2 * speed(i+1,j,k,c) - lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c) - lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c) - lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + - > dttx2 * speed(i-1,j,k,c) - lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c) - lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - - > dttx2 * speed(i+1,j,k,c) - lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c) - end do - end do - end do - - return - end - - - diff --git a/examples/smpi/NAS/SP/lhsy.f b/examples/smpi/NAS/SP/lhsy.f deleted file mode 100644 index 9c07a35538..0000000000 --- a/examples/smpi/NAS/SP/lhsy.f +++ /dev/null @@ -1,125 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine lhsy(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c This function computes the left hand side for the three y-factors -c--------------------------------------------------------------------- - - include 'header.h' - - double precision ru1 - integer i, j, k, c - -c--------------------------------------------------------------------- -c treat only cell c -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c first fill the lhs for the u-eigenvalue -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - - do j = start(2,c)-1, cell_size(2,c)-end(2,c) - ru1 = c3c4*rho_i(i,j,k,c) - cv(j) = vs(i,j,k,c) - rhoq(j) = dmax1( dy3 + con43 * ru1, - > dy5 + c1c5*ru1, - > dymax + ru1, - > dy1) - end do - - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - lhs(i,j,k,1,c) = 0.0d0 - lhs(i,j,k,2,c) = -dtty2 * cv(j-1) - dtty1 * rhoq(j-1) - lhs(i,j,k,3,c) = 1.0 + c2dtty1 * rhoq(j) - lhs(i,j,k,4,c) = dtty2 * cv(j+1) - dtty1 * rhoq(j+1) - lhs(i,j,k,5,c) = 0.0d0 - end do - end do - end do - -c--------------------------------------------------------------------- -c add fourth order dissipation -c--------------------------------------------------------------------- - if (start(2,c) .gt. 0) then - j = 1 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - - lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5 - lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 - lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 - - lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4 - lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz6 - lhs(i,j+1,k,4,c) = lhs(i,j+1,k,4,c) - comz4 - lhs(i,j+1,k,5,c) = lhs(i,j+1,k,5,c) + comz1 - end do - end do - endif - - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j=3*start(2,c), cell_size(2,c)-3*end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - - lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 - lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 - lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 - lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 - lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 - end do - end do - end do - - if (end(2,c) .gt. 0) then - j = cell_size(2,c)-3 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 - lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 - lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 - lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 - - lhs(i,j+1,k,1,c) = lhs(i,j+1,k,1,c) + comz1 - lhs(i,j+1,k,2,c) = lhs(i,j+1,k,2,c) - comz4 - lhs(i,j+1,k,3,c) = lhs(i,j+1,k,3,c) + comz5 - end do - end do - endif - -c--------------------------------------------------------------------- -c subsequently, do the other two factors -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c) - lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) - - > dtty2 * speed(i,j-1,k,c) - lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c) - lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) + - > dtty2 * speed(i,j+1,k,c) - lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c) - lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c) - lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + - > dtty2 * speed(i,j-1,k,c) - lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c) - lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - - > dtty2 * speed(i,j+1,k,c) - lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c) - end do - end do - end do - - return - end - - - diff --git a/examples/smpi/NAS/SP/lhsz.f b/examples/smpi/NAS/SP/lhsz.f deleted file mode 100644 index 08ea0bc24d..0000000000 --- a/examples/smpi/NAS/SP/lhsz.f +++ /dev/null @@ -1,123 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine lhsz(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c This function computes the left hand side for the three z-factors -c--------------------------------------------------------------------- - - include 'header.h' - - double precision ru1 - integer i, j, k, c - -c--------------------------------------------------------------------- -c treat only cell c -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c first fill the lhs for the u-eigenvalue -c--------------------------------------------------------------------- - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - - do k = start(3,c)-1, cell_size(3,c)-end(3,c) - ru1 = c3c4*rho_i(i,j,k,c) - cv(k) = ws(i,j,k,c) - rhos(k) = dmax1(dz4 + con43 * ru1, - > dz5 + c1c5 * ru1, - > dzmax + ru1, - > dz1) - end do - - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - lhs(i,j,k,1,c) = 0.0d0 - lhs(i,j,k,2,c) = -dttz2 * cv(k-1) - dttz1 * rhos(k-1) - lhs(i,j,k,3,c) = 1.0 + c2dttz1 * rhos(k) - lhs(i,j,k,4,c) = dttz2 * cv(k+1) - dttz1 * rhos(k+1) - lhs(i,j,k,5,c) = 0.0d0 - end do - end do - end do - -c--------------------------------------------------------------------- -c add fourth order dissipation -c--------------------------------------------------------------------- - if (start(3,c) .gt. 0) then - k = 1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz5 - lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 - lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 - - lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4 - lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz6 - lhs(i,j,k+1,4,c) = lhs(i,j,k+1,4,c) - comz4 - lhs(i,j,k+1,5,c) = lhs(i,j,k+1,5,c) + comz1 - end do - end do - endif - - do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 - lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 - lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 - lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 - lhs(i,j,k,5,c) = lhs(i,j,k,5,c) + comz1 - end do - end do - end do - - if (end(3,c) .gt. 0) then - k = cell_size(3,c)-3 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - lhs(i,j,k,1,c) = lhs(i,j,k,1,c) + comz1 - lhs(i,j,k,2,c) = lhs(i,j,k,2,c) - comz4 - lhs(i,j,k,3,c) = lhs(i,j,k,3,c) + comz6 - lhs(i,j,k,4,c) = lhs(i,j,k,4,c) - comz4 - - lhs(i,j,k+1,1,c) = lhs(i,j,k+1,1,c) + comz1 - lhs(i,j,k+1,2,c) = lhs(i,j,k+1,2,c) - comz4 - lhs(i,j,k+1,3,c) = lhs(i,j,k+1,3,c) + comz5 - end do - end do - endif - - -c--------------------------------------------------------------------- -c subsequently, fill the other factors (u+c), (u-c) -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - lhs(i,j,k,1+5,c) = lhs(i,j,k,1,c) - lhs(i,j,k,2+5,c) = lhs(i,j,k,2,c) - - > dttz2 * speed(i,j,k-1,c) - lhs(i,j,k,3+5,c) = lhs(i,j,k,3,c) - lhs(i,j,k,4+5,c) = lhs(i,j,k,4,c) + - > dttz2 * speed(i,j,k+1,c) - lhs(i,j,k,5+5,c) = lhs(i,j,k,5,c) - lhs(i,j,k,1+10,c) = lhs(i,j,k,1,c) - lhs(i,j,k,2+10,c) = lhs(i,j,k,2,c) + - > dttz2 * speed(i,j,k-1,c) - lhs(i,j,k,3+10,c) = lhs(i,j,k,3,c) - lhs(i,j,k,4+10,c) = lhs(i,j,k,4,c) - - > dttz2 * speed(i,j,k+1,c) - lhs(i,j,k,5+10,c) = lhs(i,j,k,5,c) - end do - end do - end do - - return - end - - diff --git a/examples/smpi/NAS/SP/make_set.f b/examples/smpi/NAS/SP/make_set.f deleted file mode 100644 index 7a84e93010..0000000000 --- a/examples/smpi/NAS/SP/make_set.f +++ /dev/null @@ -1,120 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine make_set - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c This function allocates space for a set of cells and fills the set -c such that communication between cells on different nodes is only -c nearest neighbor -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer p, i, j, c, dir, size, excess, ierr,ierrcode - -c--------------------------------------------------------------------- -c compute square root; add small number to allow for roundoff -c (note: this is computed in setup_mpi.f also, but prefer to do -c it twice because of some include file problems). -c--------------------------------------------------------------------- - ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0)) - -c--------------------------------------------------------------------- -c this makes coding easier -c--------------------------------------------------------------------- - p = ncells - -c--------------------------------------------------------------------- -c determine the location of the cell at the bottom of the 3D -c array of cells -c--------------------------------------------------------------------- - cell_coord(1,1) = mod(node,p) - cell_coord(2,1) = node/p - cell_coord(3,1) = 0 - -c--------------------------------------------------------------------- -c set the cell_coords for cells in the rest of the z-layers; -c this comes down to a simple linear numbering in the z-direct- -c ion, and to the doubly-cyclic numbering in the other dirs -c--------------------------------------------------------------------- - do c=2, p - cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p) - cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p) - cell_coord(3,c) = c-1 - end do - -c--------------------------------------------------------------------- -c offset all the coordinates by 1 to adjust for Fortran arrays -c--------------------------------------------------------------------- - do dir = 1, 3 - do c = 1, p - cell_coord(dir,c) = cell_coord(dir,c) + 1 - end do - end do - -c--------------------------------------------------------------------- -c slice(dir,n) contains the sequence number of the cell that is in -c coordinate plane n in the dir direction -c--------------------------------------------------------------------- - do dir = 1, 3 - do c = 1, p - slice(dir,cell_coord(dir,c)) = c - end do - end do - - -c--------------------------------------------------------------------- -c fill the predecessor and successor entries, using the indices -c of the bottom cells (they are the same at each level of k -c anyway) acting as if full periodicity pertains; note that p is -c added to those arguments to the mod functions that might -c otherwise return wrong values when using the modulo function -c--------------------------------------------------------------------- - i = cell_coord(1,1)-1 - j = cell_coord(2,1)-1 - - predecessor(1) = mod(i-1+p,p) + p*j - predecessor(2) = i + p*mod(j-1+p,p) - predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p) - successor(1) = mod(i+1,p) + p*j - successor(2) = i + p*mod(j+1,p) - successor(3) = mod(i-1+p,p) + p*mod(j+1,p) - -c--------------------------------------------------------------------- -c now compute the sizes of the cells -c--------------------------------------------------------------------- - do dir= 1, 3 -c--------------------------------------------------------------------- -c set cell_coord range for each direction -c--------------------------------------------------------------------- - size = grid_points(dir)/p - excess = mod(grid_points(dir),p) - do c=1, ncells - if (cell_coord(dir,c) .le. excess) then - cell_size(dir,c) = size+1 - cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1) - cell_high(dir,c) = cell_low(dir,c)+size - else - cell_size(dir,c) = size - cell_low(dir,c) = excess*(size+1)+ - > (cell_coord(dir,c)-excess-1)*size - cell_high(dir,c) = cell_low(dir,c)+size-1 - endif - if (cell_size(dir, c) .le. 2) then - write(*,50) - 50 format(' Error: Cell size too small. Min size is 3') - call MPI_Abort(mpi_comm_world,ierrcode,ierr) - stop - endif - end do - end do - - return - end - diff --git a/examples/smpi/NAS/SP/mpinpb.h b/examples/smpi/NAS/SP/mpinpb.h deleted file mode 100644 index 439db34f60..0000000000 --- a/examples/smpi/NAS/SP/mpinpb.h +++ /dev/null @@ -1,13 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'mpif.h' - - integer node, no_nodes, total_nodes, root, comm_setup, - > comm_solve, comm_rhs, dp_type - logical active - common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup, - > comm_solve, comm_rhs, dp_type, active - integer DEFAULT_TAG - parameter (DEFAULT_TAG = 0) diff --git a/examples/smpi/NAS/SP/ninvr.f b/examples/smpi/NAS/SP/ninvr.f deleted file mode 100644 index 146d046e8b..0000000000 --- a/examples/smpi/NAS/SP/ninvr.f +++ /dev/null @@ -1,45 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine ninvr(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c block-diagonal matrix-vector multiplication -c--------------------------------------------------------------------- - - include 'header.h' - - integer c, i, j, k - double precision r1, r2, r3, r4, r5, t1, t2 - -c--------------------------------------------------------------------- -c treat only one cell -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - - r1 = rhs(i,j,k,1,c) - r2 = rhs(i,j,k,2,c) - r3 = rhs(i,j,k,3,c) - r4 = rhs(i,j,k,4,c) - r5 = rhs(i,j,k,5,c) - - t1 = bt * r3 - t2 = 0.5d0 * ( r4 + r5 ) - - rhs(i,j,k,1,c) = -r2 - rhs(i,j,k,2,c) = r1 - rhs(i,j,k,3,c) = bt * ( r4 - r5 ) - rhs(i,j,k,4,c) = -t1 + t2 - rhs(i,j,k,5,c) = t1 + t2 - enddo - enddo - enddo - - return - end diff --git a/examples/smpi/NAS/SP/pinvr.f b/examples/smpi/NAS/SP/pinvr.f deleted file mode 100644 index 060f0a57ef..0000000000 --- a/examples/smpi/NAS/SP/pinvr.f +++ /dev/null @@ -1,48 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine pinvr(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c block-diagonal matrix-vector multiplication -c--------------------------------------------------------------------- - - include 'header.h' - - integer i, j, k, c - double precision r1, r2, r3, r4, r5, t1, t2 - -c--------------------------------------------------------------------- -c treat only one cell -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - - r1 = rhs(i,j,k,1,c) - r2 = rhs(i,j,k,2,c) - r3 = rhs(i,j,k,3,c) - r4 = rhs(i,j,k,4,c) - r5 = rhs(i,j,k,5,c) - - t1 = bt * r1 - t2 = 0.5d0 * ( r4 + r5 ) - - rhs(i,j,k,1,c) = bt * ( r4 - r5 ) - rhs(i,j,k,2,c) = -r3 - rhs(i,j,k,3,c) = r2 - rhs(i,j,k,4,c) = -t1 + t2 - rhs(i,j,k,5,c) = t1 + t2 - end do - end do - end do - - return - end - - - diff --git a/examples/smpi/NAS/SP/rhs.f b/examples/smpi/NAS/SP/rhs.f deleted file mode 100644 index 34e562a4e3..0000000000 --- a/examples/smpi/NAS/SP/rhs.f +++ /dev/null @@ -1,446 +0,0 @@ -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine compute_rhs - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - - integer c, i, j, k, m - double precision aux, rho_inv, uijk, up1, um1, vijk, vp1, vm1, - > wijk, wp1, wm1 - - -c--------------------------------------------------------------------- -c loop over all cells owned by this node -c--------------------------------------------------------------------- - do c = 1, ncells - -c--------------------------------------------------------------------- -c compute the reciprocal of density, and the kinetic energy, -c and the speed of sound. -c--------------------------------------------------------------------- - - do k = -1, cell_size(3,c) - do j = -1, cell_size(2,c) - do i = -1, cell_size(1,c) - rho_inv = 1.0d0/u(i,j,k,1,c) - rho_i(i,j,k,c) = rho_inv - us(i,j,k,c) = u(i,j,k,2,c) * rho_inv - vs(i,j,k,c) = u(i,j,k,3,c) * rho_inv - ws(i,j,k,c) = u(i,j,k,4,c) * rho_inv - square(i,j,k,c) = 0.5d0* ( - > u(i,j,k,2,c)*u(i,j,k,2,c) + - > u(i,j,k,3,c)*u(i,j,k,3,c) + - > u(i,j,k,4,c)*u(i,j,k,4,c) ) * rho_inv - qs(i,j,k,c) = square(i,j,k,c) * rho_inv -c--------------------------------------------------------------------- -c (don't need speed and ainx until the lhs computation) -c--------------------------------------------------------------------- - aux = c1c2*rho_inv* (u(i,j,k,5,c) - square(i,j,k,c)) - aux = dsqrt(aux) - speed(i,j,k,c) = aux - ainv(i,j,k,c) = 1.0d0/aux - end do - end do - end do - -c--------------------------------------------------------------------- -c copy the exact forcing term to the right hand side; because -c this forcing term is known, we can store it on the whole of every -c cell, including the boundary -c--------------------------------------------------------------------- - - do m = 1, 5 - do k = 0, cell_size(3,c)-1 - do j = 0, cell_size(2,c)-1 - do i = 0, cell_size(1,c)-1 - rhs(i,j,k,m,c) = forcing(i,j,k,m,c) - end do - end do - end do - end do - - -c--------------------------------------------------------------------- -c compute xi-direction fluxes -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - uijk = us(i,j,k,c) - up1 = us(i+1,j,k,c) - um1 = us(i-1,j,k,c) - - rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dx1tx1 * - > (u(i+1,j,k,1,c) - 2.0d0*u(i,j,k,1,c) + - > u(i-1,j,k,1,c)) - - > tx2 * (u(i+1,j,k,2,c) - u(i-1,j,k,2,c)) - - rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dx2tx1 * - > (u(i+1,j,k,2,c) - 2.0d0*u(i,j,k,2,c) + - > u(i-1,j,k,2,c)) + - > xxcon2*con43 * (up1 - 2.0d0*uijk + um1) - - > tx2 * (u(i+1,j,k,2,c)*up1 - - > u(i-1,j,k,2,c)*um1 + - > (u(i+1,j,k,5,c)- square(i+1,j,k,c)- - > u(i-1,j,k,5,c)+ square(i-1,j,k,c))* - > c2) - - rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dx3tx1 * - > (u(i+1,j,k,3,c) - 2.0d0*u(i,j,k,3,c) + - > u(i-1,j,k,3,c)) + - > xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) + - > vs(i-1,j,k,c)) - - > tx2 * (u(i+1,j,k,3,c)*up1 - - > u(i-1,j,k,3,c)*um1) - - rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dx4tx1 * - > (u(i+1,j,k,4,c) - 2.0d0*u(i,j,k,4,c) + - > u(i-1,j,k,4,c)) + - > xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) + - > ws(i-1,j,k,c)) - - > tx2 * (u(i+1,j,k,4,c)*up1 - - > u(i-1,j,k,4,c)*um1) - - rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dx5tx1 * - > (u(i+1,j,k,5,c) - 2.0d0*u(i,j,k,5,c) + - > u(i-1,j,k,5,c)) + - > xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) + - > qs(i-1,j,k,c)) + - > xxcon4 * (up1*up1 - 2.0d0*uijk*uijk + - > um1*um1) + - > xxcon5 * (u(i+1,j,k,5,c)*rho_i(i+1,j,k,c) - - > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) + - > u(i-1,j,k,5,c)*rho_i(i-1,j,k,c)) - - > tx2 * ( (c1*u(i+1,j,k,5,c) - - > c2*square(i+1,j,k,c))*up1 - - > (c1*u(i-1,j,k,5,c) - - > c2*square(i-1,j,k,c))*um1 ) - end do - end do - end do - -c--------------------------------------------------------------------- -c add fourth order xi-direction dissipation -c--------------------------------------------------------------------- - if (start(1,c) .gt. 0) then - i = 1 - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * - > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) + - > u(i+2,j,k,m,c)) - end do - end do - end do - - i = 2 - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > (-4.0d0*u(i-1,j,k,m,c) + 6.0d0*u(i,j,k,m,c) - - > 4.0d0*u(i+1,j,k,m,c) + u(i+2,j,k,m,c)) - end do - end do - end do - endif - - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > ( u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) + - > 6.0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) + - > u(i+2,j,k,m,c) ) - end do - end do - end do - end do - - - if (end(1,c) .gt. 0) then - i = cell_size(1,c)-3 - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > ( u(i-2,j,k,m,c) - 4.0d0*u(i-1,j,k,m,c) + - > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i+1,j,k,m,c) ) - end do - end do - end do - - i = cell_size(1,c)-2 - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > ( u(i-2,j,k,m,c) - 4.d0*u(i-1,j,k,m,c) + - > 5.d0*u(i,j,k,m,c) ) - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c compute eta-direction fluxes -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - vijk = vs(i,j,k,c) - vp1 = vs(i,j+1,k,c) - vm1 = vs(i,j-1,k,c) - rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dy1ty1 * - > (u(i,j+1,k,1,c) - 2.0d0*u(i,j,k,1,c) + - > u(i,j-1,k,1,c)) - - > ty2 * (u(i,j+1,k,3,c) - u(i,j-1,k,3,c)) - rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dy2ty1 * - > (u(i,j+1,k,2,c) - 2.0d0*u(i,j,k,2,c) + - > u(i,j-1,k,2,c)) + - > yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) + - > us(i,j-1,k,c)) - - > ty2 * (u(i,j+1,k,2,c)*vp1 - - > u(i,j-1,k,2,c)*vm1) - rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dy3ty1 * - > (u(i,j+1,k,3,c) - 2.0d0*u(i,j,k,3,c) + - > u(i,j-1,k,3,c)) + - > yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) - - > ty2 * (u(i,j+1,k,3,c)*vp1 - - > u(i,j-1,k,3,c)*vm1 + - > (u(i,j+1,k,5,c) - square(i,j+1,k,c) - - > u(i,j-1,k,5,c) + square(i,j-1,k,c)) - > *c2) - rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dy4ty1 * - > (u(i,j+1,k,4,c) - 2.0d0*u(i,j,k,4,c) + - > u(i,j-1,k,4,c)) + - > yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) + - > ws(i,j-1,k,c)) - - > ty2 * (u(i,j+1,k,4,c)*vp1 - - > u(i,j-1,k,4,c)*vm1) - rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dy5ty1 * - > (u(i,j+1,k,5,c) - 2.0d0*u(i,j,k,5,c) + - > u(i,j-1,k,5,c)) + - > yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) + - > qs(i,j-1,k,c)) + - > yycon4 * (vp1*vp1 - 2.0d0*vijk*vijk + - > vm1*vm1) + - > yycon5 * (u(i,j+1,k,5,c)*rho_i(i,j+1,k,c) - - > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) + - > u(i,j-1,k,5,c)*rho_i(i,j-1,k,c)) - - > ty2 * ((c1*u(i,j+1,k,5,c) - - > c2*square(i,j+1,k,c)) * vp1 - - > (c1*u(i,j-1,k,5,c) - - > c2*square(i,j-1,k,c)) * vm1) - end do - end do - end do - -c--------------------------------------------------------------------- -c add fourth order eta-direction dissipation -c--------------------------------------------------------------------- - if (start(2,c) .gt. 0) then - j = 1 - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * - > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) + - > u(i,j+2,k,m,c)) - end do - end do - end do - - j = 2 - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > (-4.0d0*u(i,j-1,k,m,c) + 6.0d0*u(i,j,k,m,c) - - > 4.0d0*u(i,j+1,k,m,c) + u(i,j+2,k,m,c)) - end do - end do - end do - endif - - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1 - do i = start(1,c),cell_size(1,c)-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > ( u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) + - > 6.0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) + - > u(i,j+2,k,m,c) ) - end do - end do - end do - end do - - if (end(2,c) .gt. 0) then - j = cell_size(2,c)-3 - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > ( u(i,j-2,k,m,c) - 4.0d0*u(i,j-1,k,m,c) + - > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j+1,k,m,c) ) - end do - end do - end do - - j = cell_size(2,c)-2 - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > ( u(i,j-2,k,m,c) - 4.d0*u(i,j-1,k,m,c) + - > 5.d0*u(i,j,k,m,c) ) - end do - end do - end do - endif - - -c--------------------------------------------------------------------- -c compute zeta-direction fluxes -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - wijk = ws(i,j,k,c) - wp1 = ws(i,j,k+1,c) - wm1 = ws(i,j,k-1,c) - - rhs(i,j,k,1,c) = rhs(i,j,k,1,c) + dz1tz1 * - > (u(i,j,k+1,1,c) - 2.0d0*u(i,j,k,1,c) + - > u(i,j,k-1,1,c)) - - > tz2 * (u(i,j,k+1,4,c) - u(i,j,k-1,4,c)) - rhs(i,j,k,2,c) = rhs(i,j,k,2,c) + dz2tz1 * - > (u(i,j,k+1,2,c) - 2.0d0*u(i,j,k,2,c) + - > u(i,j,k-1,2,c)) + - > zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) + - > us(i,j,k-1,c)) - - > tz2 * (u(i,j,k+1,2,c)*wp1 - - > u(i,j,k-1,2,c)*wm1) - rhs(i,j,k,3,c) = rhs(i,j,k,3,c) + dz3tz1 * - > (u(i,j,k+1,3,c) - 2.0d0*u(i,j,k,3,c) + - > u(i,j,k-1,3,c)) + - > zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) + - > vs(i,j,k-1,c)) - - > tz2 * (u(i,j,k+1,3,c)*wp1 - - > u(i,j,k-1,3,c)*wm1) - rhs(i,j,k,4,c) = rhs(i,j,k,4,c) + dz4tz1 * - > (u(i,j,k+1,4,c) - 2.0d0*u(i,j,k,4,c) + - > u(i,j,k-1,4,c)) + - > zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) - - > tz2 * (u(i,j,k+1,4,c)*wp1 - - > u(i,j,k-1,4,c)*wm1 + - > (u(i,j,k+1,5,c) - square(i,j,k+1,c) - - > u(i,j,k-1,5,c) + square(i,j,k-1,c)) - > *c2) - rhs(i,j,k,5,c) = rhs(i,j,k,5,c) + dz5tz1 * - > (u(i,j,k+1,5,c) - 2.0d0*u(i,j,k,5,c) + - > u(i,j,k-1,5,c)) + - > zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) + - > qs(i,j,k-1,c)) + - > zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk + - > wm1*wm1) + - > zzcon5 * (u(i,j,k+1,5,c)*rho_i(i,j,k+1,c) - - > 2.0d0*u(i,j,k,5,c)*rho_i(i,j,k,c) + - > u(i,j,k-1,5,c)*rho_i(i,j,k-1,c)) - - > tz2 * ( (c1*u(i,j,k+1,5,c) - - > c2*square(i,j,k+1,c))*wp1 - - > (c1*u(i,j,k-1,5,c) - - > c2*square(i,j,k-1,c))*wm1) - end do - end do - end do - -c--------------------------------------------------------------------- -c add fourth order zeta-direction dissipation -c--------------------------------------------------------------------- - if (start(3,c) .gt. 0) then - k = 1 - do m = 1, 5 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c)- dssp * - > ( 5.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) + - > u(i,j,k+2,m,c)) - end do - end do - end do - - k = 2 - do m = 1, 5 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > (-4.0d0*u(i,j,k-1,m,c) + 6.0d0*u(i,j,k,m,c) - - > 4.0d0*u(i,j,k+1,m,c) + u(i,j,k+2,m,c)) - end do - end do - end do - endif - - do m = 1, 5 - do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c),cell_size(1,c)-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > ( u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) + - > 6.0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) + - > u(i,j,k+2,m,c) ) - end do - end do - end do - end do - - if (end(3,c) .gt. 0) then - k = cell_size(3,c)-3 - do m = 1, 5 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > ( u(i,j,k-2,m,c) - 4.0d0*u(i,j,k-1,m,c) + - > 6.0d0*u(i,j,k,m,c) - 4.0d0*u(i,j,k+1,m,c) ) - end do - end do - end do - - k = cell_size(3,c)-2 - do m = 1, 5 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - dssp * - > ( u(i,j,k-2,m,c) - 4.d0*u(i,j,k-1,m,c) + - > 5.d0*u(i,j,k,m,c) ) - end do - end do - end do - endif - - do m = 1, 5 - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) * dt - end do - end do - end do - end do - - end do - - return - end - - - - diff --git a/examples/smpi/NAS/SP/set_constants.f b/examples/smpi/NAS/SP/set_constants.f deleted file mode 100644 index 63ce72bb9b..0000000000 --- a/examples/smpi/NAS/SP/set_constants.f +++ /dev/null @@ -1,203 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine set_constants - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - include 'header.h' - - ce(1,1) = 2.0d0 - ce(1,2) = 0.0d0 - ce(1,3) = 0.0d0 - ce(1,4) = 4.0d0 - ce(1,5) = 5.0d0 - ce(1,6) = 3.0d0 - ce(1,7) = 0.5d0 - ce(1,8) = 0.02d0 - ce(1,9) = 0.01d0 - ce(1,10) = 0.03d0 - ce(1,11) = 0.5d0 - ce(1,12) = 0.4d0 - ce(1,13) = 0.3d0 - - ce(2,1) = 1.0d0 - ce(2,2) = 0.0d0 - ce(2,3) = 0.0d0 - ce(2,4) = 0.0d0 - ce(2,5) = 1.0d0 - ce(2,6) = 2.0d0 - ce(2,7) = 3.0d0 - ce(2,8) = 0.01d0 - ce(2,9) = 0.03d0 - ce(2,10) = 0.02d0 - ce(2,11) = 0.4d0 - ce(2,12) = 0.3d0 - ce(2,13) = 0.5d0 - - ce(3,1) = 2.0d0 - ce(3,2) = 2.0d0 - ce(3,3) = 0.0d0 - ce(3,4) = 0.0d0 - ce(3,5) = 0.0d0 - ce(3,6) = 2.0d0 - ce(3,7) = 3.0d0 - ce(3,8) = 0.04d0 - ce(3,9) = 0.03d0 - ce(3,10) = 0.05d0 - ce(3,11) = 0.3d0 - ce(3,12) = 0.5d0 - ce(3,13) = 0.4d0 - - ce(4,1) = 2.0d0 - ce(4,2) = 2.0d0 - ce(4,3) = 0.0d0 - ce(4,4) = 0.0d0 - ce(4,5) = 0.0d0 - ce(4,6) = 2.0d0 - ce(4,7) = 3.0d0 - ce(4,8) = 0.03d0 - ce(4,9) = 0.05d0 - ce(4,10) = 0.04d0 - ce(4,11) = 0.2d0 - ce(4,12) = 0.1d0 - ce(4,13) = 0.3d0 - - ce(5,1) = 5.0d0 - ce(5,2) = 4.0d0 - ce(5,3) = 3.0d0 - ce(5,4) = 2.0d0 - ce(5,5) = 0.1d0 - ce(5,6) = 0.4d0 - ce(5,7) = 0.3d0 - ce(5,8) = 0.05d0 - ce(5,9) = 0.04d0 - ce(5,10) = 0.03d0 - ce(5,11) = 0.1d0 - ce(5,12) = 0.3d0 - ce(5,13) = 0.2d0 - - c1 = 1.4d0 - c2 = 0.4d0 - c3 = 0.1d0 - c4 = 1.0d0 - c5 = 1.4d0 - - bt = dsqrt(0.5d0) - - dnxm1 = 1.0d0 / dble(grid_points(1)-1) - dnym1 = 1.0d0 / dble(grid_points(2)-1) - dnzm1 = 1.0d0 / dble(grid_points(3)-1) - - c1c2 = c1 * c2 - c1c5 = c1 * c5 - c3c4 = c3 * c4 - c1345 = c1c5 * c3c4 - - conz1 = (1.0d0-c1c5) - - tx1 = 1.0d0 / (dnxm1 * dnxm1) - tx2 = 1.0d0 / (2.0d0 * dnxm1) - tx3 = 1.0d0 / dnxm1 - - ty1 = 1.0d0 / (dnym1 * dnym1) - ty2 = 1.0d0 / (2.0d0 * dnym1) - ty3 = 1.0d0 / dnym1 - - tz1 = 1.0d0 / (dnzm1 * dnzm1) - tz2 = 1.0d0 / (2.0d0 * dnzm1) - tz3 = 1.0d0 / dnzm1 - - dx1 = 0.75d0 - dx2 = 0.75d0 - dx3 = 0.75d0 - dx4 = 0.75d0 - dx5 = 0.75d0 - - dy1 = 0.75d0 - dy2 = 0.75d0 - dy3 = 0.75d0 - dy4 = 0.75d0 - dy5 = 0.75d0 - - dz1 = 1.0d0 - dz2 = 1.0d0 - dz3 = 1.0d0 - dz4 = 1.0d0 - dz5 = 1.0d0 - - dxmax = dmax1(dx3, dx4) - dymax = dmax1(dy2, dy4) - dzmax = dmax1(dz2, dz3) - - dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) ) - - c4dssp = 4.0d0 * dssp - c5dssp = 5.0d0 * dssp - - dttx1 = dt*tx1 - dttx2 = dt*tx2 - dtty1 = dt*ty1 - dtty2 = dt*ty2 - dttz1 = dt*tz1 - dttz2 = dt*tz2 - - c2dttx1 = 2.0d0*dttx1 - c2dtty1 = 2.0d0*dtty1 - c2dttz1 = 2.0d0*dttz1 - - dtdssp = dt*dssp - - comz1 = dtdssp - comz4 = 4.0d0*dtdssp - comz5 = 5.0d0*dtdssp - comz6 = 6.0d0*dtdssp - - c3c4tx3 = c3c4*tx3 - c3c4ty3 = c3c4*ty3 - c3c4tz3 = c3c4*tz3 - - dx1tx1 = dx1*tx1 - dx2tx1 = dx2*tx1 - dx3tx1 = dx3*tx1 - dx4tx1 = dx4*tx1 - dx5tx1 = dx5*tx1 - - dy1ty1 = dy1*ty1 - dy2ty1 = dy2*ty1 - dy3ty1 = dy3*ty1 - dy4ty1 = dy4*ty1 - dy5ty1 = dy5*ty1 - - dz1tz1 = dz1*tz1 - dz2tz1 = dz2*tz1 - dz3tz1 = dz3*tz1 - dz4tz1 = dz4*tz1 - dz5tz1 = dz5*tz1 - - c2iv = 2.5d0 - con43 = 4.0d0/3.0d0 - con16 = 1.0d0/6.0d0 - - xxcon1 = c3c4tx3*con43*tx3 - xxcon2 = c3c4tx3*tx3 - xxcon3 = c3c4tx3*conz1*tx3 - xxcon4 = c3c4tx3*con16*tx3 - xxcon5 = c3c4tx3*c1c5*tx3 - - yycon1 = c3c4ty3*con43*ty3 - yycon2 = c3c4ty3*ty3 - yycon3 = c3c4ty3*conz1*ty3 - yycon4 = c3c4ty3*con16*ty3 - yycon5 = c3c4ty3*c1c5*ty3 - - zzcon1 = c3c4tz3*con43*tz3 - zzcon2 = c3c4tz3*tz3 - zzcon3 = c3c4tz3*conz1*tz3 - zzcon4 = c3c4tz3*con16*tz3 - zzcon5 = c3c4tz3*c1c5*tz3 - - return - end diff --git a/examples/smpi/NAS/SP/setup_mpi.f b/examples/smpi/NAS/SP/setup_mpi.f deleted file mode 100644 index 2d98f7dd02..0000000000 --- a/examples/smpi/NAS/SP/setup_mpi.f +++ /dev/null @@ -1,65 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine setup_mpi - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c set up MPI stuff -c--------------------------------------------------------------------- - - implicit none - include 'mpinpb.h' - include 'npbparams.h' - integer error, nc, color - - call mpi_init(error) - - call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error) - call mpi_comm_rank(MPI_COMM_WORLD, node, error) - - if (.not. convertdouble) then - dp_type = MPI_DOUBLE_PRECISION - else - dp_type = MPI_REAL - endif - -c--------------------------------------------------------------------- -c compute square root; add small number to allow for roundoff -c--------------------------------------------------------------------- - nc = dint(dsqrt(dble(total_nodes) + 0.00001d0)) - -c--------------------------------------------------------------------- -c We handle a non-square number of nodes by making the excess nodes -c inactive. However, we can never handle more cells than were compiled -c in. -c--------------------------------------------------------------------- - - if (nc .gt. maxcells) nc = maxcells - - if (node .ge. nc*nc) then - active = .false. - color = 1 - else - active = .true. - color = 0 - end if - - call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error) - if (.not. active) return - - call mpi_comm_size(comm_setup, no_nodes, error) - call mpi_comm_dup(comm_setup, comm_solve, error) - call mpi_comm_dup(comm_setup, comm_rhs, error) - -c--------------------------------------------------------------------- -c let node 0 be the root for the group (there is only one) -c--------------------------------------------------------------------- - root = 0 - - return - end - diff --git a/examples/smpi/NAS/SP/sp.f b/examples/smpi/NAS/SP/sp.f deleted file mode 100644 index 740cadee46..0000000000 --- a/examples/smpi/NAS/SP/sp.f +++ /dev/null @@ -1,194 +0,0 @@ -!-------------------------------------------------------------------------! -! ! -! N A S P A R A L L E L B E N C H M A R K S 3.3 ! -! ! -! S P ! -! ! -!-------------------------------------------------------------------------! -! ! -! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! -! It is described in NAS Technical Reports 95-020 and 02-007 ! -! ! -! Permission to use, copy, distribute and modify this software ! -! for any purpose with or without fee is hereby granted. We ! -! request, however, that all derived work reference the NAS ! -! Parallel Benchmarks 3.3. This software is provided "as is" ! -! without express or implied warranty. ! -! ! -! Information on NPB 3.3, including the technical report, the ! -! original specifications, source code, results and information ! -! on how to submit new results, is available at: ! -! ! -! http://www.nas.nasa.gov/Software/NPB/ ! -! ! -! Send comments or suggestions to npb@nas.nasa.gov ! -! ! -! NAS Parallel Benchmarks Group ! -! NASA Ames Research Center ! -! Mail Stop: T27A-1 ! -! Moffett Field, CA 94035-1000 ! -! ! -! E-mail: npb@nas.nasa.gov ! -! Fax: (650) 604-3957 ! -! ! -!-------------------------------------------------------------------------! - - -c--------------------------------------------------------------------- -c -c Authors: R. F. Van der Wijngaart -c W. Saphir -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- - program MPSP -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i, niter, step, c, error, fstatus - external timer_read - double precision mflops, t, tmax, timer_read - logical verified - character class - - call setup_mpi - if (.not. active) goto 999 - -c--------------------------------------------------------------------- -c Root node reads input file (if it exists) else takes -c defaults from parameters -c--------------------------------------------------------------------- - if (node .eq. root) then - - write(*, 1000) - open (unit=2,file='inputsp.data',status='old', iostat=fstatus) -c - if (fstatus .eq. 0) then - write(*,233) - 233 format(' Reading from input file inputsp.data') - read (2,*) niter - read (2,*) dt - read (2,*) grid_points(1), grid_points(2), grid_points(3) - close(2) - else - write(*,234) - niter = niter_default - dt = dt_default - grid_points(1) = problem_size - grid_points(2) = problem_size - grid_points(3) = problem_size - endif - 234 format(' No input file inputsp.data. Using compiled defaults') - - write(*, 1001) grid_points(1), grid_points(2), grid_points(3) - write(*, 1002) niter, dt - if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes - if (no_nodes .ne. maxcells*maxcells) - > write(*, 1005) maxcells*maxcells - write(*, 1003) no_nodes - - 1000 format(//,' NAS Parallel Benchmarks 3.3 -- SP Benchmark',/) - 1001 format(' Size: ', i4, 'x', i4, 'x', i4) - 1002 format(' Iterations: ', i4, ' dt: ', F11.7) - 1004 format(' Total number of processes: ', i5) - 1005 format(' WARNING: compiled for ', i5, ' processes ') - 1003 format(' Number of active processes: ', i5, /) - - endif - - call mpi_bcast(niter, 1, MPI_INTEGER, - > root, comm_setup, error) - - call mpi_bcast(dt, 1, dp_type, - > root, comm_setup, error) - - call mpi_bcast(grid_points(1), 3, MPI_INTEGER, - > root, comm_setup, error) - - - call make_set - - do c = 1, ncells - if ( (cell_size(1,c) .gt. IMAX) .or. - > (cell_size(2,c) .gt. JMAX) .or. - > (cell_size(3,c) .gt. KMAX) ) then - print *,node, c, (cell_size(i,c),i=1,3) - print *,' Problem size too big for compiled array sizes' - goto 999 - endif - end do - - call set_constants - - call initialize - -c call mpi_finalize(error) -c stop - - call lhsinit - - call exact_rhs - - call compute_buffer_size(5) - -c--------------------------------------------------------------------- -c do one time step to touch all code, and reinitialize -c--------------------------------------------------------------------- - call adi - call initialize - -c--------------------------------------------------------------------- -c Synchronize before placing time stamp -c--------------------------------------------------------------------- - call mpi_barrier(comm_setup, error) - - call timer_clear(1) - call timer_start(1) - - do step = 1, niter - - if (node .eq. root) then - if (mod(step, 20) .eq. 0 .or. - > step .eq. 1) then - write(*, 200) step - 200 format(' Time step ', i4) - endif - endif - - call adi - - end do - - call timer_stop(1) - t = timer_read(1) - - call verify(niter, class, verified) - - call mpi_reduce(t, tmax, 1, - > dp_type, MPI_MAX, - > root, comm_setup, error) - - if( node .eq. root ) then - if( tmax .ne. 0. ) then - mflops = (881.174*float( problem_size )**3 - > -4683.91*float( problem_size )**2 - > +11484.5*float( problem_size ) - > -19272.4) * float( niter ) / (tmax*1000000.0d0) - else - mflops = 0.0 - endif - - call print_results('SP', class, grid_points(1), - > grid_points(2), grid_points(3), niter, maxcells*maxcells, - > total_nodes, tmax, mflops, ' floating point', - > verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5, - > cs6, '(none)') - endif - - 999 continue - call mpi_barrier(MPI_COMM_WORLD, error) - call mpi_finalize(error) - - end diff --git a/examples/smpi/NAS/SP/txinvr.f b/examples/smpi/NAS/SP/txinvr.f deleted file mode 100644 index b5ca4616f5..0000000000 --- a/examples/smpi/NAS/SP/txinvr.f +++ /dev/null @@ -1,59 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine txinvr - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c block-diagonal matrix-vector multiplication -c--------------------------------------------------------------------- - - include 'header.h' - - integer c, i, j, k - double precision t1, t2, t3, ac, ru1, uu, vv, ww, r1, r2, r3, - > r4, r5, ac2inv - -c--------------------------------------------------------------------- -c loop over all cells owned by this node -c--------------------------------------------------------------------- - do c = 1, ncells - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - - ru1 = rho_i(i,j,k,c) - uu = us(i,j,k,c) - vv = vs(i,j,k,c) - ww = ws(i,j,k,c) - ac = speed(i,j,k,c) - ac2inv = ainv(i,j,k,c)*ainv(i,j,k,c) - - r1 = rhs(i,j,k,1,c) - r2 = rhs(i,j,k,2,c) - r3 = rhs(i,j,k,3,c) - r4 = rhs(i,j,k,4,c) - r5 = rhs(i,j,k,5,c) - - t1 = c2 * ac2inv * ( qs(i,j,k,c)*r1 - uu*r2 - - > vv*r3 - ww*r4 + r5 ) - t2 = bt * ru1 * ( uu * r1 - r2 ) - t3 = ( bt * ru1 * ac ) * t1 - - rhs(i,j,k,1,c) = r1 - t1 - rhs(i,j,k,2,c) = - ru1 * ( ww*r1 - r4 ) - rhs(i,j,k,3,c) = ru1 * ( vv*r1 - r3 ) - rhs(i,j,k,4,c) = - t2 + t3 - rhs(i,j,k,5,c) = t2 + t3 - end do - end do - end do - end do - - return - end - - diff --git a/examples/smpi/NAS/SP/tzetar.f b/examples/smpi/NAS/SP/tzetar.f deleted file mode 100644 index 554066d6fc..0000000000 --- a/examples/smpi/NAS/SP/tzetar.f +++ /dev/null @@ -1,60 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine tzetar(c) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c block-diagonal matrix-vector multiplication -c--------------------------------------------------------------------- - - include 'header.h' - - integer i, j, k, c - double precision t1, t2, t3, ac, xvel, yvel, zvel, r1, r2, r3, - > r4, r5, btuz, acinv, ac2u, uzik1 - -c--------------------------------------------------------------------- -c treat only one cell -c--------------------------------------------------------------------- - do k = start(3,c), cell_size(3,c)-end(3,c)-1 - do j = start(2,c), cell_size(2,c)-end(2,c)-1 - do i = start(1,c), cell_size(1,c)-end(1,c)-1 - - xvel = us(i,j,k,c) - yvel = vs(i,j,k,c) - zvel = ws(i,j,k,c) - ac = speed(i,j,k,c) - acinv = ainv(i,j,k,c) - - ac2u = ac*ac - - r1 = rhs(i,j,k,1,c) - r2 = rhs(i,j,k,2,c) - r3 = rhs(i,j,k,3,c) - r4 = rhs(i,j,k,4,c) - r5 = rhs(i,j,k,5,c) - - uzik1 = u(i,j,k,1,c) - btuz = bt * uzik1 - - t1 = btuz*acinv * (r4 + r5) - t2 = r3 + t1 - t3 = btuz * (r4 - r5) - - rhs(i,j,k,1,c) = t2 - rhs(i,j,k,2,c) = -uzik1*r2 + xvel*t2 - rhs(i,j,k,3,c) = uzik1*r1 + yvel*t2 - rhs(i,j,k,4,c) = zvel*t2 + t3 - rhs(i,j,k,5,c) = uzik1*(-xvel*r2 + yvel*r1) + - > qs(i,j,k,c)*t2 + c2iv*ac2u*t1 + zvel*t3 - - end do - end do - end do - - return - end diff --git a/examples/smpi/NAS/SP/verify.f b/examples/smpi/NAS/SP/verify.f deleted file mode 100644 index 08be79c8a6..0000000000 --- a/examples/smpi/NAS/SP/verify.f +++ /dev/null @@ -1,358 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine verify(no_time_steps, class, verified) - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c verification routine -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5), - > epsilon, xce(5), xcr(5), dtref - integer m, no_time_steps - character class - logical verified - -c--------------------------------------------------------------------- -c tolerance level -c--------------------------------------------------------------------- - epsilon = 1.0d-08 - - -c--------------------------------------------------------------------- -c compute the error norm and the residual norm, and exit if not printing -c--------------------------------------------------------------------- - call error_norm(xce) - call copy_faces - - call rhs_norm(xcr) - - do m = 1, 5 - xcr(m) = xcr(m) / dt - enddo - - if (node .ne. 0) return - - class = 'U' - verified = .true. - - do m = 1,5 - xcrref(m) = 1.0 - xceref(m) = 1.0 - end do - -c--------------------------------------------------------------------- -c reference data for 12X12X12 grids after 100 time steps, with DT = 1.50d-02 -c--------------------------------------------------------------------- - if ( (grid_points(1) .eq. 12 ) .and. - > (grid_points(2) .eq. 12 ) .and. - > (grid_points(3) .eq. 12 ) .and. - > (no_time_steps .eq. 100 )) then - - class = 'S' - dtref = 1.5d-2 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 2.7470315451339479d-02 - xcrref(2) = 1.0360746705285417d-02 - xcrref(3) = 1.6235745065095532d-02 - xcrref(4) = 1.5840557224455615d-02 - xcrref(5) = 3.4849040609362460d-02 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - xceref(1) = 2.7289258557377227d-05 - xceref(2) = 1.0364446640837285d-05 - xceref(3) = 1.6154798287166471d-05 - xceref(4) = 1.5750704994480102d-05 - xceref(5) = 3.4177666183390531d-05 - - -c--------------------------------------------------------------------- -c reference data for 36X36X36 grids after 400 time steps, with DT = 1.5d-03 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 36) .and. - > (grid_points(2) .eq. 36) .and. - > (grid_points(3) .eq. 36) .and. - > (no_time_steps . eq. 400) ) then - - class = 'W' - dtref = 1.5d-3 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 0.1893253733584d-02 - xcrref(2) = 0.1717075447775d-03 - xcrref(3) = 0.2778153350936d-03 - xcrref(4) = 0.2887475409984d-03 - xcrref(5) = 0.3143611161242d-02 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - xceref(1) = 0.7542088599534d-04 - xceref(2) = 0.6512852253086d-05 - xceref(3) = 0.1049092285688d-04 - xceref(4) = 0.1128838671535d-04 - xceref(5) = 0.1212845639773d-03 - -c--------------------------------------------------------------------- -c reference data for 64X64X64 grids after 400 time steps, with DT = 1.5d-03 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 64) .and. - > (grid_points(2) .eq. 64) .and. - > (grid_points(3) .eq. 64) .and. - > (no_time_steps . eq. 400) ) then - - class = 'A' - dtref = 1.5d-3 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 2.4799822399300195d0 - xcrref(2) = 1.1276337964368832d0 - xcrref(3) = 1.5028977888770491d0 - xcrref(4) = 1.4217816211695179d0 - xcrref(5) = 2.1292113035138280d0 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - xceref(1) = 1.0900140297820550d-04 - xceref(2) = 3.7343951769282091d-05 - xceref(3) = 5.0092785406541633d-05 - xceref(4) = 4.7671093939528255d-05 - xceref(5) = 1.3621613399213001d-04 - -c--------------------------------------------------------------------- -c reference data for 102X102X102 grids after 400 time steps, -c with DT = 1.0d-03 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 102) .and. - > (grid_points(2) .eq. 102) .and. - > (grid_points(3) .eq. 102) .and. - > (no_time_steps . eq. 400) ) then - - class = 'B' - dtref = 1.0d-3 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 0.6903293579998d+02 - xcrref(2) = 0.3095134488084d+02 - xcrref(3) = 0.4103336647017d+02 - xcrref(4) = 0.3864769009604d+02 - xcrref(5) = 0.5643482272596d+02 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - xceref(1) = 0.9810006190188d-02 - xceref(2) = 0.1022827905670d-02 - xceref(3) = 0.1720597911692d-02 - xceref(4) = 0.1694479428231d-02 - xceref(5) = 0.1847456263981d-01 - -c--------------------------------------------------------------------- -c reference data for 162X162X162 grids after 400 time steps, -c with DT = 0.67d-03 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 162) .and. - > (grid_points(2) .eq. 162) .and. - > (grid_points(3) .eq. 162) .and. - > (no_time_steps . eq. 400) ) then - - class = 'C' - dtref = 0.67d-3 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 0.5881691581829d+03 - xcrref(2) = 0.2454417603569d+03 - xcrref(3) = 0.3293829191851d+03 - xcrref(4) = 0.3081924971891d+03 - xcrref(5) = 0.4597223799176d+03 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - xceref(1) = 0.2598120500183d+00 - xceref(2) = 0.2590888922315d-01 - xceref(3) = 0.5132886416320d-01 - xceref(4) = 0.4806073419454d-01 - xceref(5) = 0.5483377491301d+00 - -c--------------------------------------------------------------------- -c reference data for 408X408X408 grids after 500 time steps, -c with DT = 0.3d-03 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 408) .and. - > (grid_points(2) .eq. 408) .and. - > (grid_points(3) .eq. 408) .and. - > (no_time_steps . eq. 500) ) then - - class = 'D' - dtref = 0.30d-3 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 0.1044696216887d+05 - xcrref(2) = 0.3204427762578d+04 - xcrref(3) = 0.4648680733032d+04 - xcrref(4) = 0.4238923283697d+04 - xcrref(5) = 0.7588412036136d+04 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - xceref(1) = 0.5089471423669d+01 - xceref(2) = 0.5323514855894d+00 - xceref(3) = 0.1187051008971d+01 - xceref(4) = 0.1083734951938d+01 - xceref(5) = 0.1164108338568d+02 - -c--------------------------------------------------------------------- -c reference data for 1020X1020X1020 grids after 500 time steps, -c with DT = 0.1d-03 -c--------------------------------------------------------------------- - elseif ( (grid_points(1) .eq. 1020) .and. - > (grid_points(2) .eq. 1020) .and. - > (grid_points(3) .eq. 1020) .and. - > (no_time_steps . eq. 500) ) then - - class = 'E' - dtref = 0.10d-3 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of residual. -c--------------------------------------------------------------------- - xcrref(1) = 0.6255387422609d+05 - xcrref(2) = 0.1495317020012d+05 - xcrref(3) = 0.2347595750586d+05 - xcrref(4) = 0.2091099783534d+05 - xcrref(5) = 0.4770412841218d+05 - -c--------------------------------------------------------------------- -c Reference values of RMS-norms of solution error. -c--------------------------------------------------------------------- - xceref(1) = 0.6742735164909d+02 - xceref(2) = 0.5390656036938d+01 - xceref(3) = 0.1680647196477d+02 - xceref(4) = 0.1536963126457d+02 - xceref(5) = 0.1575330146156d+03 - - else - verified = .false. - endif - -c--------------------------------------------------------------------- -c verification test for residuals if gridsize is one of -c the defined grid sizes above (class .ne. 'U') -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c Compute the difference of solution values and the known reference values. -c--------------------------------------------------------------------- - do m = 1, 5 - - xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) - xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m)) - - enddo - -c--------------------------------------------------------------------- -c Output the comparison of computed results to known cases. -c--------------------------------------------------------------------- - - if (class .ne. 'U') then - write(*, 1990) class - 1990 format(' Verification being performed for class ', a) - write (*,2000) epsilon - 2000 format(' accuracy setting for epsilon = ', E20.13) - verified = (dabs(dt-dtref) .le. epsilon) - if (.not.verified) then - class = 'U' - write (*,1000) dtref - 1000 format(' DT does not match the reference value of ', - > E15.8) - endif - else - write(*, 1995) - 1995 format(' Unknown class') - endif - - - if (class .ne. 'U') then - write (*,2001) - else - write (*, 2005) - endif - - 2001 format(' Comparison of RMS-norms of residual') - 2005 format(' RMS-norms of residual') - do m = 1, 5 - if (class .eq. 'U') then - write(*, 2015) m, xcr(m) - else if (xcrdif(m) .le. epsilon) then - write (*,2011) m,xcr(m),xcrref(m),xcrdif(m) - else - verified = .false. - write (*,2010) m,xcr(m),xcrref(m),xcrdif(m) - endif - enddo - - if (class .ne. 'U') then - write (*,2002) - else - write (*,2006) - endif - 2002 format(' Comparison of RMS-norms of solution error') - 2006 format(' RMS-norms of solution error') - - do m = 1, 5 - if (class .eq. 'U') then - write(*, 2015) m, xce(m) - else if (xcedif(m) .le. epsilon) then - write (*,2011) m,xce(m),xceref(m),xcedif(m) - else - verified = .false. - write (*,2010) m,xce(m),xceref(m),xcedif(m) - endif - enddo - - 2010 format(' FAILURE: ', i2, E20.13, E20.13, E20.13) - 2011 format(' ', i2, E20.13, E20.13, E20.13) - 2015 format(' ', i2, E20.13) - - if (class .eq. 'U') then - write(*, 2022) - write(*, 2023) - 2022 format(' No reference values provided') - 2023 format(' No verification performed') - else if (verified) then - write(*, 2020) - 2020 format(' Verification Successful') - else - write(*, 2021) - 2021 format(' Verification failed') - endif - - return - - - end diff --git a/examples/smpi/NAS/SP/x_solve.f b/examples/smpi/NAS/SP/x_solve.f deleted file mode 100644 index cd40756ec3..0000000000 --- a/examples/smpi/NAS/SP/x_solve.f +++ /dev/null @@ -1,545 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine x_solve - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c this function performs the solution of the approximate factorization -c step in the x-direction for all five matrix components -c simultaneously. The Thomas algorithm is employed to solve the -c systems for the x-lines. Boundary conditions are non-periodic -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - - integer i, j, k, jp, kp, n, iend, jsize, ksize, i1, i2, - > buffer_size, c, m, p, istart, stage, error, - > requests(2), statuses(MPI_STATUS_SIZE, 2) - double precision r1, r2, d, e, s(5), sm1, sm2, - > fac1, fac2 - - - -c--------------------------------------------------------------------- -c OK, now we know that there are multiple processors -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells -c on this node in the direction of increasing i for the forward sweep, -c and after that reversing the direction for the backsubstitution. -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c FORWARD ELIMINATION -c--------------------------------------------------------------------- - do stage = 1, ncells - c = slice(1,stage) - - istart = 0 - iend = cell_size(1,c)-1 - - jsize = cell_size(2,c) - ksize = cell_size(3,c) - jp = cell_coord(2,c)-1 - kp = cell_coord(3,c)-1 - - buffer_size = (jsize-start(2,c)-end(2,c)) * - > (ksize-start(3,c)-end(3,c)) - - if ( stage .ne. 1) then - -c--------------------------------------------------------------------- -c if this is not the first processor in this row of cells, -c receive data from predecessor containing the right hand -c sides and the upper diagonal elements of the previous two rows -c--------------------------------------------------------------------- - call mpi_irecv(in_buffer, 22*buffer_size, - > dp_type, predecessor(1), - > DEFAULT_TAG, comm_solve, - > requests(1), error) - - -c--------------------------------------------------------------------- -c communication has already been started. -c compute the left hand side while waiting for the msg -c--------------------------------------------------------------------- - call lhsx(c) - -c--------------------------------------------------------------------- -c wait for pending communication to complete -c This waits on the current receive and on the send -c from the previous stage. They always come in pairs. -c--------------------------------------------------------------------- - - call mpi_waitall(2, requests, statuses, error) - -c--------------------------------------------------------------------- -c unpack the buffer -c--------------------------------------------------------------------- - i = istart - i1 = istart + 1 - n = 0 - -c--------------------------------------------------------------------- -c create a running pointer -c--------------------------------------------------------------------- - p = 0 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - - > in_buffer(p+1) * lhs(i,j,k,n+1,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - - > in_buffer(p+2) * lhs(i,j,k,n+1,c) - do m = 1, 3 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > in_buffer(p+2+m) * lhs(i,j,k,n+1,c) - end do - d = in_buffer(p+6) - e = in_buffer(p+7) - do m = 1, 3 - s(m) = in_buffer(p+7+m) - end do - r1 = lhs(i,j,k,n+2,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 - lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 - do m = 1, 3 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 - end do - r2 = lhs(i1,j,k,n+1,c) - lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2 - lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2 - do m = 1, 3 - rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2 - end do - p = p + 10 - end do - end do - - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - - > in_buffer(p+1) * lhs(i,j,k,n+1,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - - > in_buffer(p+2) * lhs(i,j,k,n+1,c) - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > in_buffer(p+3) * lhs(i,j,k,n+1,c) - d = in_buffer(p+4) - e = in_buffer(p+5) - s(m) = in_buffer(p+6) - r1 = lhs(i,j,k,n+2,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 - lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 - r2 = lhs(i1,j,k,n+1,c) - lhs(i1,j,k,n+2,c) = lhs(i1,j,k,n+2,c) - d * r2 - lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - e * r2 - rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - s(m) * r2 - p = p + 6 - end do - end do - end do - - else - -c--------------------------------------------------------------------- -c if this IS the first cell, we still compute the lhs -c--------------------------------------------------------------------- - call lhsx(c) - endif - -c--------------------------------------------------------------------- -c perform the Thomas algorithm; first, FORWARD ELIMINATION -c--------------------------------------------------------------------- - n = 0 - - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - do i = istart, iend-2 - i1 = i + 1 - i2 = i + 2 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - end do - lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - - > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) - - > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - - > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c) - end do - lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) - - > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c) - lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) - - > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) - - > lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c The last two rows in this grid block are a bit different, -c since they do not have two more rows available for the -c elimination of off-diagonal entries -c--------------------------------------------------------------------- - - i = iend - 1 - i1 = iend - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - end do - lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - - > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) - - > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - - > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c) - end do -c--------------------------------------------------------------------- -c scale the last row immediately (some of this is -c overkill in case this is the last cell) -c--------------------------------------------------------------------- - fac2 = 1.d0/lhs(i1,j,k,n+3,c) - lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c) - lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c) - do m = 1, 3 - rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c) - end do - end do - end do - -c--------------------------------------------------------------------- -c do the u+c and the u-c factors -c--------------------------------------------------------------------- - - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - do i = istart, iend-2 - i1 = i + 1 - i2 = i + 2 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - - > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) - - > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c) - rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - - > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c) - lhs(i2,j,k,n+2,c) = lhs(i2,j,k,n+2,c) - - > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+4,c) - lhs(i2,j,k,n+3,c) = lhs(i2,j,k,n+3,c) - - > lhs(i2,j,k,n+1,c)*lhs(i,j,k,n+5,c) - rhs(i2,j,k,m,c) = rhs(i2,j,k,m,c) - - > lhs(i2,j,k,n+1,c)*rhs(i,j,k,m,c) - end do - end do - end do - -c--------------------------------------------------------------------- -c And again the last two rows separately -c--------------------------------------------------------------------- - i = iend - 1 - i1 = iend - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - lhs(i1,j,k,n+3,c) = lhs(i1,j,k,n+3,c) - - > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i1,j,k,n+4,c) = lhs(i1,j,k,n+4,c) - - > lhs(i1,j,k,n+2,c)*lhs(i,j,k,n+5,c) - rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - - > lhs(i1,j,k,n+2,c)*rhs(i,j,k,m,c) -c--------------------------------------------------------------------- -c Scale the last row immediately (some of this is overkill -c if this is the last cell) -c--------------------------------------------------------------------- - fac2 = 1.d0/lhs(i1,j,k,n+3,c) - lhs(i1,j,k,n+4,c) = fac2*lhs(i1,j,k,n+4,c) - lhs(i1,j,k,n+5,c) = fac2*lhs(i1,j,k,n+5,c) - rhs(i1,j,k,m,c) = fac2*rhs(i1,j,k,m,c) - - end do - end do - end do - -c--------------------------------------------------------------------- -c send information to the next processor, except when this -c is the last grid block -c--------------------------------------------------------------------- - if (stage .ne. ncells) then - -c--------------------------------------------------------------------- -c create a running pointer for the send buffer -c--------------------------------------------------------------------- - p = 0 - n = 0 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - do i = iend-1, iend - out_buffer(p+1) = lhs(i,j,k,n+4,c) - out_buffer(p+2) = lhs(i,j,k,n+5,c) - do m = 1, 3 - out_buffer(p+2+m) = rhs(i,j,k,m,c) - end do - p = p+5 - end do - end do - end do - - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - do i = iend-1, iend - out_buffer(p+1) = lhs(i,j,k,n+4,c) - out_buffer(p+2) = lhs(i,j,k,n+5,c) - out_buffer(p+3) = rhs(i,j,k,m,c) - p = p + 3 - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c send data to next phase -c can't receive data yet because buffer size will be wrong -c--------------------------------------------------------------------- - call mpi_isend(out_buffer, 22*buffer_size, - > dp_type, successor(1), - > DEFAULT_TAG, comm_solve, - > requests(2), error) - - endif - end do - -c--------------------------------------------------------------------- -c now go in the reverse direction -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c BACKSUBSTITUTION -c--------------------------------------------------------------------- - do stage = ncells, 1, -1 - c = slice(1,stage) - - istart = 0 - iend = cell_size(1,c)-1 - - jsize = cell_size(2,c) - ksize = cell_size(3,c) - jp = cell_coord(2,c)-1 - kp = cell_coord(3,c)-1 - - buffer_size = (jsize-start(2,c)-end(2,c)) * - > (ksize-start(3,c)-end(3,c)) - - if (stage .ne. ncells) then - -c--------------------------------------------------------------------- -c if this is not the starting cell in this row of cells, -c wait for a message to be received, containing the -c solution of the previous two stations -c--------------------------------------------------------------------- - call mpi_irecv(in_buffer, 10*buffer_size, - > dp_type, successor(1), - > DEFAULT_TAG, comm_solve, - > requests(1), error) - - -c--------------------------------------------------------------------- -c communication has already been started -c while waiting, do the block-diagonal inversion for the -c cell that was just finished -c--------------------------------------------------------------------- - - call ninvr(slice(1,stage+1)) - -c--------------------------------------------------------------------- -c wait for pending communication to complete -c--------------------------------------------------------------------- - call mpi_waitall(2, requests, statuses, error) - -c--------------------------------------------------------------------- -c unpack the buffer for the first three factors -c--------------------------------------------------------------------- - n = 0 - p = 0 - i = iend - i1 = i - 1 - do m = 1, 3 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - sm1 = in_buffer(p+1) - sm2 = in_buffer(p+2) - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*sm1 - - > lhs(i,j,k,n+5,c)*sm2 - rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - - > lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) - - > lhs(i1,j,k,n+5,c) * sm1 - p = p + 2 - end do - end do - end do - -c--------------------------------------------------------------------- -c now unpack the buffer for the remaining two factors -c--------------------------------------------------------------------- - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - sm1 = in_buffer(p+1) - sm2 = in_buffer(p+2) - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*sm1 - - > lhs(i,j,k,n+5,c)*sm2 - rhs(i1,j,k,m,c) = rhs(i1,j,k,m,c) - - > lhs(i1,j,k,n+4,c) * rhs(i,j,k,m,c) - - > lhs(i1,j,k,n+5,c) * sm1 - p = p + 2 - end do - end do - end do - - else - -c--------------------------------------------------------------------- -c now we know this is the first grid block on the back sweep, -c so we don't need a message to start the substitution. -c--------------------------------------------------------------------- - i = iend-1 - i1 = iend - n = 0 - do m = 1, 3 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) - end do - end do - end do - - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c Whether or not this is the last processor, we always have -c to complete the back-substitution -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c The first three factors -c--------------------------------------------------------------------- - n = 0 - do m = 1, 3 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - do i = iend-2, istart, -1 - i1 = i + 1 - i2 = i + 2 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) - - > lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c And the remaining two -c--------------------------------------------------------------------- - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - do i = iend-2, istart, -1 - i1 = i + 1 - i2 = i + 2 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i1,j,k,m,c) - - > lhs(i,j,k,n+5,c)*rhs(i2,j,k,m,c) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c send on information to the previous processor, if needed -c--------------------------------------------------------------------- - if (stage .ne. 1) then - i = istart - i1 = istart+1 - p = 0 - do m = 1, 5 - do k = start(3,c), ksize-end(3,c)-1 - do j = start(2,c), jsize-end(2,c)-1 - out_buffer(p+1) = rhs(i,j,k,m,c) - out_buffer(p+2) = rhs(i1,j,k,m,c) - p = p + 2 - end do - end do - end do - -c--------------------------------------------------------------------- -c pack and send the buffer -c--------------------------------------------------------------------- - call mpi_isend(out_buffer, 10*buffer_size, - > dp_type, predecessor(1), - > DEFAULT_TAG, comm_solve, - > requests(2), error) - - endif - -c--------------------------------------------------------------------- -c If this was the last stage, do the block-diagonal inversion -c--------------------------------------------------------------------- - if (stage .eq. 1) call ninvr(c) - - end do - - return - end - - - - - - - diff --git a/examples/smpi/NAS/SP/y_solve.f b/examples/smpi/NAS/SP/y_solve.f deleted file mode 100644 index fdcbb4d03f..0000000000 --- a/examples/smpi/NAS/SP/y_solve.f +++ /dev/null @@ -1,538 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine y_solve - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c this function performs the solution of the approximate factorization -c step in the y-direction for all five matrix components -c simultaneously. The Thomas algorithm is employed to solve the -c systems for the y-lines. Boundary conditions are non-periodic -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i, j, k, stage, ip, kp, n, isize, jend, ksize, j1, j2, - > buffer_size, c, m, p, jstart, error, - > requests(2), statuses(MPI_STATUS_SIZE, 2) - double precision r1, r2, d, e, s(5), sm1, sm2, - > fac1, fac2 - - -c--------------------------------------------------------------------- -c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells -c on this node in the direction of increasing i for the forward sweep, -c and after that reversing the direction for the backsubstitution -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c FORWARD ELIMINATION -c--------------------------------------------------------------------- - do stage = 1, ncells - c = slice(2,stage) - - jstart = 0 - jend = cell_size(2,c)-1 - - isize = cell_size(1,c) - ksize = cell_size(3,c) - ip = cell_coord(1,c)-1 - kp = cell_coord(3,c)-1 - - buffer_size = (isize-start(1,c)-end(1,c)) * - > (ksize-start(3,c)-end(3,c)) - - if ( stage .ne. 1) then - -c--------------------------------------------------------------------- -c if this is not the first processor in this row of cells, -c receive data from predecessor containing the right hand -c sides and the upper diagonal elements of the previous two rows -c--------------------------------------------------------------------- - - call mpi_irecv(in_buffer, 22*buffer_size, - > dp_type, predecessor(2), - > DEFAULT_TAG, comm_solve, - > requests(1), error) - -c--------------------------------------------------------------------- -c communication has already been started. -c compute the left hand side while waiting for the msg -c--------------------------------------------------------------------- - call lhsy(c) - -c--------------------------------------------------------------------- -c wait for pending communication to complete -c This waits on the current receive and on the send -c from the previous stage. They always come in pairs. -c--------------------------------------------------------------------- - call mpi_waitall(2, requests, statuses, error) - -c--------------------------------------------------------------------- -c unpack the buffer -c--------------------------------------------------------------------- - j = jstart - j1 = jstart + 1 - n = 0 -c--------------------------------------------------------------------- -c create a running pointer -c--------------------------------------------------------------------- - p = 0 - do k = start(3,c), ksize-end(3,c)-1 - do i = start(1,c), isize-end(1,c)-1 - lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - - > in_buffer(p+1) * lhs(i,j,k,n+1,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - - > in_buffer(p+2) * lhs(i,j,k,n+1,c) - do m = 1, 3 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > in_buffer(p+2+m) * lhs(i,j,k,n+1,c) - end do - d = in_buffer(p+6) - e = in_buffer(p+7) - do m = 1, 3 - s(m) = in_buffer(p+7+m) - end do - r1 = lhs(i,j,k,n+2,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 - lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 - do m = 1, 3 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 - end do - r2 = lhs(i,j1,k,n+1,c) - lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2 - lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2 - do m = 1, 3 - rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2 - end do - p = p + 10 - end do - end do - - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do i = start(1,c), isize-end(1,c)-1 - lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - - > in_buffer(p+1) * lhs(i,j,k,n+1,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - - > in_buffer(p+2) * lhs(i,j,k,n+1,c) - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > in_buffer(p+3) * lhs(i,j,k,n+1,c) - d = in_buffer(p+4) - e = in_buffer(p+5) - s(m) = in_buffer(p+6) - r1 = lhs(i,j,k,n+2,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 - lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 - r2 = lhs(i,j1,k,n+1,c) - lhs(i,j1,k,n+2,c) = lhs(i,j1,k,n+2,c) - d * r2 - lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - e * r2 - rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - s(m) * r2 - p = p + 6 - end do - end do - end do - - else - -c--------------------------------------------------------------------- -c if this IS the first cell, we still compute the lhs -c--------------------------------------------------------------------- - call lhsy(c) - endif - -c--------------------------------------------------------------------- -c perform the Thomas algorithm; first, FORWARD ELIMINATION -c--------------------------------------------------------------------- - n = 0 - - do k = start(3,c), ksize-end(3,c)-1 - do j = jstart, jend-2 - do i = start(1,c), isize-end(1,c)-1 - j1 = j + 1 - j2 = j + 2 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - end do - lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - - > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) - - > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - - > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c) - end do - lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) - - > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c) - lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) - - > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) - - > lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c The last two rows in this grid block are a bit different, -c since they do not have two more rows available for the -c elimination of off-diagonal entries -c--------------------------------------------------------------------- - - j = jend - 1 - j1 = jend - do k = start(3,c), ksize-end(3,c)-1 - do i = start(1,c), isize-end(1,c)-1 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - end do - lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - - > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) - - > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - - > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c) - end do -c--------------------------------------------------------------------- -c scale the last row immediately (some of this is -c overkill in case this is the last cell) -c--------------------------------------------------------------------- - fac2 = 1.d0/lhs(i,j1,k,n+3,c) - lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c) - lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c) - do m = 1, 3 - rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c) - end do - end do - end do - -c--------------------------------------------------------------------- -c do the u+c and the u-c factors -c--------------------------------------------------------------------- - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do j = jstart, jend-2 - do i = start(1,c), isize-end(1,c)-1 - j1 = j + 1 - j2 = j + 2 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - - > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) - - > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c) - rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - - > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c) - lhs(i,j2,k,n+2,c) = lhs(i,j2,k,n+2,c) - - > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+4,c) - lhs(i,j2,k,n+3,c) = lhs(i,j2,k,n+3,c) - - > lhs(i,j2,k,n+1,c)*lhs(i,j,k,n+5,c) - rhs(i,j2,k,m,c) = rhs(i,j2,k,m,c) - - > lhs(i,j2,k,n+1,c)*rhs(i,j,k,m,c) - end do - end do - end do - -c--------------------------------------------------------------------- -c And again the last two rows separately -c--------------------------------------------------------------------- - j = jend - 1 - j1 = jend - do k = start(3,c), ksize-end(3,c)-1 - do i = start(1,c), isize-end(1,c)-1 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - lhs(i,j1,k,n+3,c) = lhs(i,j1,k,n+3,c) - - > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i,j1,k,n+4,c) = lhs(i,j1,k,n+4,c) - - > lhs(i,j1,k,n+2,c)*lhs(i,j,k,n+5,c) - rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - - > lhs(i,j1,k,n+2,c)*rhs(i,j,k,m,c) -c--------------------------------------------------------------------- -c Scale the last row immediately (some of this is overkill -c if this is the last cell) -c--------------------------------------------------------------------- - fac2 = 1.d0/lhs(i,j1,k,n+3,c) - lhs(i,j1,k,n+4,c) = fac2*lhs(i,j1,k,n+4,c) - lhs(i,j1,k,n+5,c) = fac2*lhs(i,j1,k,n+5,c) - rhs(i,j1,k,m,c) = fac2*rhs(i,j1,k,m,c) - - end do - end do - end do - -c--------------------------------------------------------------------- -c send information to the next processor, except when this -c is the last grid block; -c--------------------------------------------------------------------- - - if (stage .ne. ncells) then - -c--------------------------------------------------------------------- -c create a running pointer for the send buffer -c--------------------------------------------------------------------- - p = 0 - n = 0 - do k = start(3,c), ksize-end(3,c)-1 - do i = start(1,c), isize-end(1,c)-1 - do j = jend-1, jend - out_buffer(p+1) = lhs(i,j,k,n+4,c) - out_buffer(p+2) = lhs(i,j,k,n+5,c) - do m = 1, 3 - out_buffer(p+2+m) = rhs(i,j,k,m,c) - end do - p = p+5 - end do - end do - end do - - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do i = start(1,c), isize-end(1,c)-1 - do j = jend-1, jend - out_buffer(p+1) = lhs(i,j,k,n+4,c) - out_buffer(p+2) = lhs(i,j,k,n+5,c) - out_buffer(p+3) = rhs(i,j,k,m,c) - p = p + 3 - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c pack and send the buffer -c--------------------------------------------------------------------- - call mpi_isend(out_buffer, 22*buffer_size, - > dp_type, successor(2), - > DEFAULT_TAG, comm_solve, - > requests(2), error) - - endif - end do - -c--------------------------------------------------------------------- -c now go in the reverse direction -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c BACKSUBSTITUTION -c--------------------------------------------------------------------- - do stage = ncells, 1, -1 - c = slice(2,stage) - - jstart = 0 - jend = cell_size(2,c)-1 - - isize = cell_size(1,c) - ksize = cell_size(3,c) - ip = cell_coord(1,c)-1 - kp = cell_coord(3,c)-1 - - buffer_size = (isize-start(1,c)-end(1,c)) * - > (ksize-start(3,c)-end(3,c)) - - if (stage .ne. ncells) then - -c--------------------------------------------------------------------- -c if this is not the starting cell in this row of cells, -c wait for a message to be received, containing the -c solution of the previous two stations -c--------------------------------------------------------------------- - - call mpi_irecv(in_buffer, 10*buffer_size, - > dp_type, successor(2), - > DEFAULT_TAG, comm_solve, - > requests(1), error) - - -c--------------------------------------------------------------------- -c communication has already been started -c while waiting, do the block-diagonal inversion for the -c cell that was just finished -c--------------------------------------------------------------------- - - call pinvr(slice(2,stage+1)) - -c--------------------------------------------------------------------- -c wait for pending communication to complete -c--------------------------------------------------------------------- - call mpi_waitall(2, requests, statuses, error) - -c--------------------------------------------------------------------- -c unpack the buffer for the first three factors -c--------------------------------------------------------------------- - n = 0 - p = 0 - j = jend - j1 = j - 1 - do m = 1, 3 - do k = start(3,c), ksize-end(3,c)-1 - do i = start(1,c), isize-end(1,c)-1 - sm1 = in_buffer(p+1) - sm2 = in_buffer(p+2) - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*sm1 - - > lhs(i,j,k,n+5,c)*sm2 - rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - - > lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) - - > lhs(i,j1,k,n+5,c) * sm1 - p = p + 2 - end do - end do - end do - -c--------------------------------------------------------------------- -c now unpack the buffer for the remaining two factors -c--------------------------------------------------------------------- - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do i = start(1,c), isize-end(1,c)-1 - sm1 = in_buffer(p+1) - sm2 = in_buffer(p+2) - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*sm1 - - > lhs(i,j,k,n+5,c)*sm2 - rhs(i,j1,k,m,c) = rhs(i,j1,k,m,c) - - > lhs(i,j1,k,n+4,c) * rhs(i,j,k,m,c) - - > lhs(i,j1,k,n+5,c) * sm1 - p = p + 2 - end do - end do - end do - - else -c--------------------------------------------------------------------- -c now we know this is the first grid block on the back sweep, -c so we don't need a message to start the substitution. -c--------------------------------------------------------------------- - - j = jend - 1 - j1 = jend - n = 0 - do m = 1, 3 - do k = start(3,c), ksize-end(3,c)-1 - do i = start(1,c), isize-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) - end do - end do - end do - - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do i = start(1,c), isize-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c Whether or not this is the last processor, we always have -c to complete the back-substitution -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c The first three factors -c--------------------------------------------------------------------- - n = 0 - do m = 1, 3 - do k = start(3,c), ksize-end(3,c)-1 - do j = jend-2, jstart, -1 - do i = start(1,c), isize-end(1,c)-1 - j1 = j + 1 - j2 = j + 2 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) - - > lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c And the remaining two -c--------------------------------------------------------------------- - do m = 4, 5 - n = (m-3)*5 - do k = start(3,c), ksize-end(3,c)-1 - do j = jend-2, jstart, -1 - do i = start(1,c), isize-end(1,c)-1 - j1 = j + 1 - j2 = j1 + 1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i,j1,k,m,c) - - > lhs(i,j,k,n+5,c)*rhs(i,j2,k,m,c) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c send on information to the previous processor, if needed -c--------------------------------------------------------------------- - if (stage .ne. 1) then - j = jstart - j1 = jstart + 1 - p = 0 - do m = 1, 5 - do k = start(3,c), ksize-end(3,c)-1 - do i = start(1,c), isize-end(1,c)-1 - out_buffer(p+1) = rhs(i,j,k,m,c) - out_buffer(p+2) = rhs(i,j1,k,m,c) - p = p + 2 - end do - end do - end do - -c--------------------------------------------------------------------- -c pack and send the buffer -c--------------------------------------------------------------------- - - call mpi_isend(out_buffer, 10*buffer_size, - > dp_type, predecessor(2), - > DEFAULT_TAG, comm_solve, - > requests(2), error) - - endif - -c--------------------------------------------------------------------- -c If this was the last stage, do the block-diagonal inversion -c--------------------------------------------------------------------- - if (stage .eq. 1) call pinvr(c) - - end do - - return - end - - - - - - - diff --git a/examples/smpi/NAS/SP/z_solve.f b/examples/smpi/NAS/SP/z_solve.f deleted file mode 100644 index ad0dc7e727..0000000000 --- a/examples/smpi/NAS/SP/z_solve.f +++ /dev/null @@ -1,532 +0,0 @@ - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - - subroutine z_solve - -c--------------------------------------------------------------------- -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c this function performs the solution of the approximate factorization -c step in the z-direction for all five matrix components -c simultaneously. The Thomas algorithm is employed to solve the -c systems for the z-lines. Boundary conditions are non-periodic -c--------------------------------------------------------------------- - - include 'header.h' - include 'mpinpb.h' - - integer i, j, k, stage, ip, jp, n, isize, jsize, kend, k1, k2, - > buffer_size, c, m, p, kstart, error, - > requests(2), statuses(MPI_STATUS_SIZE, 2) - double precision r1, r2, d, e, s(5), sm1, sm2, - > fac1, fac2 - -c--------------------------------------------------------------------- -c now do a sweep on a layer-by-layer basis, i.e. sweeping through cells -c on this node in the direction of increasing i for the forward sweep, -c and after that reversing the direction for the backsubstitution -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c FORWARD ELIMINATION -c--------------------------------------------------------------------- - do stage = 1, ncells - c = slice(3,stage) - - kstart = 0 - kend = cell_size(3,c)-1 - - isize = cell_size(1,c) - jsize = cell_size(2,c) - ip = cell_coord(1,c)-1 - jp = cell_coord(2,c)-1 - - buffer_size = (isize-start(1,c)-end(1,c)) * - > (jsize-start(2,c)-end(2,c)) - - if (stage .ne. 1) then - - -c--------------------------------------------------------------------- -c if this is not the first processor in this row of cells, -c receive data from predecessor containing the right hand -c sides and the upper diagonal elements of the previous two rows -c--------------------------------------------------------------------- - - call mpi_irecv(in_buffer, 22*buffer_size, - > dp_type, predecessor(3), - > DEFAULT_TAG, comm_solve, - > requests(1), error) - - -c--------------------------------------------------------------------- -c communication has already been started. -c compute the left hand side while waiting for the msg -c--------------------------------------------------------------------- - call lhsz(c) - -c--------------------------------------------------------------------- -c wait for pending communication to complete -c--------------------------------------------------------------------- - call mpi_waitall(2, requests, statuses, error) - -c--------------------------------------------------------------------- -c unpack the buffer -c--------------------------------------------------------------------- - k = kstart - k1 = kstart + 1 - n = 0 - -c--------------------------------------------------------------------- -c create a running pointer -c--------------------------------------------------------------------- - p = 0 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - - > in_buffer(p+1) * lhs(i,j,k,n+1,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - - > in_buffer(p+2) * lhs(i,j,k,n+1,c) - do m = 1, 3 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > in_buffer(p+2+m) * lhs(i,j,k,n+1,c) - end do - d = in_buffer(p+6) - e = in_buffer(p+7) - do m = 1, 3 - s(m) = in_buffer(p+7+m) - end do - r1 = lhs(i,j,k,n+2,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 - lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 - do m = 1, 3 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 - end do - r2 = lhs(i,j,k1,n+1,c) - lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2 - lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2 - do m = 1, 3 - rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2 - end do - p = p + 10 - end do - end do - - do m = 4, 5 - n = (m-3)*5 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - lhs(i,j,k,n+2,c) = lhs(i,j,k,n+2,c) - - > in_buffer(p+1) * lhs(i,j,k,n+1,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - - > in_buffer(p+2) * lhs(i,j,k,n+1,c) - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > in_buffer(p+3) * lhs(i,j,k,n+1,c) - d = in_buffer(p+4) - e = in_buffer(p+5) - s(m) = in_buffer(p+6) - r1 = lhs(i,j,k,n+2,c) - lhs(i,j,k,n+3,c) = lhs(i,j,k,n+3,c) - d * r1 - lhs(i,j,k,n+4,c) = lhs(i,j,k,n+4,c) - e * r1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - s(m) * r1 - r2 = lhs(i,j,k1,n+1,c) - lhs(i,j,k1,n+2,c) = lhs(i,j,k1,n+2,c) - d * r2 - lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - e * r2 - rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - s(m) * r2 - p = p + 6 - end do - end do - end do - - else - -c--------------------------------------------------------------------- -c if this IS the first cell, we still compute the lhs -c--------------------------------------------------------------------- - call lhsz(c) - endif - -c--------------------------------------------------------------------- -c perform the Thomas algorithm; first, FORWARD ELIMINATION -c--------------------------------------------------------------------- - n = 0 - - do k = kstart, kend-2 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - k1 = k + 1 - k2 = k + 2 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - end do - lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - - > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) - end do - lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) - - > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c) - lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) - - > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) - - > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c The last two rows in this grid block are a bit different, -c since they do not have two more rows available for the -c elimination of off-diagonal entries -c--------------------------------------------------------------------- - k = kend - 1 - k1 = kend - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - end do - lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) - do m = 1, 3 - rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - - > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) - end do -c--------------------------------------------------------------------- -c scale the last row immediately (some of this is -c overkill in case this is the last cell) -c--------------------------------------------------------------------- - fac2 = 1.d0/lhs(i,j,k1,n+3,c) - lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c) - lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c) - do m = 1, 3 - rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c) - end do - end do - end do - -c--------------------------------------------------------------------- -c do the u+c and the u-c factors -c--------------------------------------------------------------------- - do m = 4, 5 - n = (m-3)*5 - do k = kstart, kend-2 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - k1 = k + 1 - k2 = k + 2 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) - rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - - > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) - lhs(i,j,k2,n+2,c) = lhs(i,j,k2,n+2,c) - - > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+4,c) - lhs(i,j,k2,n+3,c) = lhs(i,j,k2,n+3,c) - - > lhs(i,j,k2,n+1,c)*lhs(i,j,k,n+5,c) - rhs(i,j,k2,m,c) = rhs(i,j,k2,m,c) - - > lhs(i,j,k2,n+1,c)*rhs(i,j,k,m,c) - end do - end do - end do - -c--------------------------------------------------------------------- -c And again the last two rows separately -c--------------------------------------------------------------------- - k = kend - 1 - k1 = kend - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - fac1 = 1.d0/lhs(i,j,k,n+3,c) - lhs(i,j,k,n+4,c) = fac1*lhs(i,j,k,n+4,c) - lhs(i,j,k,n+5,c) = fac1*lhs(i,j,k,n+5,c) - rhs(i,j,k,m,c) = fac1*rhs(i,j,k,m,c) - lhs(i,j,k1,n+3,c) = lhs(i,j,k1,n+3,c) - - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+4,c) - lhs(i,j,k1,n+4,c) = lhs(i,j,k1,n+4,c) - - > lhs(i,j,k1,n+2,c)*lhs(i,j,k,n+5,c) - rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - - > lhs(i,j,k1,n+2,c)*rhs(i,j,k,m,c) -c--------------------------------------------------------------------- -c Scale the last row immediately (some of this is overkill -c if this is the last cell) -c--------------------------------------------------------------------- - fac2 = 1.d0/lhs(i,j,k1,n+3,c) - lhs(i,j,k1,n+4,c) = fac2*lhs(i,j,k1,n+4,c) - lhs(i,j,k1,n+5,c) = fac2*lhs(i,j,k1,n+5,c) - rhs(i,j,k1,m,c) = fac2*rhs(i,j,k1,m,c) - - end do - end do - end do - -c--------------------------------------------------------------------- -c send information to the next processor, except when this -c is the last grid block, -c--------------------------------------------------------------------- - - if (stage .ne. ncells) then - -c--------------------------------------------------------------------- -c create a running pointer for the send buffer -c--------------------------------------------------------------------- - p = 0 - n = 0 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - do k = kend-1, kend - out_buffer(p+1) = lhs(i,j,k,n+4,c) - out_buffer(p+2) = lhs(i,j,k,n+5,c) - do m = 1, 3 - out_buffer(p+2+m) = rhs(i,j,k,m,c) - end do - p = p+5 - end do - end do - end do - - do m = 4, 5 - n = (m-3)*5 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - do k = kend-1, kend - out_buffer(p+1) = lhs(i,j,k,n+4,c) - out_buffer(p+2) = lhs(i,j,k,n+5,c) - out_buffer(p+3) = rhs(i,j,k,m,c) - p = p + 3 - end do - end do - end do - end do - - - call mpi_isend(out_buffer, 22*buffer_size, - > dp_type, successor(3), - > DEFAULT_TAG, comm_solve, - > requests(2), error) - - endif - end do - -c--------------------------------------------------------------------- -c now go in the reverse direction -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c BACKSUBSTITUTION -c--------------------------------------------------------------------- - do stage = ncells, 1, -1 - c = slice(3,stage) - - kstart = 0 - kend = cell_size(3,c)-1 - - isize = cell_size(1,c) - jsize = cell_size(2,c) - ip = cell_coord(1,c)-1 - jp = cell_coord(2,c)-1 - - buffer_size = (isize-start(1,c)-end(1,c)) * - > (jsize-start(2,c)-end(2,c)) - - if (stage .ne. ncells) then - -c--------------------------------------------------------------------- -c if this is not the starting cell in this row of cells, -c wait for a message to be received, containing the -c solution of the previous two stations -c--------------------------------------------------------------------- - - call mpi_irecv(in_buffer, 10*buffer_size, - > dp_type, successor(3), - > DEFAULT_TAG, comm_solve, - > requests(1), error) - - -c--------------------------------------------------------------------- -c communication has already been started -c while waiting, do the block-diagonal inversion for the -c cell that was just finished -c--------------------------------------------------------------------- - - call tzetar(slice(3,stage+1)) - -c--------------------------------------------------------------------- -c wait for pending communication to complete -c--------------------------------------------------------------------- - call mpi_waitall(2, requests, statuses, error) - -c--------------------------------------------------------------------- -c unpack the buffer for the first three factors -c--------------------------------------------------------------------- - n = 0 - p = 0 - k = kend - k1 = k - 1 - do m = 1, 3 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - sm1 = in_buffer(p+1) - sm2 = in_buffer(p+2) - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*sm1 - - > lhs(i,j,k,n+5,c)*sm2 - rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - - > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - - > lhs(i,j,k1,n+5,c) * sm1 - p = p + 2 - end do - end do - end do - -c--------------------------------------------------------------------- -c now unpack the buffer for the remaining two factors -c--------------------------------------------------------------------- - do m = 4, 5 - n = (m-3)*5 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - sm1 = in_buffer(p+1) - sm2 = in_buffer(p+2) - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*sm1 - - > lhs(i,j,k,n+5,c)*sm2 - rhs(i,j,k1,m,c) = rhs(i,j,k1,m,c) - - > lhs(i,j,k1,n+4,c) * rhs(i,j,k,m,c) - - > lhs(i,j,k1,n+5,c) * sm1 - p = p + 2 - end do - end do - end do - - else - -c--------------------------------------------------------------------- -c now we know this is the first grid block on the back sweep, -c so we don't need a message to start the substitution. -c--------------------------------------------------------------------- - - k = kend - 1 - k1 = kend - n = 0 - do m = 1, 3 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) - end do - end do - end do - - do m = 4, 5 - n = (m-3)*5 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) - end do - end do - end do - endif - -c--------------------------------------------------------------------- -c Whether or not this is the last processor, we always have -c to complete the back-substitution -c--------------------------------------------------------------------- - -c--------------------------------------------------------------------- -c The first three factors -c--------------------------------------------------------------------- - n = 0 - do m = 1, 3 - do k = kend-2, kstart, -1 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - k1 = k + 1 - k2 = k + 2 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) - - > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c And the remaining two -c--------------------------------------------------------------------- - do m = 4, 5 - n = (m-3)*5 - do k = kend-2, kstart, -1 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - k1 = k + 1 - k2 = k + 2 - rhs(i,j,k,m,c) = rhs(i,j,k,m,c) - - > lhs(i,j,k,n+4,c)*rhs(i,j,k1,m,c) - - > lhs(i,j,k,n+5,c)*rhs(i,j,k2,m,c) - end do - end do - end do - end do - -c--------------------------------------------------------------------- -c send on information to the previous processor, if needed -c--------------------------------------------------------------------- - if (stage .ne. 1) then - k = kstart - k1 = kstart + 1 - p = 0 - do m = 1, 5 - do j = start(2,c), jsize-end(2,c)-1 - do i = start(1,c), isize-end(1,c)-1 - out_buffer(p+1) = rhs(i,j,k,m,c) - out_buffer(p+2) = rhs(i,j,k1,m,c) - p = p + 2 - end do - end do - end do - - call mpi_isend(out_buffer, 10*buffer_size, - > dp_type, predecessor(3), - > DEFAULT_TAG, comm_solve, - > requests(2), error) - - endif - -c--------------------------------------------------------------------- -c If this was the last stage, do the block-diagonal inversion -c--------------------------------------------------------------------- - if (stage .eq. 1) call tzetar(c) - - end do - - return - end - - - - - - -