Logo AND Algorithmique Numérique Distribuée

Public GIT Repository
Added our tweaked version of NAS benchmarks.
authorpini <pini@48e7efb5-ca39-0410-a469-dd3cf9ba447f>
Thu, 11 Mar 2010 14:47:35 +0000 (14:47 +0000)
committerpini <pini@48e7efb5-ca39-0410-a469-dd3cf9ba447f>
Thu, 11 Mar 2010 14:47:35 +0000 (14:47 +0000)
git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/simgrid/simgrid/trunk@7221 48e7efb5-ca39-0410-a469-dd3cf9ba447f

178 files changed:
examples/smpi/NAS/BT/Makefile [new file with mode: 0644]
examples/smpi/NAS/BT/add.f [new file with mode: 0644]
examples/smpi/NAS/BT/adi.f [new file with mode: 0644]
examples/smpi/NAS/BT/bt.f [new file with mode: 0644]
examples/smpi/NAS/BT/btio.f [new file with mode: 0644]
examples/smpi/NAS/BT/btio_common.f [new file with mode: 0644]
examples/smpi/NAS/BT/copy_faces.f [new file with mode: 0644]
examples/smpi/NAS/BT/define.f [new file with mode: 0644]
examples/smpi/NAS/BT/epio.f [new file with mode: 0644]
examples/smpi/NAS/BT/error.f [new file with mode: 0644]
examples/smpi/NAS/BT/exact_rhs.f [new file with mode: 0644]
examples/smpi/NAS/BT/exact_solution.f [new file with mode: 0644]
examples/smpi/NAS/BT/fortran_io.f [new file with mode: 0644]
examples/smpi/NAS/BT/full_mpiio.f [new file with mode: 0644]
examples/smpi/NAS/BT/header.h [new file with mode: 0644]
examples/smpi/NAS/BT/initialize.f [new file with mode: 0644]
examples/smpi/NAS/BT/inputbt.data.sample [new file with mode: 0644]
examples/smpi/NAS/BT/make_set.f [new file with mode: 0644]
examples/smpi/NAS/BT/mpinpb.h [new file with mode: 0644]
examples/smpi/NAS/BT/rhs.f [new file with mode: 0644]
examples/smpi/NAS/BT/set_constants.f [new file with mode: 0644]
examples/smpi/NAS/BT/setup_mpi.f [new file with mode: 0644]
examples/smpi/NAS/BT/simple_mpiio.f [new file with mode: 0644]
examples/smpi/NAS/BT/solve_subs.f [new file with mode: 0644]
examples/smpi/NAS/BT/verify.f [new file with mode: 0644]
examples/smpi/NAS/BT/work_lhs.h [new file with mode: 0644]
examples/smpi/NAS/BT/work_lhs_vec.h [new file with mode: 0644]
examples/smpi/NAS/BT/x_solve.f [new file with mode: 0644]
examples/smpi/NAS/BT/x_solve_vec.f [new file with mode: 0644]
examples/smpi/NAS/BT/y_solve.f [new file with mode: 0644]
examples/smpi/NAS/BT/y_solve_vec.f [new file with mode: 0644]
examples/smpi/NAS/BT/z_solve.f [new file with mode: 0644]
examples/smpi/NAS/BT/z_solve_vec.f [new file with mode: 0644]
examples/smpi/NAS/CG/Makefile [new file with mode: 0644]
examples/smpi/NAS/CG/cg.f [new file with mode: 0644]
examples/smpi/NAS/CG/mpinpb.h [new file with mode: 0644]
examples/smpi/NAS/DT/DGraph.c [new file with mode: 0644]
examples/smpi/NAS/DT/DGraph.h [new file with mode: 0644]
examples/smpi/NAS/DT/Makefile [new file with mode: 0644]
examples/smpi/NAS/DT/README [new file with mode: 0644]
examples/smpi/NAS/DT/dt.c [new file with mode: 0644]
examples/smpi/NAS/EP/Makefile [new file with mode: 0644]
examples/smpi/NAS/EP/README [new file with mode: 0644]
examples/smpi/NAS/EP/ep.c [new file with mode: 0644]
examples/smpi/NAS/EP/ep.f [new file with mode: 0644]
examples/smpi/NAS/EP/mpinpb.h [new file with mode: 0644]
examples/smpi/NAS/EP/randlc.c [new file with mode: 0644]
examples/smpi/NAS/EP/randlc.h [new file with mode: 0644]
examples/smpi/NAS/FT/Makefile [new file with mode: 0644]
examples/smpi/NAS/FT/README [new file with mode: 0644]
examples/smpi/NAS/FT/ft.f [new file with mode: 0644]
examples/smpi/NAS/FT/global.h [new file with mode: 0644]
examples/smpi/NAS/FT/inputft.data.sample [new file with mode: 0644]
examples/smpi/NAS/FT/mpinpb.h [new file with mode: 0644]
examples/smpi/NAS/IS/Makefile [new file with mode: 0644]
examples/smpi/NAS/IS/is.c [new file with mode: 0644]
examples/smpi/NAS/LU/Makefile [new file with mode: 0644]
examples/smpi/NAS/LU/applu.incl [new file with mode: 0644]
examples/smpi/NAS/LU/bcast_inputs.f [new file with mode: 0644]
examples/smpi/NAS/LU/blts.f [new file with mode: 0644]
examples/smpi/NAS/LU/blts_vec.f [new file with mode: 0644]
examples/smpi/NAS/LU/buts.f [new file with mode: 0644]
examples/smpi/NAS/LU/buts_vec.f [new file with mode: 0644]
examples/smpi/NAS/LU/erhs.f [new file with mode: 0644]
examples/smpi/NAS/LU/error.f [new file with mode: 0644]
examples/smpi/NAS/LU/exact.f [new file with mode: 0644]
examples/smpi/NAS/LU/exchange_1.f [new file with mode: 0644]
examples/smpi/NAS/LU/exchange_3.f [new file with mode: 0644]
examples/smpi/NAS/LU/exchange_4.f [new file with mode: 0644]
examples/smpi/NAS/LU/exchange_5.f [new file with mode: 0644]
examples/smpi/NAS/LU/exchange_6.f [new file with mode: 0644]
examples/smpi/NAS/LU/init_comm.f [new file with mode: 0644]
examples/smpi/NAS/LU/inputlu.data.sample [new file with mode: 0644]
examples/smpi/NAS/LU/jacld.f [new file with mode: 0644]
examples/smpi/NAS/LU/jacu.f [new file with mode: 0644]
examples/smpi/NAS/LU/l2norm.f [new file with mode: 0644]
examples/smpi/NAS/LU/lu.f [new file with mode: 0644]
examples/smpi/NAS/LU/mpinpb.h [new file with mode: 0644]
examples/smpi/NAS/LU/neighbors.f [new file with mode: 0644]
examples/smpi/NAS/LU/nodedim.f [new file with mode: 0644]
examples/smpi/NAS/LU/pintgr.f [new file with mode: 0644]
examples/smpi/NAS/LU/proc_grid.f [new file with mode: 0644]
examples/smpi/NAS/LU/read_input.f [new file with mode: 0644]
examples/smpi/NAS/LU/rhs.f [new file with mode: 0644]
examples/smpi/NAS/LU/setbv.f [new file with mode: 0644]
examples/smpi/NAS/LU/setcoeff.f [new file with mode: 0644]
examples/smpi/NAS/LU/sethyper.f [new file with mode: 0644]
examples/smpi/NAS/LU/setiv.f [new file with mode: 0644]
examples/smpi/NAS/LU/ssor.f [new file with mode: 0644]
examples/smpi/NAS/LU/subdomain.f [new file with mode: 0644]
examples/smpi/NAS/LU/verify.f [new file with mode: 0644]
examples/smpi/NAS/MG/Makefile [new file with mode: 0644]
examples/smpi/NAS/MG/README [new file with mode: 0644]
examples/smpi/NAS/MG/globals.h [new file with mode: 0644]
examples/smpi/NAS/MG/mg.f [new file with mode: 0644]
examples/smpi/NAS/MG/mg.input.sample [new file with mode: 0644]
examples/smpi/NAS/MG/mpinpb.h [new file with mode: 0644]
examples/smpi/NAS/MPI_dummy/Makefile [new file with mode: 0644]
examples/smpi/NAS/MPI_dummy/README [new file with mode: 0644]
examples/smpi/NAS/MPI_dummy/mpi.h [new file with mode: 0644]
examples/smpi/NAS/MPI_dummy/mpi_dummy.c [new file with mode: 0644]
examples/smpi/NAS/MPI_dummy/mpi_dummy.f [new file with mode: 0644]
examples/smpi/NAS/MPI_dummy/mpif.h [new file with mode: 0644]
examples/smpi/NAS/MPI_dummy/test.f [new file with mode: 0644]
examples/smpi/NAS/MPI_dummy/wtime.c [new file with mode: 0644]
examples/smpi/NAS/MPI_dummy/wtime.f [new file with mode: 0644]
examples/smpi/NAS/MPI_dummy/wtime.h [new file with mode: 0644]
examples/smpi/NAS/MPI_dummy/wtime_sgi64.c [new file with mode: 0644]
examples/smpi/NAS/Makefile [new file with mode: 0644]
examples/smpi/NAS/README [new file with mode: 0644]
examples/smpi/NAS/README.install [new file with mode: 0644]
examples/smpi/NAS/SP/Makefile [new file with mode: 0644]
examples/smpi/NAS/SP/README [new file with mode: 0644]
examples/smpi/NAS/SP/add.f [new file with mode: 0644]
examples/smpi/NAS/SP/adi.f [new file with mode: 0644]
examples/smpi/NAS/SP/copy_faces.f [new file with mode: 0644]
examples/smpi/NAS/SP/define.f [new file with mode: 0644]
examples/smpi/NAS/SP/error.f [new file with mode: 0644]
examples/smpi/NAS/SP/exact_rhs.f [new file with mode: 0644]
examples/smpi/NAS/SP/exact_solution.f [new file with mode: 0644]
examples/smpi/NAS/SP/header.h [new file with mode: 0644]
examples/smpi/NAS/SP/initialize.f [new file with mode: 0644]
examples/smpi/NAS/SP/inputsp.data.sample [new file with mode: 0644]
examples/smpi/NAS/SP/lhsx.f [new file with mode: 0644]
examples/smpi/NAS/SP/lhsy.f [new file with mode: 0644]
examples/smpi/NAS/SP/lhsz.f [new file with mode: 0644]
examples/smpi/NAS/SP/make_set.f [new file with mode: 0644]
examples/smpi/NAS/SP/mpinpb.h [new file with mode: 0644]
examples/smpi/NAS/SP/ninvr.f [new file with mode: 0644]
examples/smpi/NAS/SP/pinvr.f [new file with mode: 0644]
examples/smpi/NAS/SP/rhs.f [new file with mode: 0644]
examples/smpi/NAS/SP/set_constants.f [new file with mode: 0644]
examples/smpi/NAS/SP/setup_mpi.f [new file with mode: 0644]
examples/smpi/NAS/SP/sp.f [new file with mode: 0644]
examples/smpi/NAS/SP/txinvr.f [new file with mode: 0644]
examples/smpi/NAS/SP/tzetar.f [new file with mode: 0644]
examples/smpi/NAS/SP/verify.f [new file with mode: 0644]
examples/smpi/NAS/SP/x_solve.f [new file with mode: 0644]
examples/smpi/NAS/SP/y_solve.f [new file with mode: 0644]
examples/smpi/NAS/SP/z_solve.f [new file with mode: 0644]
examples/smpi/NAS/common/c_print_results.c [new file with mode: 0644]
examples/smpi/NAS/common/c_timers.c [new file with mode: 0644]
examples/smpi/NAS/common/print_results.f [new file with mode: 0644]
examples/smpi/NAS/common/randdp.c [new file with mode: 0644]
examples/smpi/NAS/common/randdp.f [new file with mode: 0644]
examples/smpi/NAS/common/randdpvec.f [new file with mode: 0644]
examples/smpi/NAS/common/randi8.f [new file with mode: 0644]
examples/smpi/NAS/common/randi8_safe.f [new file with mode: 0644]
examples/smpi/NAS/common/timers.f [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/README [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/make.def.dec_alpha [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/make.def.irix6.2 [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/make.def.origin [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/make.def.sgi_powerchallenge [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/make.def.sp2_babbage [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/make.def.sun_ultra_sparc [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/make.def.t3d_cosmos [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/make.def_sun_mpich [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/suite.def.bt [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/suite.def.cg [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/suite.def.ep [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/suite.def.ft [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/suite.def.is [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/suite.def.lu [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/suite.def.mg [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/suite.def.small [new file with mode: 0644]
examples/smpi/NAS/config/NAS.samples/suite.def.sp [new file with mode: 0644]
examples/smpi/NAS/config/make.def [new file with mode: 0644]
examples/smpi/NAS/config/make.def.template [new file with mode: 0644]
examples/smpi/NAS/config/make.dummy [new file with mode: 0644]
examples/smpi/NAS/config/suite.def.template [new file with mode: 0644]
examples/smpi/NAS/sys/Makefile [new file with mode: 0644]
examples/smpi/NAS/sys/README [new file with mode: 0644]
examples/smpi/NAS/sys/make.common [new file with mode: 0644]
examples/smpi/NAS/sys/print_header [new file with mode: 0755]
examples/smpi/NAS/sys/print_instructions [new file with mode: 0755]
examples/smpi/NAS/sys/setparams.c [new file with mode: 0644]
examples/smpi/NAS/sys/suite.awk [new file with mode: 0644]

diff --git a/examples/smpi/NAS/BT/Makefile b/examples/smpi/NAS/BT/Makefile
new file mode 100644 (file)
index 0000000..dd27503
--- /dev/null
@@ -0,0 +1,106 @@
+SHELL=/bin/sh
+BENCHMARK=bt
+BENCHMARKU=BT
+VEC=
+
+include ../config/make.def
+
+
+OBJS = bt.o make_set.o initialize.o exact_solution.o exact_rhs.o \
+       set_constants.o adi.o define.o copy_faces.o rhs.o solve_subs.o \
+       x_solve$(VEC).o y_solve$(VEC).o z_solve$(VEC).o add.o error.o \
+       verify.o setup_mpi.o \
+       ${COMMON}/print_results.o ${COMMON}/timers.o
+
+include ../sys/make.common
+
+# npbparams.h is included by header.h
+# The following rule should do the trick but many make programs (not gmake)
+# will do the wrong thing and rebuild the world every time (because the
+# mod time on header.h is not changed. One solution would be to 
+# touch header.h but this might cause confusion if someone has
+# accidentally deleted it. Instead, make the dependency on npbparams.h
+# explicit in all the lines below (even though dependence is indirect). 
+
+# header.h: npbparams.h
+
+${PROGRAM}: config
+       @if [ x$(VERSION) = xvec ] ; then       \
+               ${MAKE} VEC=_vec exec;          \
+       elif [ x$(VERSION) = xVEC ] ; then      \
+               ${MAKE} VEC=_vec exec;          \
+       else                                    \
+               ${MAKE} exec;                   \
+       fi
+
+exec: $(OBJS)
+       @if [ x$(SUBTYPE) = xfull ] ; then      \
+               ${MAKE} bt-full;                \
+       elif [ x$(SUBTYPE) = xFULL ] ; then     \
+               ${MAKE} bt-full;                \
+       elif [ x$(SUBTYPE) = xsimple ] ; then   \
+               ${MAKE} bt-simple;              \
+       elif [ x$(SUBTYPE) = xSIMPLE ] ; then   \
+               ${MAKE} bt-simple;              \
+       elif [ x$(SUBTYPE) = xfortran ] ; then  \
+               ${MAKE} bt-fortran;             \
+       elif [ x$(SUBTYPE) = xFORTRAN ] ; then  \
+               ${MAKE} bt-fortran;             \
+       elif [ x$(SUBTYPE) = xepio ] ; then     \
+               ${MAKE} bt-epio;                \
+       elif [ x$(SUBTYPE) = xEPIO ] ; then     \
+               ${MAKE} bt-epio;                \
+       else                                    \
+               ${MAKE} bt-bt;                  \
+       fi
+
+bt-bt: ${OBJS} btio.o
+       ${FLINK} ${FLINKFLAGS} -o ${PROGRAM} ${OBJS} btio.o ${FMPI_LIB}
+
+bt-full: ${OBJS} full_mpiio.o btio_common.o
+       ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_full ${OBJS} btio_common.o full_mpiio.o ${FMPI_LIB}
+
+bt-simple: ${OBJS} simple_mpiio.o btio_common.o
+       ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.mpi_io_simple ${OBJS} btio_common.o simple_mpiio.o ${FMPI_LIB}
+
+bt-fortran: ${OBJS} fortran_io.o btio_common.o
+       ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.fortran_io ${OBJS} btio_common.o fortran_io.o ${FMPI_LIB}
+
+bt-epio: ${OBJS} epio.o btio_common.o
+       ${FLINK} ${FLINKFLAGS} -o ${PROGRAM}.ep_io ${OBJS} btio_common.o epio.o ${FMPI_LIB}
+
+.f.o:
+       ${FCOMPILE} $<
+
+.c.o:
+       ${CCOMPILE} $<
+
+
+bt.o:             bt.f  header.h npbparams.h  mpinpb.h
+make_set.o:       make_set.f  header.h npbparams.h  mpinpb.h
+initialize.o:     initialize.f  header.h npbparams.h
+exact_solution.o: exact_solution.f  header.h npbparams.h
+exact_rhs.o:      exact_rhs.f  header.h npbparams.h
+set_constants.o:  set_constants.f  header.h npbparams.h
+adi.o:            adi.f  header.h npbparams.h
+define.o:         define.f  header.h npbparams.h
+copy_faces.o:     copy_faces.f  header.h npbparams.h  mpinpb.h
+rhs.o:            rhs.f  header.h npbparams.h
+x_solve$(VEC).o:  x_solve$(VEC).f  header.h work_lhs$(VEC).h npbparams.h  mpinpb.h
+y_solve$(VEC).o:  y_solve$(VEC).f  header.h work_lhs$(VEC).h npbparams.h  mpinpb.h
+z_solve$(VEC).o:  z_solve$(VEC).f  header.h work_lhs$(VEC).h npbparams.h  mpinpb.h
+solve_subs.o:     solve_subs.f  npbparams.h
+add.o:            add.f  header.h npbparams.h
+error.o:          error.f  header.h npbparams.h  mpinpb.h
+verify.o:         verify.f  header.h npbparams.h  mpinpb.h
+setup_mpi.o:      setup_mpi.f mpinpb.h npbparams.h 
+btio.o:           btio.f  header.h npbparams.h
+btio_common.o:    btio_common.f mpinpb.h npbparams.h 
+fortran_io.o:     fortran_io.f mpinpb.h npbparams.h 
+simple_mpiio.o:   simple_mpiio.f mpinpb.h npbparams.h 
+full_mpiio.o:     full_mpiio.f mpinpb.h npbparams.h 
+epio.o:           epio.f mpinpb.h npbparams.h 
+
+clean:
+       - rm -f *.o *~ mputil*
+       - rm -f  npbparams.h core
diff --git a/examples/smpi/NAS/BT/add.f b/examples/smpi/NAS/BT/add.f
new file mode 100644 (file)
index 0000000..e14cde4
--- /dev/null
@@ -0,0 +1,30 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine  add
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     addition of update to the vector u
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer  c, i, j, k, m
+
+      do     c = 1, ncells
+         do     k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do     j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do     i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do    m = 1, 5
+                     u(m,i,j,k,c) = u(m,i,j,k,c) + rhs(m,i,j,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/adi.f b/examples/smpi/NAS/BT/adi.f
new file mode 100644 (file)
index 0000000..58450c0
--- /dev/null
@@ -0,0 +1,21 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine  adi
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      call copy_faces
+
+      call x_solve
+
+      call y_solve
+
+      call z_solve
+
+      call add
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/bt.f b/examples/smpi/NAS/BT/bt.f
new file mode 100644 (file)
index 0000000..36e5078
--- /dev/null
@@ -0,0 +1,275 @@
+!-------------------------------------------------------------------------!
+!                                                                         !
+!        N  A  S     P A R A L L E L     B E N C H M A R K S  3.3         !
+!                                                                         !
+!                                   B T                                   !
+!                                                                         !
+!-------------------------------------------------------------------------!
+!                                                                         !
+!    This benchmark is part of the NAS Parallel Benchmark 3.3 suite.      !
+!    It is described in NAS Technical Reports 95-020 and 02-007.          !
+!                                                                         !
+!    Permission to use, copy, distribute and modify this software         !
+!    for any purpose with or without fee is hereby granted.  We           !
+!    request, however, that all derived work reference the NAS            !
+!    Parallel Benchmarks 3.3. This software is provided "as is"           !
+!    without express or implied warranty.                                 !
+!                                                                         !
+!    Information on NPB 3.3, including the technical report, the          !
+!    original specifications, source code, results and information        !
+!    on how to submit new results, is available at:                       !
+!                                                                         !
+!           http://www.nas.nasa.gov/Software/NPB/                         !
+!                                                                         !
+!    Send comments or suggestions to  npb@nas.nasa.gov                    !
+!                                                                         !
+!          NAS Parallel Benchmarks Group                                  !
+!          NASA Ames Research Center                                      !
+!          Mail Stop: T27A-1                                              !
+!          Moffett Field, CA   94035-1000                                 !
+!                                                                         !
+!          E-mail:  npb@nas.nasa.gov                                      !
+!          Fax:     (650) 604-3957                                        !
+!                                                                         !
+!-------------------------------------------------------------------------!
+
+c---------------------------------------------------------------------
+c
+c Authors: R. F. Van der Wijngaart
+c          T. Harris
+c          M. Yarrow
+c
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+       program MPBT
+c---------------------------------------------------------------------
+
+       include  'header.h'
+       include  'mpinpb.h'
+      
+       integer i, niter, step, c, error, fstatus
+       double precision navg, mflops, mbytes, n3
+
+       external timer_read
+       double precision t, tmax, tiominv, tpc, timer_read
+       logical verified
+       character class, cbuff*40
+
+       integer wr_interval
+
+       call setup_mpi
+       if (.not. active) goto 999
+
+c---------------------------------------------------------------------
+c      Root node reads input file (if it exists) else takes
+c      defaults from parameters
+c---------------------------------------------------------------------
+       if (node .eq. root) then
+          
+          write(*, 1000)
+          open (unit=2,file='inputbt.data',status='old', iostat=fstatus)
+c
+          rd_interval = 0
+          if (fstatus .eq. 0) then
+            write(*,233) 
+ 233        format(' Reading from input file inputbt.data')
+            read (2,*) niter
+            read (2,*) dt
+            read (2,*) grid_points(1), grid_points(2), grid_points(3)
+            if (iotype .ne. 0) then
+                read (2,'(A)') cbuff
+                read (cbuff,*,iostat=i) wr_interval, rd_interval
+                if (i .ne. 0) rd_interval = 0
+                if (wr_interval .le. 0) wr_interval = wr_default
+            endif
+            if (iotype .eq. 1) then
+                read (2,*) collbuf_nodes, collbuf_size
+                write(*,*) 'collbuf_nodes ', collbuf_nodes
+                write(*,*) 'collbuf_size  ', collbuf_size
+            endif
+            close(2)
+          else
+            write(*,234) 
+            niter = niter_default
+            dt    = dt_default
+            grid_points(1) = problem_size
+            grid_points(2) = problem_size
+            grid_points(3) = problem_size
+            wr_interval = wr_default
+            if (iotype .eq. 1) then
+c             set number of nodes involved in collective buffering to 4,
+c             unless total number of nodes is smaller than that.
+c             set buffer size for collective buffering to 1MB per node
+c             collbuf_nodes = min(4,no_nodes)
+c             set default to No-File-Hints with a value of 0
+              collbuf_nodes = 0
+              collbuf_size = 1000000
+            endif
+          endif
+ 234      format(' No input file inputbt.data. Using compiled defaults')
+
+          write(*, 1001) grid_points(1), grid_points(2), grid_points(3)
+          write(*, 1002) niter, dt
+          if (no_nodes .ne. total_nodes) write(*, 1004) total_nodes
+          if (no_nodes .ne. maxcells*maxcells) 
+     >        write(*, 1005) maxcells*maxcells
+          write(*, 1003) no_nodes
+
+          if (iotype .eq. 1) write(*, 1006) 'FULL MPI-IO', wr_interval
+          if (iotype .eq. 2) write(*, 1006) 'SIMPLE MPI-IO', wr_interval
+          if (iotype .eq. 3) write(*, 1006) 'EPIO', wr_interval
+          if (iotype .eq. 4) write(*, 1006) 'FORTRAN IO', wr_interval
+
+ 1000 format(//, ' NAS Parallel Benchmarks 3.3 -- BT Benchmark ',/)
+ 1001     format(' Size: ', i4, 'x', i4, 'x', i4)
+ 1002     format(' Iterations: ', i4, '    dt: ', F11.7)
+ 1004     format(' Total number of processes: ', i5)
+ 1005     format(' WARNING: compiled for ', i5, ' processes ')
+ 1003     format(' Number of active processes: ', i5, /)
+ 1006     format(' BTIO -- ', A, ' write interval: ', i3 /)
+
+       endif
+
+       call mpi_bcast(niter, 1, MPI_INTEGER,
+     >                root, comm_setup, error)
+
+       call mpi_bcast(dt, 1, dp_type, 
+     >                root, comm_setup, error)
+
+       call mpi_bcast(grid_points(1), 3, MPI_INTEGER, 
+     >                root, comm_setup, error)
+
+       call mpi_bcast(wr_interval, 1, MPI_INTEGER,
+     >                root, comm_setup, error)
+
+       call mpi_bcast(rd_interval, 1, MPI_INTEGER,
+     >                root, comm_setup, error)
+
+       call make_set
+
+       do  c = 1, maxcells
+          if ( (cell_size(1,c) .gt. IMAX) .or.
+     >         (cell_size(2,c) .gt. JMAX) .or.
+     >         (cell_size(3,c) .gt. KMAX) ) then
+             print *,node, c, (cell_size(i,c),i=1,3)
+             print *,' Problem size too big for compiled array sizes'
+             goto 999
+          endif
+       end do
+
+       call set_constants
+
+       call initialize
+
+       call setup_btio
+       idump = 0
+
+       call lhsinit
+
+       call exact_rhs
+
+       call compute_buffer_size(5)
+
+c---------------------------------------------------------------------
+c      do one time step to touch all code, and reinitialize
+c---------------------------------------------------------------------
+       call adi
+       call initialize
+
+       call timer_clear(2)
+
+c---------------------------------------------------------------------
+c      Synchronize before placing time stamp
+c---------------------------------------------------------------------
+       call mpi_barrier(comm_setup, error)
+
+       call timer_clear(1)
+       call timer_start(1)
+
+       do  step = 1, niter
+
+          if (node .eq. root) then
+             if (mod(step, 20) .eq. 0 .or. step .eq. niter .or.
+     >           step .eq. 1) then
+                write(*, 200) step
+ 200            format(' Time step ', i4)
+             endif
+          endif
+
+          call adi
+
+          if (iotype .ne. 0) then
+              call timer_start(2)
+              if (mod(step, wr_interval).eq.0 .or. step .eq. niter) then
+                  if (node .eq. root) then
+                      print *, 'Writing data set, time step', step
+                  endif
+                  if (step .eq. niter .and. rd_interval .gt. 1) then
+                      rd_interval = 1
+                  endif
+                  call output_timestep
+                  idump = idump + 1
+              endif
+              call timer_stop(2)
+          endif
+       end do
+
+       call btio_cleanup
+
+       call timer_stop(1)
+       t = timer_read(1)
+       
+       call verify(niter, class, verified)
+
+       call mpi_reduce(t, tmax, 1, 
+     >                 dp_type, MPI_MAX, 
+     >                 root, comm_setup, error)
+
+       if (iotype .ne. 0) then
+          t = timer_read(2)
+          if (t .ne. 0.d0) t = 1.0d0 / t
+          call mpi_reduce(t, tiominv, 1, 
+     >                    dp_type, MPI_SUM, 
+     >                    root, comm_setup, error)
+       endif
+
+       if( node .eq. root ) then
+          n3 = 1.0d0*grid_points(1)*grid_points(2)*grid_points(3)
+          navg = (grid_points(1)+grid_points(2)+grid_points(3))/3.0
+          if( tmax .ne. 0. ) then
+             mflops = 1.0e-6*float(niter)*
+     >     (3478.8*n3-17655.7*navg**2+28023.7*navg)
+     >     / tmax
+          else
+             mflops = 0.0
+          endif
+
+          if (iotype .ne. 0) then
+             mbytes = n3 * 40.0 * idump * 1.0d-6
+             tiominv = tiominv / no_nodes
+             t = 0.0
+             if (tiominv .ne. 0.) t = 1.d0 / tiominv
+             tpc = 0.0
+             if (tmax .ne. 0.) tpc = t * 100.0 / tmax
+             write(*,1100) t, tpc, mbytes, mbytes*tiominv
+ 1100        format(/' BTIO -- statistics:'/
+     >               '   I/O timing in seconds   : ', f14.2/
+     >               '   I/O timing percentage   : ', f14.2/
+     >               '   Total data written (MB) : ', f14.2/
+     >               '   I/O data rate  (MB/sec) : ', f14.2)
+          endif
+
+         call print_results('BT', class, grid_points(1), 
+     >     grid_points(2), grid_points(3), niter, maxcells*maxcells, 
+     >     total_nodes, tmax, mflops, '          floating point', 
+     >     verified, npbversion,compiletime, cs1, cs2, cs3, cs4, cs5, 
+     >     cs6, '(none)')
+       endif
+
+ 999   continue
+       call mpi_barrier(MPI_COMM_WORLD, error)
+       call mpi_finalize(error)
+
+       end
+
diff --git a/examples/smpi/NAS/BT/btio.f b/examples/smpi/NAS/BT/btio.f
new file mode 100644 (file)
index 0000000..1fb730b
--- /dev/null
@@ -0,0 +1,72 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_verify(verified)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      logical verified
+
+      verified = .true.
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      double precision xce_acc(5)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine checksum_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/btio_common.f b/examples/smpi/NAS/BT/btio_common.f
new file mode 100644 (file)
index 0000000..9227a12
--- /dev/null
@@ -0,0 +1,30 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine clear_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer cio, kio, jio, ix
+
+      do cio=1,ncells
+          do kio=0, cell_size(3,cio)-1
+              do jio=0, cell_size(2,cio)-1
+                  do ix=0,cell_size(1,cio)-1
+                            u(1,ix, jio,kio,cio) = 0
+                            u(2,ix, jio,kio,cio) = 0
+                            u(3,ix, jio,kio,cio) = 0
+                            u(4,ix, jio,kio,cio) = 0
+                            u(5,ix, jio,kio,cio) = 0
+                  enddo
+              enddo
+          enddo
+      enddo
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/copy_faces.f b/examples/smpi/NAS/BT/copy_faces.f
new file mode 100644 (file)
index 0000000..14b82ca
--- /dev/null
@@ -0,0 +1,316 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine copy_faces
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     
+c This function copies the face values of a variable defined on a set 
+c of cells to the overlap locations of the adjacent sets of cells. 
+c Because a set of cells interfaces in each direction with exactly one 
+c other set, we only need to fill six different buffers. We could try to 
+c overlap communication with computation, by computing
+c some internal values while communicating boundary values, but this
+c adds so much overhead that it's not clearly useful. 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i, j, k, c, m, requests(0:11), p0, p1, 
+     >     p2, p3, p4, p5, b_size(0:5), ss(0:5), 
+     >     sr(0:5), error, statuses(MPI_STATUS_SIZE, 0:11)
+
+c---------------------------------------------------------------------
+c     exit immediately if there are no faces to be copied           
+c---------------------------------------------------------------------
+      if (no_nodes .eq. 1) then
+         call compute_rhs
+         return
+      endif
+
+      ss(0) = start_send_east
+      ss(1) = start_send_west
+      ss(2) = start_send_north
+      ss(3) = start_send_south
+      ss(4) = start_send_top
+      ss(5) = start_send_bottom
+
+      sr(0) = start_recv_east
+      sr(1) = start_recv_west
+      sr(2) = start_recv_north
+      sr(3) = start_recv_south
+      sr(4) = start_recv_top
+      sr(5) = start_recv_bottom
+
+      b_size(0) = east_size   
+      b_size(1) = west_size   
+      b_size(2) = north_size  
+      b_size(3) = south_size  
+      b_size(4) = top_size    
+      b_size(5) = bottom_size 
+
+c---------------------------------------------------------------------
+c     because the difference stencil for the diagonalized scheme is 
+c     orthogonal, we do not have to perform the staged copying of faces, 
+c     but can send all face information simultaneously to the neighboring 
+c     cells in all directions          
+c---------------------------------------------------------------------
+      p0 = 0
+      p1 = 0
+      p2 = 0
+      p3 = 0
+      p4 = 0
+      p5 = 0
+
+      do  c = 1, ncells
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to eastern neighbors (i-dir)
+c---------------------------------------------------------------------
+         if (cell_coord(1,c) .ne. ncells) then
+            do   k = 0, cell_size(3,c)-1
+               do   j = 0, cell_size(2,c)-1
+                  do   i = cell_size(1,c)-2, cell_size(1,c)-1
+                     do   m = 1, 5
+                        out_buffer(ss(0)+p0) = u(m,i,j,k,c)
+                        p0 = p0 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to western neighbors 
+c---------------------------------------------------------------------
+         if (cell_coord(1,c) .ne. 1) then
+            do   k = 0, cell_size(3,c)-1
+               do   j = 0, cell_size(2,c)-1
+                  do   i = 0, 1
+                     do   m = 1, 5
+                        out_buffer(ss(1)+p1) = u(m,i,j,k,c)
+                        p1 = p1 + 1
+                     end do
+                  end do
+               end do
+            end do
+
+         endif
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to northern neighbors (j_dir)
+c---------------------------------------------------------------------
+         if (cell_coord(2,c) .ne. ncells) then
+            do   k = 0, cell_size(3,c)-1
+               do   j = cell_size(2,c)-2, cell_size(2,c)-1
+                  do   i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        out_buffer(ss(2)+p2) = u(m,i,j,k,c)
+                        p2 = p2 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to southern neighbors 
+c---------------------------------------------------------------------
+         if (cell_coord(2,c).ne. 1) then
+            do   k = 0, cell_size(3,c)-1
+               do   j = 0, 1
+                  do   i = 0, cell_size(1,c)-1   
+                     do   m = 1, 5
+                        out_buffer(ss(3)+p3) = u(m,i,j,k,c)
+                        p3 = p3 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to top neighbors (k-dir)
+c---------------------------------------------------------------------
+         if (cell_coord(3,c) .ne. ncells) then
+            do   k = cell_size(3,c)-2, cell_size(3,c)-1
+               do   j = 0, cell_size(2,c)-1
+                  do   i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        out_buffer(ss(4)+p4) = u(m,i,j,k,c)
+                        p4 = p4 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     fill the buffer to be sent to bottom neighbors
+c---------------------------------------------------------------------
+         if (cell_coord(3,c).ne. 1) then
+            do    k=0, 1
+               do   j = 0, cell_size(2,c)-1
+                  do   i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        out_buffer(ss(5)+p5) = u(m,i,j,k,c)
+                        p5 = p5 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     cell loop
+c---------------------------------------------------------------------
+      end do
+
+      call mpi_irecv(in_buffer(sr(0)), b_size(0), 
+     >     dp_type, successor(1), WEST,  
+     >     comm_rhs, requests(0), error)
+      call mpi_irecv(in_buffer(sr(1)), b_size(1), 
+     >     dp_type, predecessor(1), EAST,  
+     >     comm_rhs, requests(1), error)
+      call mpi_irecv(in_buffer(sr(2)), b_size(2), 
+     >     dp_type, successor(2), SOUTH, 
+     >     comm_rhs, requests(2), error)
+      call mpi_irecv(in_buffer(sr(3)), b_size(3), 
+     >     dp_type, predecessor(2), NORTH, 
+     >     comm_rhs, requests(3), error)
+      call mpi_irecv(in_buffer(sr(4)), b_size(4), 
+     >     dp_type, successor(3), BOTTOM,
+     >     comm_rhs, requests(4), error)
+      call mpi_irecv(in_buffer(sr(5)), b_size(5), 
+     >     dp_type, predecessor(3), TOP,   
+     >     comm_rhs, requests(5), error)
+
+      call mpi_isend(out_buffer(ss(0)), b_size(0), 
+     >     dp_type, successor(1),   EAST, 
+     >     comm_rhs, requests(6), error)
+      call mpi_isend(out_buffer(ss(1)), b_size(1), 
+     >     dp_type, predecessor(1), WEST, 
+     >     comm_rhs, requests(7), error)
+      call mpi_isend(out_buffer(ss(2)), b_size(2), 
+     >     dp_type,successor(2),   NORTH, 
+     >     comm_rhs, requests(8), error)
+      call mpi_isend(out_buffer(ss(3)), b_size(3), 
+     >     dp_type,predecessor(2), SOUTH, 
+     >     comm_rhs, requests(9), error)
+      call mpi_isend(out_buffer(ss(4)), b_size(4), 
+     >     dp_type,successor(3),   TOP, 
+     >     comm_rhs,   requests(10), error)
+      call mpi_isend(out_buffer(ss(5)), b_size(5), 
+     >     dp_type,predecessor(3), BOTTOM, 
+     >     comm_rhs,requests(11), error)
+
+
+      call mpi_waitall(12, requests, statuses, error)
+
+c---------------------------------------------------------------------
+c     unpack the data that has just been received;             
+c---------------------------------------------------------------------
+      p0 = 0
+      p1 = 0
+      p2 = 0
+      p3 = 0
+      p4 = 0
+      p5 = 0
+
+      do   c = 1, ncells
+
+         if (cell_coord(1,c) .ne. 1) then
+            do   k = 0, cell_size(3,c)-1
+               do   j = 0, cell_size(2,c)-1
+                  do   i = -2, -1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(1)+p0)
+                        p0 = p0 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+         if (cell_coord(1,c) .ne. ncells) then
+            do  k = 0, cell_size(3,c)-1
+               do  j = 0, cell_size(2,c)-1
+                  do  i = cell_size(1,c), cell_size(1,c)+1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(0)+p1)
+                        p1 = p1 + 1
+                     end do
+                  end do
+               end do
+            end do
+         end if
+            
+         if (cell_coord(2,c) .ne. 1) then
+            do  k = 0, cell_size(3,c)-1
+               do   j = -2, -1
+                  do  i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(3)+p2)
+                        p2 = p2 + 1
+                     end do
+                  end do
+               end do
+            end do
+
+         endif
+            
+         if (cell_coord(2,c) .ne. ncells) then
+            do  k = 0, cell_size(3,c)-1
+               do   j = cell_size(2,c), cell_size(2,c)+1
+                  do  i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(2)+p3)
+                        p3 = p3 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+         if (cell_coord(3,c) .ne. 1) then
+            do  k = -2, -1
+               do  j = 0, cell_size(2,c)-1
+                  do  i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(5)+p4)
+                        p4 = p4 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+         if (cell_coord(3,c) .ne. ncells) then
+            do  k = cell_size(3,c), cell_size(3,c)+1
+               do  j = 0, cell_size(2,c)-1
+                  do  i = 0, cell_size(1,c)-1
+                     do   m = 1, 5
+                        u(m,i,j,k,c) = in_buffer(sr(4)+p5)
+                        p5 = p5 + 1
+                     end do
+                  end do
+               end do
+            end do
+         endif
+
+c---------------------------------------------------------------------
+c     cells loop
+c---------------------------------------------------------------------
+      end do
+
+c---------------------------------------------------------------------
+c     do the rest of the rhs that uses the copied face values          
+c---------------------------------------------------------------------
+      call compute_rhs
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/define.f b/examples/smpi/NAS/BT/define.f
new file mode 100644 (file)
index 0000000..03c4c6e
--- /dev/null
@@ -0,0 +1,64 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine compute_buffer_size(dim)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer  c, dim, face_size
+
+      if (ncells .eq. 1) return
+
+c---------------------------------------------------------------------
+c     compute the actual sizes of the buffers; note that there is 
+c     always one cell face that doesn't need buffer space, because it 
+c     is at the boundary of the grid
+c---------------------------------------------------------------------
+      west_size = 0
+      east_size = 0
+
+      do   c = 1, ncells
+         face_size = cell_size(2,c) * cell_size(3,c) * dim * 2
+         if (cell_coord(1,c).ne.1) west_size = west_size + face_size
+         if (cell_coord(1,c).ne.ncells) east_size = east_size + 
+     >        face_size 
+      end do
+
+      north_size = 0
+      south_size = 0
+      do   c = 1, ncells
+         face_size = cell_size(1,c)*cell_size(3,c) * dim * 2
+         if (cell_coord(2,c).ne.1) south_size = south_size + face_size
+         if (cell_coord(2,c).ne.ncells) north_size = north_size + 
+     >        face_size 
+      end do
+
+      top_size = 0
+      bottom_size = 0
+      do   c = 1, ncells
+         face_size = cell_size(1,c) * cell_size(2,c) * dim * 2
+         if (cell_coord(3,c).ne.1) bottom_size = bottom_size + 
+     >        face_size
+         if (cell_coord(3,c).ne.ncells) top_size = top_size +
+     >        face_size     
+      end do
+
+      start_send_west   = 1
+      start_send_east   = start_send_west   + west_size
+      start_send_south  = start_send_east   + east_size
+      start_send_north  = start_send_south  + south_size
+      start_send_bottom = start_send_north  + north_size
+      start_send_top    = start_send_bottom + bottom_size
+      start_recv_west   = 1
+      start_recv_east   = start_recv_west   + west_size
+      start_recv_south  = start_recv_east   + east_size
+      start_recv_north  = start_recv_south  + south_size
+      start_recv_bottom = start_recv_north  + north_size
+      start_recv_top    = start_recv_bottom + bottom_size
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/epio.f b/examples/smpi/NAS/BT/epio.f
new file mode 100644 (file)
index 0000000..52b6309
--- /dev/null
@@ -0,0 +1,165 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      character*(128) newfilenm
+      integer m
+
+      if (node .lt. 10000) then
+          write (newfilenm, 996) filenm,node
+      else
+          print *, 'error generating file names (> 10000 nodes)'
+          stop
+      endif
+
+996   format (a,'.',i4.4)
+
+      open (unit=99, file=newfilenm, form='unformatted',
+     $       status='unknown')
+
+      do m = 1, 5
+         xce_sub(m) = 0.d0
+      end do
+
+      idump_sub = 0
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ix, iio, jio, kio, cio, aio
+
+      do cio=1,ncells
+          write(99)
+     $         ((((u(aio,ix, jio,kio,cio),aio=1,5),
+     $             ix=0, cell_size(1,cio)-1),
+     $             jio=0, cell_size(2,cio)-1),
+     $             kio=0, cell_size(3,cio)-1)
+      enddo
+
+      idump_sub = idump_sub + 1
+      if (rd_interval .gt. 0) then
+         if (idump_sub .ge. rd_interval) then
+
+            rewind(99)
+            call acc_sub_norms(idump+1)
+
+            rewind(99)
+            idump_sub = 0
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine acc_sub_norms(idump_cur)
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer idump_cur
+
+      integer ix, jio, kio, cio, ii, m, ichunk
+      double precision xce_single(5)
+
+      ichunk = idump_cur - idump_sub + 1
+      do ii=0, idump_sub-1
+        do cio=1,ncells
+          read(99)
+     $         ((((u(m,ix, jio,kio,cio),m=1,5),
+     $             ix=0, cell_size(1,cio)-1),
+     $             jio=0, cell_size(2,cio)-1),
+     $             kio=0, cell_size(3,cio)-1)
+        enddo
+
+        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+        call error_norm(xce_single)
+        do m = 1, 5
+           xce_sub(m) = xce_sub(m) + xce_single(m)
+        end do
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      close(unit=99)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      double precision xce_acc(5)
+
+      character*(128) newfilenm
+      integer m
+
+      if (rd_interval .gt. 0) goto 20
+
+      if (node .lt. 10000) then
+          write (newfilenm, 996) filenm,node
+      else
+          print *, 'error generating file names (> 10000 nodes)'
+          stop
+      endif
+
+996   format (a,'.',i4.4)
+
+      open (unit=99, file=newfilenm,
+     $      form='unformatted')
+
+c     clear the last time step
+
+      call clear_timestep
+
+c     read back the time steps and accumulate norms
+
+      call acc_sub_norms(idump)
+
+      close(unit=99)
+
+ 20   continue
+      do m = 1, 5
+         xce_acc(m) = xce_sub(m) / dble(idump)
+      end do
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/error.f b/examples/smpi/NAS/BT/error.f
new file mode 100644 (file)
index 0000000..147a582
--- /dev/null
@@ -0,0 +1,106 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine error_norm(rms)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     this function computes the norm of the difference between the
+c     computed solution and the exact solution
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer c, i, j, k, m, ii, jj, kk, d, error
+      double precision xi, eta, zeta, u_exact(5), rms(5), rms_work(5),
+     >     add
+
+      do m = 1, 5 
+         rms_work(m) = 0.0d0
+      enddo
+
+      do c = 1, ncells
+         kk = 0
+         do k = cell_low(3,c), cell_high(3,c)
+            zeta = dble(k) * dnzm1
+            jj = 0
+            do j = cell_low(2,c), cell_high(2,c)
+               eta = dble(j) * dnym1
+               ii = 0
+               do i = cell_low(1,c), cell_high(1,c)
+                  xi = dble(i) * dnxm1
+                  call exact_solution(xi, eta, zeta, u_exact)
+
+                  do m = 1, 5
+                     add = u(m,ii,jj,kk,c)-u_exact(m)
+                     rms_work(m) = rms_work(m) + add*add
+                  enddo
+                  ii = ii + 1
+               enddo
+               jj = jj + 1
+            enddo
+            kk = kk + 1
+         enddo
+      enddo
+
+      call mpi_allreduce(rms_work, rms, 5, dp_type, 
+     >     MPI_SUM, comm_setup, error)
+
+      do m = 1, 5
+         do d = 1, 3
+            rms(m) = rms(m) / dble(grid_points(d)-2)
+         enddo
+         rms(m) = dsqrt(rms(m))
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine rhs_norm(rms)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer c, i, j, k, d, m, error
+      double precision rms(5), rms_work(5), add
+
+      do m = 1, 5
+         rms_work(m) = 0.0d0
+      enddo 
+
+      do c = 1, ncells
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     add = rhs(m,i,j,k,c)
+                     rms_work(m) = rms_work(m) + add*add
+                  enddo 
+               enddo 
+            enddo 
+         enddo 
+      enddo 
+
+      call mpi_allreduce(rms_work, rms, 5, dp_type, 
+     >     MPI_SUM, comm_setup, error)
+
+      do m = 1, 5
+         do d = 1, 3
+            rms(m) = rms(m) / dble(grid_points(d)-2)
+         enddo 
+         rms(m) = dsqrt(rms(m))
+      enddo 
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/exact_rhs.f b/examples/smpi/NAS/BT/exact_rhs.f
new file mode 100644 (file)
index 0000000..26a2871
--- /dev/null
@@ -0,0 +1,360 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine exact_rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     compute the right hand side based on exact solution
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      double precision dtemp(5), xi, eta, zeta, dtpp
+      integer          c, m, i, j, k, ip1, im1, jp1, 
+     >     jm1, km1, kp1
+
+
+c---------------------------------------------------------------------
+c     loop over all cells owned by this node                   
+c---------------------------------------------------------------------
+      do c = 1, ncells
+
+c---------------------------------------------------------------------
+c     initialize                                  
+c---------------------------------------------------------------------
+         do k= 0, cell_size(3,c)-1
+            do j = 0, cell_size(2,c)-1
+               do i = 0, cell_size(1,c)-1
+                  do m = 1, 5
+                     forcing(m,i,j,k,c) = 0.0d0
+                  enddo
+               enddo
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     xi-direction flux differences                      
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            zeta = dble(k+cell_low(3,c)) * dnzm1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               eta = dble(j+cell_low(2,c)) * dnym1
+
+               do i=-2*(1-start(1,c)), cell_size(1,c)+1-2*end(1,c)
+                  xi = dble(i+cell_low(1,c)) * dnxm1
+
+                  call exact_solution(xi, eta, zeta, dtemp)
+                  do m = 1, 5
+                     ue(i,m) = dtemp(m)
+                  enddo
+
+                  dtpp = 1.0d0 / dtemp(1)
+
+                  do m = 2, 5
+                     buf(i,m) = dtpp * dtemp(m)
+                  enddo
+
+                  cuf(i)   = buf(i,2) * buf(i,2)
+                  buf(i,1) = cuf(i) + buf(i,3) * buf(i,3) + 
+     >                 buf(i,4) * buf(i,4) 
+                  q(i) = 0.5d0*(buf(i,2)*ue(i,2) + buf(i,3)*ue(i,3) +
+     >                 buf(i,4)*ue(i,4))
+
+               enddo
+               
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  im1 = i-1
+                  ip1 = i+1
+
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+     >                 tx2*( ue(ip1,2)-ue(im1,2) )+
+     >                 dx1tx1*(ue(ip1,1)-2.0d0*ue(i,1)+ue(im1,1))
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tx2 * (
+     >                 (ue(ip1,2)*buf(ip1,2)+c2*(ue(ip1,5)-q(ip1)))-
+     >                 (ue(im1,2)*buf(im1,2)+c2*(ue(im1,5)-q(im1))))+
+     >                 xxcon1*(buf(ip1,2)-2.0d0*buf(i,2)+buf(im1,2))+
+     >                 dx2tx1*( ue(ip1,2)-2.0d0* ue(i,2)+ue(im1,2))
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tx2 * (
+     >                 ue(ip1,3)*buf(ip1,2)-ue(im1,3)*buf(im1,2))+
+     >                 xxcon2*(buf(ip1,3)-2.0d0*buf(i,3)+buf(im1,3))+
+     >                 dx3tx1*( ue(ip1,3)-2.0d0*ue(i,3) +ue(im1,3))
+                  
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tx2*(
+     >                 ue(ip1,4)*buf(ip1,2)-ue(im1,4)*buf(im1,2))+
+     >                 xxcon2*(buf(ip1,4)-2.0d0*buf(i,4)+buf(im1,4))+
+     >                 dx4tx1*( ue(ip1,4)-2.0d0* ue(i,4)+ ue(im1,4))
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tx2*(
+     >                 buf(ip1,2)*(c1*ue(ip1,5)-c2*q(ip1))-
+     >                 buf(im1,2)*(c1*ue(im1,5)-c2*q(im1)))+
+     >                 0.5d0*xxcon3*(buf(ip1,1)-2.0d0*buf(i,1)+
+     >                 buf(im1,1))+
+     >                 xxcon4*(cuf(ip1)-2.0d0*cuf(i)+cuf(im1))+
+     >                 xxcon5*(buf(ip1,5)-2.0d0*buf(i,5)+buf(im1,5))+
+     >                 dx5tx1*( ue(ip1,5)-2.0d0* ue(i,5)+ ue(im1,5))
+               enddo
+
+c---------------------------------------------------------------------
+c     Fourth-order dissipation                         
+c---------------------------------------------------------------------
+               if (start(1,c) .gt. 0) then
+                  do m = 1, 5
+                     i = 1
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (5.0d0*ue(i,m) - 4.0d0*ue(i+1,m) +ue(i+2,m))
+                     i = 2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (-4.0d0*ue(i-1,m) + 6.0d0*ue(i,m) -
+     >                    4.0d0*ue(i+1,m) +       ue(i+2,m))
+                  enddo
+               endif
+
+               do i = start(1,c)*3, cell_size(1,c)-3*end(1,c)-1
+                  do m = 1, 5
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+     >                    (ue(i-2,m) - 4.0d0*ue(i-1,m) +
+     >                    6.0d0*ue(i,m) - 4.0d0*ue(i+1,m) + ue(i+2,m))
+                  enddo
+               enddo
+
+               if (end(1,c) .gt. 0) then
+                  do m = 1, 5
+                     i = cell_size(1,c)-3
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(i-2,m) - 4.0d0*ue(i-1,m) +
+     >                    6.0d0*ue(i,m) - 4.0d0*ue(i+1,m))
+                     i = cell_size(1,c)-2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(i-2,m) - 4.0d0*ue(i-1,m) + 5.0d0*ue(i,m))
+                  enddo
+               endif
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     eta-direction flux differences             
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1          
+            zeta = dble(k+cell_low(3,c)) * dnzm1
+            do i=start(1,c), cell_size(1,c)-end(1,c)-1
+               xi = dble(i+cell_low(1,c)) * dnxm1
+
+               do j=-2*(1-start(2,c)), cell_size(2,c)+1-2*end(2,c)
+                  eta = dble(j+cell_low(2,c)) * dnym1
+
+                  call exact_solution(xi, eta, zeta, dtemp)
+                  do m = 1, 5 
+                     ue(j,m) = dtemp(m)
+                  enddo
+                  
+                  dtpp = 1.0d0/dtemp(1)
+
+                  do m = 2, 5
+                     buf(j,m) = dtpp * dtemp(m)
+                  enddo
+
+                  cuf(j)   = buf(j,3) * buf(j,3)
+                  buf(j,1) = cuf(j) + buf(j,2) * buf(j,2) + 
+     >                 buf(j,4) * buf(j,4)
+                  q(j) = 0.5d0*(buf(j,2)*ue(j,2) + buf(j,3)*ue(j,3) +
+     >                 buf(j,4)*ue(j,4))
+               enddo
+
+               do j = start(2,c), cell_size(2,c)-end(2,c)-1
+                  jm1 = j-1
+                  jp1 = j+1
+                  
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+     >                 ty2*( ue(jp1,3)-ue(jm1,3) )+
+     >                 dy1ty1*(ue(jp1,1)-2.0d0*ue(j,1)+ue(jm1,1))
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - ty2*(
+     >                 ue(jp1,2)*buf(jp1,3)-ue(jm1,2)*buf(jm1,3))+
+     >                 yycon2*(buf(jp1,2)-2.0d0*buf(j,2)+buf(jm1,2))+
+     >                 dy2ty1*( ue(jp1,2)-2.0* ue(j,2)+ ue(jm1,2))
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - ty2*(
+     >                 (ue(jp1,3)*buf(jp1,3)+c2*(ue(jp1,5)-q(jp1)))-
+     >                 (ue(jm1,3)*buf(jm1,3)+c2*(ue(jm1,5)-q(jm1))))+
+     >                 yycon1*(buf(jp1,3)-2.0d0*buf(j,3)+buf(jm1,3))+
+     >                 dy3ty1*( ue(jp1,3)-2.0d0*ue(j,3) +ue(jm1,3))
+
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - ty2*(
+     >                 ue(jp1,4)*buf(jp1,3)-ue(jm1,4)*buf(jm1,3))+
+     >                 yycon2*(buf(jp1,4)-2.0d0*buf(j,4)+buf(jm1,4))+
+     >                 dy4ty1*( ue(jp1,4)-2.0d0*ue(j,4)+ ue(jm1,4))
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - ty2*(
+     >                 buf(jp1,3)*(c1*ue(jp1,5)-c2*q(jp1))-
+     >                 buf(jm1,3)*(c1*ue(jm1,5)-c2*q(jm1)))+
+     >                 0.5d0*yycon3*(buf(jp1,1)-2.0d0*buf(j,1)+
+     >                 buf(jm1,1))+
+     >                 yycon4*(cuf(jp1)-2.0d0*cuf(j)+cuf(jm1))+
+     >                 yycon5*(buf(jp1,5)-2.0d0*buf(j,5)+buf(jm1,5))+
+     >                 dy5ty1*(ue(jp1,5)-2.0d0*ue(j,5)+ue(jm1,5))
+               enddo
+
+c---------------------------------------------------------------------
+c     Fourth-order dissipation                      
+c---------------------------------------------------------------------
+               if (start(2,c) .gt. 0) then
+                  do m = 1, 5
+                     j = 1
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (5.0d0*ue(j,m) - 4.0d0*ue(j+1,m) +ue(j+2,m))
+                     j = 2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (-4.0d0*ue(j-1,m) + 6.0d0*ue(j,m) -
+     >                    4.0d0*ue(j+1,m) +       ue(j+2,m))
+                  enddo
+               endif
+
+               do j = start(2,c)*3, cell_size(2,c)-3*end(2,c)-1
+                  do m = 1, 5
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+     >                    (ue(j-2,m) - 4.0d0*ue(j-1,m) +
+     >                    6.0d0*ue(j,m) - 4.0d0*ue(j+1,m) + ue(j+2,m))
+                  enddo
+               enddo
+
+               if (end(2,c) .gt. 0) then
+                  do m = 1, 5
+                     j = cell_size(2,c)-3
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(j-2,m) - 4.0d0*ue(j-1,m) +
+     >                    6.0d0*ue(j,m) - 4.0d0*ue(j+1,m))
+                     j = cell_size(2,c)-2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(j-2,m) - 4.0d0*ue(j-1,m) + 5.0d0*ue(j,m))
+
+                  enddo
+               endif
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     zeta-direction flux differences                      
+c---------------------------------------------------------------------
+         do j=start(2,c), cell_size(2,c)-end(2,c)-1
+            eta = dble(j+cell_low(2,c)) * dnym1
+            do i = start(1,c), cell_size(1,c)-end(1,c)-1
+               xi = dble(i+cell_low(1,c)) * dnxm1
+
+               do k=-2*(1-start(3,c)), cell_size(3,c)+1-2*end(3,c)
+                  zeta = dble(k+cell_low(3,c)) * dnzm1
+
+                  call exact_solution(xi, eta, zeta, dtemp)
+                  do m = 1, 5
+                     ue(k,m) = dtemp(m)
+                  enddo
+
+                  dtpp = 1.0d0/dtemp(1)
+
+                  do m = 2, 5
+                     buf(k,m) = dtpp * dtemp(m)
+                  enddo
+
+                  cuf(k)   = buf(k,4) * buf(k,4)
+                  buf(k,1) = cuf(k) + buf(k,2) * buf(k,2) + 
+     >                 buf(k,3) * buf(k,3)
+                  q(k) = 0.5d0*(buf(k,2)*ue(k,2) + buf(k,3)*ue(k,3) +
+     >                 buf(k,4)*ue(k,4))
+               enddo
+
+               do k=start(3,c), cell_size(3,c)-end(3,c)-1
+                  km1 = k-1
+                  kp1 = k+1
+                  
+                  forcing(1,i,j,k,c) = forcing(1,i,j,k,c) -
+     >                 tz2*( ue(kp1,4)-ue(km1,4) )+
+     >                 dz1tz1*(ue(kp1,1)-2.0d0*ue(k,1)+ue(km1,1))
+
+                  forcing(2,i,j,k,c) = forcing(2,i,j,k,c) - tz2 * (
+     >                 ue(kp1,2)*buf(kp1,4)-ue(km1,2)*buf(km1,4))+
+     >                 zzcon2*(buf(kp1,2)-2.0d0*buf(k,2)+buf(km1,2))+
+     >                 dz2tz1*( ue(kp1,2)-2.0d0* ue(k,2)+ ue(km1,2))
+
+                  forcing(3,i,j,k,c) = forcing(3,i,j,k,c) - tz2 * (
+     >                 ue(kp1,3)*buf(kp1,4)-ue(km1,3)*buf(km1,4))+
+     >                 zzcon2*(buf(kp1,3)-2.0d0*buf(k,3)+buf(km1,3))+
+     >                 dz3tz1*(ue(kp1,3)-2.0d0*ue(k,3)+ue(km1,3))
+
+                  forcing(4,i,j,k,c) = forcing(4,i,j,k,c) - tz2 * (
+     >                 (ue(kp1,4)*buf(kp1,4)+c2*(ue(kp1,5)-q(kp1)))-
+     >                 (ue(km1,4)*buf(km1,4)+c2*(ue(km1,5)-q(km1))))+
+     >                 zzcon1*(buf(kp1,4)-2.0d0*buf(k,4)+buf(km1,4))+
+     >                 dz4tz1*( ue(kp1,4)-2.0d0*ue(k,4) +ue(km1,4))
+
+                  forcing(5,i,j,k,c) = forcing(5,i,j,k,c) - tz2 * (
+     >                 buf(kp1,4)*(c1*ue(kp1,5)-c2*q(kp1))-
+     >                 buf(km1,4)*(c1*ue(km1,5)-c2*q(km1)))+
+     >                 0.5d0*zzcon3*(buf(kp1,1)-2.0d0*buf(k,1)
+     >                 +buf(km1,1))+
+     >                 zzcon4*(cuf(kp1)-2.0d0*cuf(k)+cuf(km1))+
+     >                 zzcon5*(buf(kp1,5)-2.0d0*buf(k,5)+buf(km1,5))+
+     >                 dz5tz1*( ue(kp1,5)-2.0d0*ue(k,5)+ ue(km1,5))
+               enddo
+
+c---------------------------------------------------------------------
+c     Fourth-order dissipation                        
+c---------------------------------------------------------------------
+               if (start(3,c) .gt. 0) then
+                  do m = 1, 5
+                     k = 1
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (5.0d0*ue(k,m) - 4.0d0*ue(k+1,m) +ue(k+2,m))
+                     k = 2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (-4.0d0*ue(k-1,m) + 6.0d0*ue(k,m) -
+     >                    4.0d0*ue(k+1,m) +       ue(k+2,m))
+                  enddo
+               endif
+
+               do k = start(3,c)*3, cell_size(3,c)-3*end(3,c)-1
+                  do m = 1, 5
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp*
+     >                    (ue(k-2,m) - 4.0d0*ue(k-1,m) +
+     >                    6.0d0*ue(k,m) - 4.0d0*ue(k+1,m) + ue(k+2,m))
+                  enddo
+               enddo
+
+               if (end(3,c) .gt. 0) then
+                  do m = 1, 5
+                     k = cell_size(3,c)-3
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(k-2,m) - 4.0d0*ue(k-1,m) +
+     >                    6.0d0*ue(k,m) - 4.0d0*ue(k+1,m))
+                     k = cell_size(3,c)-2
+                     forcing(m,i,j,k,c) = forcing(m,i,j,k,c) - dssp *
+     >                    (ue(k-2,m) - 4.0d0*ue(k-1,m) + 5.0d0*ue(k,m))
+                  enddo
+               endif
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     now change the sign of the forcing function, 
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     forcing(m,i,j,k,c) = -1.d0 * forcing(m,i,j,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+
+      enddo
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/exact_solution.f b/examples/smpi/NAS/BT/exact_solution.f
new file mode 100644 (file)
index 0000000..b093b46
--- /dev/null
@@ -0,0 +1,29 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine exact_solution(xi,eta,zeta,dtemp)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     this function returns the exact solution at point xi, eta, zeta  
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      double precision  xi, eta, zeta, dtemp(5)
+      integer m
+
+      do m = 1, 5
+         dtemp(m) =  ce(m,1) +
+     >     xi*(ce(m,2) + xi*(ce(m,5) + xi*(ce(m,8) + xi*ce(m,11)))) +
+     >     eta*(ce(m,3) + eta*(ce(m,6) + eta*(ce(m,9) + eta*ce(m,12))))+
+     >     zeta*(ce(m,4) + zeta*(ce(m,7) + zeta*(ce(m,10) + 
+     >     zeta*ce(m,13))))
+      enddo
+
+      return
+      end
+
+
diff --git a/examples/smpi/NAS/BT/fortran_io.f b/examples/smpi/NAS/BT/fortran_io.f
new file mode 100644 (file)
index 0000000..d3085a0
--- /dev/null
@@ -0,0 +1,174 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      character*(128) newfilenm
+      integer m, ierr
+
+      if (node.eq.root) record_length = 40/fortran_rec_sz
+      call mpi_bcast(record_length, 1, MPI_INTEGER,
+     >                root, comm_setup, ierr)
+
+      open (unit=99, file=filenm,
+     $      form='unformatted', access='direct',
+     $      recl=record_length)
+
+      do m = 1, 5
+         xce_sub(m) = 0.d0
+      end do
+
+      idump_sub = 0
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ix, jio, kio, cio
+
+      do cio=1,ncells
+          do kio=0, cell_size(3,cio)-1
+              do jio=0, cell_size(2,cio)-1
+                  iseek=(cell_low(1,cio) +
+     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+     $                   PROBLEM_SIZE*idump_sub)))
+
+                  do ix=0,cell_size(1,cio)-1
+                      write(99, rec=iseek+ix+1)
+     $                      u(1,ix, jio,kio,cio),
+     $                      u(2,ix, jio,kio,cio),
+     $                      u(3,ix, jio,kio,cio),
+     $                      u(4,ix, jio,kio,cio),
+     $                      u(5,ix, jio,kio,cio)
+                  enddo
+              enddo
+          enddo
+      enddo
+
+      idump_sub = idump_sub + 1
+      if (rd_interval .gt. 0) then
+         if (idump_sub .ge. rd_interval) then
+
+            call acc_sub_norms(idump+1)
+
+            idump_sub = 0
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine acc_sub_norms(idump_cur)
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer idump_cur
+
+      integer ix, jio, kio, cio, ii, m, ichunk
+      double precision xce_single(5)
+
+      ichunk = idump_cur - idump_sub + 1
+      do ii=0, idump_sub-1
+        do cio=1,ncells
+          do kio=0, cell_size(3,cio)-1
+              do jio=0, cell_size(2,cio)-1
+                  iseek=(cell_low(1,cio) +
+     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+     $                   PROBLEM_SIZE*ii)))
+
+
+                  do ix=0,cell_size(1,cio)-1
+                      read(99, rec=iseek+ix+1)
+     $                      u(1,ix, jio,kio,cio),
+     $                      u(2,ix, jio,kio,cio),
+     $                      u(3,ix, jio,kio,cio),
+     $                      u(4,ix, jio,kio,cio),
+     $                      u(5,ix, jio,kio,cio)
+                  enddo
+              enddo
+          enddo
+        enddo
+
+        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+        call error_norm(xce_single)
+        do m = 1, 5
+           xce_sub(m) = xce_sub(m) + xce_single(m)
+        end do
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      close(unit=99)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      double precision xce_acc(5)
+      integer m
+
+      if (rd_interval .gt. 0) goto 20
+
+      open (unit=99, file=filenm,
+     $      form='unformatted', access='direct',
+     $      recl=record_length)
+
+c     clear the last time step
+
+      call clear_timestep
+
+c     read back the time steps and accumulate norms
+
+      call acc_sub_norms(idump)
+
+      close(unit=99)
+
+ 20   continue
+      do m = 1, 5
+         xce_acc(m) = xce_sub(m) / dble(idump)
+      end do
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/full_mpiio.f b/examples/smpi/NAS/BT/full_mpiio.f
new file mode 100644 (file)
index 0000000..ecfd41c
--- /dev/null
@@ -0,0 +1,307 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ierr
+      integer mstatus(MPI_STATUS_SIZE)
+      integer sizes(4), starts(4), subsizes(4)
+      integer cell_btype(maxcells), cell_ftype(maxcells)
+      integer cell_blength(maxcells)
+      integer info
+      character*20 cb_nodes, cb_size
+      integer c, m
+      integer cell_disp(maxcells)
+
+       call mpi_bcast(collbuf_nodes, 1, MPI_INTEGER,
+     >                root, comm_setup, ierr)
+
+       call mpi_bcast(collbuf_size, 1, MPI_INTEGER,
+     >                root, comm_setup, ierr)
+
+       if (collbuf_nodes .eq. 0) then
+          info = MPI_INFO_NULL
+       else
+          write (cb_nodes,*) collbuf_nodes
+          write (cb_size,*) collbuf_size
+          call MPI_Info_create(info, ierr)
+          call MPI_Info_set(info, 'cb_nodes', cb_nodes, ierr)
+          call MPI_Info_set(info, 'cb_buffer_size', cb_size, ierr)
+          call MPI_Info_set(info, 'collective_buffering', 'true', ierr)
+       endif
+
+       call MPI_Type_contiguous(5, MPI_DOUBLE_PRECISION,
+     $                          element, ierr)
+       call MPI_Type_commit(element, ierr)
+       call MPI_Type_extent(element, eltext, ierr)
+
+       do  c = 1, ncells
+c
+c Outer array dimensions ar same for every cell
+c
+           sizes(1) = IMAX+4
+           sizes(2) = JMAX+4
+           sizes(3) = KMAX+4
+c
+c 4th dimension is cell number, total of maxcells cells
+c
+           sizes(4) = maxcells
+c
+c Internal dimensions of cells can differ slightly between cells
+c
+           subsizes(1) = cell_size(1, c)
+           subsizes(2) = cell_size(2, c)
+           subsizes(3) = cell_size(3, c)
+c
+c Cell is 4th dimension, 1 cell per cell type to handle varying 
+c cell sub-array sizes
+c
+           subsizes(4) = 1
+
+c
+c type constructors use 0-based start addresses
+c
+           starts(1) = 2 
+           starts(2) = 2
+           starts(3) = 2
+           starts(4) = c-1
+
+c 
+c Create buftype for a cell
+c
+           call MPI_Type_create_subarray(4, sizes, subsizes, 
+     $          starts, MPI_ORDER_FORTRAN, element, 
+     $          cell_btype(c), ierr)
+c
+c block length and displacement for joining cells - 
+c 1 cell buftype per block, cell buftypes have own displacment
+c generated from cell number (4th array dimension)
+c
+           cell_blength(c) = 1
+           cell_disp(c) = 0
+
+       enddo
+c
+c Create combined buftype for all cells
+c
+       call MPI_Type_struct(ncells, cell_blength, cell_disp,
+     $            cell_btype, combined_btype, ierr)
+       call MPI_Type_commit(combined_btype, ierr)
+
+       do  c = 1, ncells
+c
+c Entire array size
+c
+           sizes(1) = PROBLEM_SIZE
+           sizes(2) = PROBLEM_SIZE
+           sizes(3) = PROBLEM_SIZE
+
+c
+c Size of c'th cell
+c
+           subsizes(1) = cell_size(1, c)
+           subsizes(2) = cell_size(2, c)
+           subsizes(3) = cell_size(3, c)
+
+c
+c Starting point in full array of c'th cell
+c
+           starts(1) = cell_low(1,c)
+           starts(2) = cell_low(2,c)
+           starts(3) = cell_low(3,c)
+
+           call MPI_Type_create_subarray(3, sizes, subsizes,
+     $          starts, MPI_ORDER_FORTRAN,
+     $          element, cell_ftype(c), ierr)
+           cell_blength(c) = 1
+           cell_disp(c) = 0
+       enddo
+
+       call MPI_Type_struct(ncells, cell_blength, cell_disp,
+     $            cell_ftype, combined_ftype, ierr)
+       call MPI_Type_commit(combined_ftype, ierr)
+
+       iseek=0
+       if (node .eq. root) then
+          call MPI_File_delete(filenm, MPI_INFO_NULL, ierr)
+       endif
+
+
+      call MPI_Barrier(comm_solve, ierr)
+
+       call MPI_File_open(comm_solve,
+     $          filenm,
+     $          MPI_MODE_RDWR+MPI_MODE_CREATE,
+     $          MPI_INFO_NULL, fp, ierr)
+
+       if (ierr .ne. MPI_SUCCESS) then
+                print *, 'Error opening file'
+                stop
+       endif
+
+        call MPI_File_set_view(fp, iseek, element, 
+     $          combined_ftype, 'native', info, ierr)
+
+       if (ierr .ne. MPI_SUCCESS) then
+                print *, 'Error setting file view'
+                stop
+       endif
+
+      do m = 1, 5
+         xce_sub(m) = 0.d0
+      end do
+
+      idump_sub = 0
+
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer mstatus(MPI_STATUS_SIZE)
+      integer ierr
+
+      call MPI_File_write_at_all(fp, iseek, u,
+     $                           1, combined_btype, mstatus, ierr)
+      if (ierr .ne. MPI_SUCCESS) then
+          print *, 'Error writing to file'
+          stop
+      endif
+
+      call MPI_Type_size(combined_btype, iosize, ierr)
+      iseek = iseek + iosize/eltext
+
+      idump_sub = idump_sub + 1
+      if (rd_interval .gt. 0) then
+         if (idump_sub .ge. rd_interval) then
+
+            iseek = 0
+            call acc_sub_norms(idump+1)
+
+            iseek = 0
+            idump_sub = 0
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine acc_sub_norms(idump_cur)
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer idump_cur
+
+      integer ii, m, ichunk
+      integer ierr
+      integer mstatus(MPI_STATUS_SIZE)
+      double precision xce_single(5)
+
+      ichunk = idump_cur - idump_sub + 1
+      do ii=0, idump_sub-1
+
+        call MPI_File_read_at_all(fp, iseek, u,
+     $                           1, combined_btype, mstatus, ierr)
+        if (ierr .ne. MPI_SUCCESS) then
+           print *, 'Error reading back file'
+           call MPI_File_close(fp, ierr)
+           stop
+        endif
+
+        call MPI_Type_size(combined_btype, iosize, ierr)
+        iseek = iseek + iosize/eltext
+
+        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+        call error_norm(xce_single)
+        do m = 1, 5
+           xce_sub(m) = xce_sub(m) + xce_single(m)
+        end do
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ierr
+
+      call MPI_File_close(fp, ierr)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
+      subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      double precision xce_acc(5)
+      integer m, ierr
+
+      if (rd_interval .gt. 0) goto 20
+
+      call MPI_File_open(comm_solve,
+     $          filenm,
+     $          MPI_MODE_RDONLY,
+     $          MPI_INFO_NULL,
+     $          fp,
+     $          ierr)
+
+      iseek = 0
+      call MPI_File_set_view(fp, iseek, element, combined_ftype,
+     $          'native', MPI_INFO_NULL, ierr)
+
+c     clear the last time step
+
+      call clear_timestep
+
+c     read back the time steps and accumulate norms
+
+      call acc_sub_norms(idump)
+
+      call MPI_File_close(fp, ierr)
+
+ 20   continue
+      do m = 1, 5
+         xce_acc(m) = xce_sub(m) / dble(idump)
+      end do
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/header.h b/examples/smpi/NAS/BT/header.h
new file mode 100644 (file)
index 0000000..47719da
--- /dev/null
@@ -0,0 +1,137 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+c
+c  header.h
+c
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      implicit none
+
+c---------------------------------------------------------------------
+c The following include file is generated automatically by the
+c "setparams" utility. It defines 
+c      maxcells:      the square root of the maximum number of processors
+c      problem_size:  12, 64, 102, 162 (for class T, A, B, C)
+c      dt_default:    default time step for this problem size if no
+c                     config file
+c      niter_default: default number of iterations for this problem size
+c---------------------------------------------------------------------
+
+      include 'npbparams.h'
+
+      integer           aa, bb, cc, BLOCK_SIZE
+      parameter (aa=1, bb=2, cc=3, BLOCK_SIZE=5)
+
+      integer           ncells, grid_points(3)
+      double precision  elapsed_time
+      common /global/   elapsed_time, ncells, grid_points
+
+      double precision  tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3, 
+     >                  dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
+     >                  dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
+     >                  ce(5,13), dxmax, dymax, dzmax, xxcon1, xxcon2, 
+     >                  xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+     >                  dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+     >                  yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+     >                  zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
+     >                  dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
+     >                  dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
+     >                  c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+     >                  dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
+     >                  c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
+     >                  c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
+
+      common /constants/ tx1, tx2, tx3, ty1, ty2, ty3, tz1, tz2, tz3,
+     >                  dx1, dx2, dx3, dx4, dx5, dy1, dy2, dy3, dy4, 
+     >                  dy5, dz1, dz2, dz3, dz4, dz5, dssp, dt, 
+     >                  ce, dxmax, dymax, dzmax, xxcon1, xxcon2, 
+     >                  xxcon3, xxcon4, xxcon5, dx1tx1, dx2tx1, dx3tx1,
+     >                  dx4tx1, dx5tx1, yycon1, yycon2, yycon3, yycon4,
+     >                  yycon5, dy1ty1, dy2ty1, dy3ty1, dy4ty1, dy5ty1,
+     >                  zzcon1, zzcon2, zzcon3, zzcon4, zzcon5, dz1tz1, 
+     >                  dz2tz1, dz3tz1, dz4tz1, dz5tz1, dnxm1, dnym1, 
+     >                  dnzm1, c1c2, c1c5, c3c4, c1345, conz1, c1, c2, 
+     >                  c3, c4, c5, c4dssp, c5dssp, dtdssp, dttx1, bt,
+     >                  dttx2, dtty1, dtty2, dttz1, dttz2, c2dttx1, 
+     >                  c2dtty1, c2dttz1, comz1, comz4, comz5, comz6, 
+     >                  c3c4tx3, c3c4ty3, c3c4tz3, c2iv, con43, con16
+
+      integer           EAST, WEST, NORTH, SOUTH, 
+     >                  BOTTOM, TOP
+
+      parameter (EAST=2000, WEST=3000,      NORTH=4000, SOUTH=5000,
+     >           BOTTOM=6000, TOP=7000)
+
+      integer cell_coord (3,maxcells), cell_low (3,maxcells), 
+     >        cell_high  (3,maxcells), cell_size(3,maxcells),
+     >        predecessor(3),          slice    (3,maxcells),
+     >        grid_size  (3),          successor(3)         ,
+     >        start      (3,maxcells), end      (3,maxcells)
+      common /partition/ cell_coord, cell_low, cell_high, cell_size,
+     >                   grid_size, successor, predecessor, slice,
+     >                   start, end
+
+      integer IMAX, JMAX, KMAX, MAX_CELL_DIM, BUF_SIZE
+
+      parameter (MAX_CELL_DIM = (problem_size/maxcells)+1)
+
+      parameter (IMAX=MAX_CELL_DIM,JMAX=MAX_CELL_DIM,KMAX=MAX_CELL_DIM)
+
+      parameter (BUF_SIZE=MAX_CELL_DIM*MAX_CELL_DIM*(maxcells-1)*60+1)
+
+      double precision 
+     >   us      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   vs      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   ws      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   qs      (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   rho_i   (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   square  (    -1:IMAX,  -1:JMAX,  -1:KMAX,   maxcells),
+     >   forcing (5,   0:IMAX-1, 0:JMAX-1, 0:KMAX-1, maxcells),
+     >   u       (5,  -2:IMAX+1,-2:JMAX+1,-2:KMAX+1, maxcells),
+     >   rhs     (5,  -1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells),
+     >   lhsc    (5,5,-1:IMAX-1,-1:JMAX-1,-1:KMAX-1, maxcells),
+     >   backsub_info (5, 0:MAX_CELL_DIM, 0:MAX_CELL_DIM, maxcells),
+     >   in_buffer(BUF_SIZE), out_buffer(BUF_SIZE)
+      common /fields/  u, us, vs, ws, qs, rho_i, square, 
+     >                 rhs, forcing, lhsc, in_buffer, out_buffer,
+     >                 backsub_info
+
+      double precision cv(-2:MAX_CELL_DIM+1),   rhon(-2:MAX_CELL_DIM+1),
+     >                 rhos(-2:MAX_CELL_DIM+1), rhoq(-2:MAX_CELL_DIM+1),
+     >                 cuf(-2:MAX_CELL_DIM+1),  q(-2:MAX_CELL_DIM+1),
+     >                 ue(-2:MAX_CELL_DIM+1,5), buf(-2:MAX_CELL_DIM+1,5)
+      common /work_1d/ cv, rhon, rhos, rhoq, cuf, q, ue, buf
+
+      integer  west_size, east_size, bottom_size, top_size,
+     >         north_size, south_size, start_send_west, 
+     >         start_send_east, start_send_south, start_send_north,
+     >         start_send_bottom, start_send_top, start_recv_west,
+     >         start_recv_east, start_recv_south, start_recv_north,
+     >         start_recv_bottom, start_recv_top
+      common /box/ west_size, east_size, bottom_size,
+     >             top_size, north_size, south_size, 
+     >             start_send_west, start_send_east, start_send_south,
+     >             start_send_north, start_send_bottom, start_send_top,
+     >             start_recv_west, start_recv_east, start_recv_south,
+     >             start_recv_north, start_recv_bottom, start_recv_top
+
+      double precision  tmp_block(5,5), b_inverse(5,5), tmp_vec(5)
+      common /work_solve/ tmp_block, b_inverse, tmp_vec
+
+c
+c     These are used by btio
+c
+      integer collbuf_nodes, collbuf_size, iosize, eltext,
+     $        combined_btype, fp, idump, record_length, element,
+     $        combined_ftype, idump_sub, rd_interval
+      common /btio/ collbuf_nodes, collbuf_size, iosize, eltext,
+     $              combined_btype, fp, idump, record_length,
+     $              idump_sub, rd_interval
+      double precision sum(niter_default), xce_sub(5)
+      common /btio/ sum, xce_sub
+      integer*8 iseek
+      common /btio/ iseek, element, combined_ftype
+
+
+
diff --git a/examples/smpi/NAS/BT/initialize.f b/examples/smpi/NAS/BT/initialize.f
new file mode 100644 (file)
index 0000000..274cdb1
--- /dev/null
@@ -0,0 +1,308 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine  initialize
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     This subroutine initializes the field variable u using 
+c     tri-linear transfinite interpolation of the boundary values     
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      
+      integer c, i, j, k, m, ii, jj, kk, ix, iy, iz
+      double precision  xi, eta, zeta, Pface(5,3,2), Pxi, Peta, 
+     >     Pzeta, temp(5)
+
+c---------------------------------------------------------------------
+c  Later (in compute_rhs) we compute 1/u for every element. A few of 
+c  the corner elements are not used, but it convenient (and faster) 
+c  to compute the whole thing with a simple loop. Make sure those 
+c  values are nonzero by initializing the whole thing here. 
+c---------------------------------------------------------------------
+      do c = 1, ncells
+         do kk = -1, KMAX
+            do jj = -1, JMAX
+               do ii = -1, IMAX
+                  do m = 1, 5
+                     u(m, ii, jj, kk, c) = 1.0
+                  end do
+               end do
+            end do
+         end do
+      end do
+c---------------------------------------------------------------------
+
+
+
+c---------------------------------------------------------------------
+c     first store the "interpolated" values everywhere on the grid    
+c---------------------------------------------------------------------
+      do c=1, ncells
+         kk = 0
+         do k = cell_low(3,c), cell_high(3,c)
+            zeta = dble(k) * dnzm1
+            jj = 0
+            do j = cell_low(2,c), cell_high(2,c)
+               eta = dble(j) * dnym1
+               ii = 0
+               do i = cell_low(1,c), cell_high(1,c)
+                  xi = dble(i) * dnxm1
+                  
+                  do ix = 1, 2
+                     call exact_solution(dble(ix-1), eta, zeta, 
+     >                    Pface(1,1,ix))
+                  enddo
+
+                  do iy = 1, 2
+                     call exact_solution(xi, dble(iy-1) , zeta, 
+     >                    Pface(1,2,iy))
+                  enddo
+
+                  do iz = 1, 2
+                     call exact_solution(xi, eta, dble(iz-1),   
+     >                    Pface(1,3,iz))
+                  enddo
+
+                  do m = 1, 5
+                     Pxi   = xi   * Pface(m,1,2) + 
+     >                    (1.0d0-xi)   * Pface(m,1,1)
+                     Peta  = eta  * Pface(m,2,2) + 
+     >                    (1.0d0-eta)  * Pface(m,2,1)
+                     Pzeta = zeta * Pface(m,3,2) + 
+     >                    (1.0d0-zeta) * Pface(m,3,1)
+                     
+                     u(m,ii,jj,kk,c) = Pxi + Peta + Pzeta - 
+     >                    Pxi*Peta - Pxi*Pzeta - Peta*Pzeta + 
+     >                    Pxi*Peta*Pzeta
+
+                  enddo
+                  ii = ii + 1
+               enddo
+               jj = jj + 1
+            enddo
+            kk = kk+1
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     now store the exact values on the boundaries        
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     west face                                                  
+c---------------------------------------------------------------------
+      c = slice(1,1)
+      ii = 0
+      xi = 0.0d0
+      kk = 0
+      do k = cell_low(3,c), cell_high(3,c)
+         zeta = dble(k) * dnzm1
+         jj = 0
+         do j = cell_low(2,c), cell_high(2,c)
+            eta = dble(j) * dnym1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            jj = jj + 1
+         enddo
+         kk = kk + 1
+      enddo
+
+c---------------------------------------------------------------------
+c     east face                                                      
+c---------------------------------------------------------------------
+      c  = slice(1,ncells)
+      ii = cell_size(1,c)-1
+      xi = 1.0d0
+      kk = 0
+      do k = cell_low(3,c), cell_high(3,c)
+         zeta = dble(k) * dnzm1
+         jj = 0
+         do j = cell_low(2,c), cell_high(2,c)
+            eta = dble(j) * dnym1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            jj = jj + 1
+         enddo
+         kk = kk + 1
+      enddo
+
+c---------------------------------------------------------------------
+c     south face                                                 
+c---------------------------------------------------------------------
+      c = slice(2,1)
+      jj = 0
+      eta = 0.0d0
+      kk = 0
+      do k = cell_low(3,c), cell_high(3,c)
+         zeta = dble(k) * dnzm1
+         ii = 0
+         do i = cell_low(1,c), cell_high(1,c)
+            xi = dble(i) * dnxm1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            ii = ii + 1
+         enddo
+         kk = kk + 1
+      enddo
+
+
+c---------------------------------------------------------------------
+c     north face                                    
+c---------------------------------------------------------------------
+      c = slice(2,ncells)
+      jj = cell_size(2,c)-1
+      eta = 1.0d0
+      kk = 0
+      do k = cell_low(3,c), cell_high(3,c)
+         zeta = dble(k) * dnzm1
+         ii = 0
+         do i = cell_low(1,c), cell_high(1,c)
+            xi = dble(i) * dnxm1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            ii = ii + 1
+         enddo
+         kk = kk + 1
+      enddo
+
+c---------------------------------------------------------------------
+c     bottom face                                       
+c---------------------------------------------------------------------
+      c = slice(3,1)
+      kk = 0
+      zeta = 0.0d0
+      jj = 0
+      do j = cell_low(2,c), cell_high(2,c)
+         eta = dble(j) * dnym1
+         ii = 0
+         do i =cell_low(1,c), cell_high(1,c)
+            xi = dble(i) *dnxm1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            ii = ii + 1
+         enddo
+         jj = jj + 1
+      enddo
+
+c---------------------------------------------------------------------
+c     top face     
+c---------------------------------------------------------------------
+      c = slice(3,ncells)
+      kk = cell_size(3,c)-1
+      zeta = 1.0d0
+      jj = 0
+      do j = cell_low(2,c), cell_high(2,c)
+         eta = dble(j) * dnym1
+         ii = 0
+         do i =cell_low(1,c), cell_high(1,c)
+            xi = dble(i) * dnxm1
+            call exact_solution(xi, eta, zeta, temp)
+            do m = 1, 5
+               u(m,ii,jj,kk,c) = temp(m)
+            enddo
+            ii = ii + 1
+         enddo
+         jj = jj + 1
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine lhsinit
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      
+      integer i, j, k, d, c, m, n
+
+c---------------------------------------------------------------------
+c     loop over all cells                                       
+c---------------------------------------------------------------------
+      do c = 1, ncells
+
+c---------------------------------------------------------------------
+c     first, initialize the start and end arrays
+c---------------------------------------------------------------------
+         do d = 1, 3
+            if (cell_coord(d,c) .eq. 1) then
+               start(d,c) = 1
+            else 
+               start(d,c) = 0
+            endif
+            if (cell_coord(d,c) .eq. ncells) then
+               end(d,c) = 1
+            else
+               end(d,c) = 0
+            endif
+         enddo
+
+c---------------------------------------------------------------------
+c     zero the whole left hand side for starters
+c---------------------------------------------------------------------
+         do k = 0, cell_size(3,c)-1
+            do j = 0, cell_size(2,c)-1
+               do i = 0, cell_size(1,c)-1
+                  do m = 1,5
+                     do n = 1, 5
+                        lhsc(m,n,i,j,k,c) = 0.0d0
+                     enddo
+                  enddo
+               enddo
+            enddo
+         enddo
+
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine lhsabinit(lhsa, lhsb, size)
+      implicit none
+
+      integer size
+      double precision lhsa(5, 5, -1:size), lhsb(5, 5, -1:size)
+
+      integer i, m, n
+
+c---------------------------------------------------------------------
+c     next, set all diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+      do i = 0, size
+         do m = 1, 5
+            do n = 1, 5
+               lhsa(m,n,i) = 0.0d0
+               lhsb(m,n,i) = 0.0d0
+            enddo
+            lhsb(m,m,i) = 1.0d0
+         enddo
+      enddo
+
+      return
+      end
+
+
+
diff --git a/examples/smpi/NAS/BT/inputbt.data.sample b/examples/smpi/NAS/BT/inputbt.data.sample
new file mode 100644 (file)
index 0000000..776654e
--- /dev/null
@@ -0,0 +1,5 @@
+200       number of time steps
+0.0008d0  dt for class A = 0.0008d0. class B = 0.0003d0  class C = 0.0001d0
+64 64 64
+5 0        write interval (optional read interval) for BTIO
+0 1000000  number of nodes in collective buffering and buffer size for BTIO
diff --git a/examples/smpi/NAS/BT/make_set.f b/examples/smpi/NAS/BT/make_set.f
new file mode 100644 (file)
index 0000000..b8d90c6
--- /dev/null
@@ -0,0 +1,124 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine make_set
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     This function allocates space for a set of cells and fills the set     
+c     such that communication between cells on different nodes is only
+c     nearest neighbor                                                   
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+
+      integer p, i, j, c, dir, size, excess, ierr,ierrcode
+
+c---------------------------------------------------------------------
+c     compute square root; add small number to allow for roundoff
+c     (note: this is computed in setup_mpi.f also, but prefer to do
+c     it twice because of some include file problems).
+c---------------------------------------------------------------------
+      ncells = dint(dsqrt(dble(no_nodes) + 0.00001d0))
+
+c---------------------------------------------------------------------
+c     this makes coding easier
+c---------------------------------------------------------------------
+      p = ncells
+      
+c---------------------------------------------------------------------
+c     determine the location of the cell at the bottom of the 3D 
+c     array of cells
+c---------------------------------------------------------------------
+      cell_coord(1,1) = mod(node,p) 
+      cell_coord(2,1) = node/p 
+      cell_coord(3,1) = 0
+
+c---------------------------------------------------------------------
+c     set the cell_coords for cells in the rest of the z-layers; 
+c     this comes down to a simple linear numbering in the z-direct-
+c     ion, and to the doubly-cyclic numbering in the other dirs     
+c---------------------------------------------------------------------
+      do c=2, p
+         cell_coord(1,c) = mod(cell_coord(1,c-1)+1,p) 
+         cell_coord(2,c) = mod(cell_coord(2,c-1)-1+p,p) 
+         cell_coord(3,c) = c-1
+      end do
+
+c---------------------------------------------------------------------
+c     offset all the coordinates by 1 to adjust for Fortran arrays
+c---------------------------------------------------------------------
+      do dir = 1, 3
+         do c = 1, p
+            cell_coord(dir,c) = cell_coord(dir,c) + 1
+         end do
+      end do
+      
+c---------------------------------------------------------------------
+c     slice(dir,n) contains the sequence number of the cell that is in
+c     coordinate plane n in the dir direction
+c---------------------------------------------------------------------
+      do dir = 1, 3
+         do c = 1, p
+            slice(dir,cell_coord(dir,c)) = c
+         end do
+      end do
+
+
+c---------------------------------------------------------------------
+c     fill the predecessor and successor entries, using the indices 
+c     of the bottom cells (they are the same at each level of k 
+c     anyway) acting as if full periodicity pertains; note that p is
+c     added to those arguments to the mod functions that might
+c     otherwise return wrong values when using the modulo function
+c---------------------------------------------------------------------
+      i = cell_coord(1,1)-1
+      j = cell_coord(2,1)-1
+
+      predecessor(1) = mod(i-1+p,p) + p*j
+      predecessor(2) = i + p*mod(j-1+p,p)
+      predecessor(3) = mod(i+1,p) + p*mod(j-1+p,p)
+      successor(1)   = mod(i+1,p) + p*j
+      successor(2)   = i + p*mod(j+1,p)
+      successor(3)   = mod(i-1+p,p) + p*mod(j+1,p)
+
+c---------------------------------------------------------------------
+c     now compute the sizes of the cells                                    
+c---------------------------------------------------------------------
+      do dir= 1, 3
+c---------------------------------------------------------------------
+c     set cell_coord range for each direction                            
+c---------------------------------------------------------------------
+         size   = grid_points(dir)/p
+         excess = mod(grid_points(dir),p)
+         do c=1, ncells
+            if (cell_coord(dir,c) .le. excess) then
+               cell_size(dir,c) = size+1
+               cell_low(dir,c) = (cell_coord(dir,c)-1)*(size+1)
+               cell_high(dir,c) = cell_low(dir,c)+size
+            else 
+               cell_size(dir,c) = size
+               cell_low(dir,c)  = excess*(size+1)+
+     >              (cell_coord(dir,c)-excess-1)*size
+               cell_high(dir,c) = cell_low(dir,c)+size-1
+            endif
+            if (cell_size(dir, c) .le. 2) then
+               write(*,50)
+ 50            format(' Error: Cell size too small. Min size is 3')
+               call MPI_Abort(mpi_comm_world,ierrcode,ierr)
+               stop
+            endif
+         end do
+      end do
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+
diff --git a/examples/smpi/NAS/BT/mpinpb.h b/examples/smpi/NAS/BT/mpinpb.h
new file mode 100644 (file)
index 0000000..f621f08
--- /dev/null
@@ -0,0 +1,12 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'mpif.h'
+
+      integer           node, no_nodes, total_nodes, root, comm_setup, 
+     >                  comm_solve, comm_rhs, dp_type
+      logical           active
+      common /mpistuff/ node, no_nodes, total_nodes, root, comm_setup, 
+     >                  comm_solve, comm_rhs, dp_type, active
+
diff --git a/examples/smpi/NAS/BT/rhs.f b/examples/smpi/NAS/BT/rhs.f
new file mode 100644 (file)
index 0000000..89171a6
--- /dev/null
@@ -0,0 +1,425 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine compute_rhs
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer c, i, j, k, m
+      double precision rho_inv, uijk, up1, um1, vijk, vp1, vm1,
+     >     wijk, wp1, wm1
+
+
+c---------------------------------------------------------------------
+c     loop over all cells owned by this node                           
+c---------------------------------------------------------------------
+      do c = 1, ncells
+
+c---------------------------------------------------------------------
+c     compute the reciprocal of density, and the kinetic energy, 
+c     and the speed of sound.
+c---------------------------------------------------------------------
+         do k = -1, cell_size(3,c)
+            do j = -1, cell_size(2,c)
+               do i = -1, cell_size(1,c)
+                  rho_inv = 1.0d0/u(1,i,j,k,c)
+                  rho_i(i,j,k,c) = rho_inv
+                  us(i,j,k,c) = u(2,i,j,k,c) * rho_inv
+                  vs(i,j,k,c) = u(3,i,j,k,c) * rho_inv
+                  ws(i,j,k,c) = u(4,i,j,k,c) * rho_inv
+                  square(i,j,k,c)     = 0.5d0* (
+     >                 u(2,i,j,k,c)*u(2,i,j,k,c) + 
+     >                 u(3,i,j,k,c)*u(3,i,j,k,c) +
+     >                 u(4,i,j,k,c)*u(4,i,j,k,c) ) * rho_inv
+                  qs(i,j,k,c) = square(i,j,k,c) * rho_inv
+               enddo
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c copy the exact forcing term to the right hand side;  because 
+c this forcing term is known, we can store it on the whole of every 
+c cell,  including the boundary                   
+c---------------------------------------------------------------------
+
+         do k = 0, cell_size(3,c)-1
+            do j = 0, cell_size(2,c)-1
+               do i = 0, cell_size(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = forcing(m,i,j,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+
+
+c---------------------------------------------------------------------
+c     compute xi-direction fluxes 
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  uijk = us(i,j,k,c)
+                  up1  = us(i+1,j,k,c)
+                  um1  = us(i-1,j,k,c)
+
+                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dx1tx1 * 
+     >                 (u(1,i+1,j,k,c) - 2.0d0*u(1,i,j,k,c) + 
+     >                 u(1,i-1,j,k,c)) -
+     >                 tx2 * (u(2,i+1,j,k,c) - u(2,i-1,j,k,c))
+
+                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dx2tx1 * 
+     >                 (u(2,i+1,j,k,c) - 2.0d0*u(2,i,j,k,c) + 
+     >                 u(2,i-1,j,k,c)) +
+     >                 xxcon2*con43 * (up1 - 2.0d0*uijk + um1) -
+     >                 tx2 * (u(2,i+1,j,k,c)*up1 - 
+     >                 u(2,i-1,j,k,c)*um1 +
+     >                 (u(5,i+1,j,k,c)- square(i+1,j,k,c)-
+     >                 u(5,i-1,j,k,c)+ square(i-1,j,k,c))*
+     >                 c2)
+
+                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dx3tx1 * 
+     >                 (u(3,i+1,j,k,c) - 2.0d0*u(3,i,j,k,c) +
+     >                 u(3,i-1,j,k,c)) +
+     >                 xxcon2 * (vs(i+1,j,k,c) - 2.0d0*vs(i,j,k,c) +
+     >                 vs(i-1,j,k,c)) -
+     >                 tx2 * (u(3,i+1,j,k,c)*up1 - 
+     >                 u(3,i-1,j,k,c)*um1)
+
+                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dx4tx1 * 
+     >                 (u(4,i+1,j,k,c) - 2.0d0*u(4,i,j,k,c) +
+     >                 u(4,i-1,j,k,c)) +
+     >                 xxcon2 * (ws(i+1,j,k,c) - 2.0d0*ws(i,j,k,c) +
+     >                 ws(i-1,j,k,c)) -
+     >                 tx2 * (u(4,i+1,j,k,c)*up1 - 
+     >                 u(4,i-1,j,k,c)*um1)
+
+                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dx5tx1 * 
+     >                 (u(5,i+1,j,k,c) - 2.0d0*u(5,i,j,k,c) +
+     >                 u(5,i-1,j,k,c)) +
+     >                 xxcon3 * (qs(i+1,j,k,c) - 2.0d0*qs(i,j,k,c) +
+     >                 qs(i-1,j,k,c)) +
+     >                 xxcon4 * (up1*up1 -       2.0d0*uijk*uijk + 
+     >                 um1*um1) +
+     >                 xxcon5 * (u(5,i+1,j,k,c)*rho_i(i+1,j,k,c) - 
+     >                 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+     >                 u(5,i-1,j,k,c)*rho_i(i-1,j,k,c)) -
+     >                 tx2 * ( (c1*u(5,i+1,j,k,c) - 
+     >                 c2*square(i+1,j,k,c))*up1 -
+     >                 (c1*u(5,i-1,j,k,c) - 
+     >                 c2*square(i-1,j,k,c))*um1 )
+               enddo
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     add fourth order xi-direction dissipation               
+c---------------------------------------------------------------------
+         if (start(1,c) .gt. 0) then
+            do k = start(3,c), cell_size(3,c)-end(3,c)-1
+               do j = start(2,c), cell_size(2,c)-end(2,c)-1
+                  i = 1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
+     >                    ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) +
+     >                    u(m,i+2,j,k,c))
+                  enddo
+
+                  i = 2
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (-4.0d0*u(m,i-1,j,k,c) + 6.0d0*u(m,i,j,k,c) -
+     >                    4.0d0*u(m,i+1,j,k,c) + u(m,i+2,j,k,c))
+                  enddo
+               enddo
+            enddo
+         endif
+
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = 3*start(1,c),cell_size(1,c)-3*end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (  u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) + 
+     >                    6.0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) + 
+     >                    u(m,i+2,j,k,c) )
+                  enddo
+               enddo
+            enddo
+         enddo
+         
+
+         if (end(1,c) .gt. 0) then
+            do k = start(3,c), cell_size(3,c)-end(3,c)-1
+               do j = start(2,c), cell_size(2,c)-end(2,c)-1
+                  i = cell_size(1,c)-3
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i-2,j,k,c) - 4.0d0*u(m,i-1,j,k,c) + 
+     >                    6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i+1,j,k,c) )
+                  enddo
+
+                  i = cell_size(1,c)-2
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i-2,j,k,c) - 4.d0*u(m,i-1,j,k,c) +
+     >                    5.d0*u(m,i,j,k,c) )
+                  enddo
+               enddo
+            enddo
+         endif
+
+c---------------------------------------------------------------------
+c     compute eta-direction fluxes 
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  vijk = vs(i,j,k,c)
+                  vp1  = vs(i,j+1,k,c)
+                  vm1  = vs(i,j-1,k,c)
+                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dy1ty1 * 
+     >                 (u(1,i,j+1,k,c) - 2.0d0*u(1,i,j,k,c) + 
+     >                 u(1,i,j-1,k,c)) -
+     >                 ty2 * (u(3,i,j+1,k,c) - u(3,i,j-1,k,c))
+                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dy2ty1 * 
+     >                 (u(2,i,j+1,k,c) - 2.0d0*u(2,i,j,k,c) + 
+     >                 u(2,i,j-1,k,c)) +
+     >                 yycon2 * (us(i,j+1,k,c) - 2.0d0*us(i,j,k,c) + 
+     >                 us(i,j-1,k,c)) -
+     >                 ty2 * (u(2,i,j+1,k,c)*vp1 - 
+     >                 u(2,i,j-1,k,c)*vm1)
+                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dy3ty1 * 
+     >                 (u(3,i,j+1,k,c) - 2.0d0*u(3,i,j,k,c) + 
+     >                 u(3,i,j-1,k,c)) +
+     >                 yycon2*con43 * (vp1 - 2.0d0*vijk + vm1) -
+     >                 ty2 * (u(3,i,j+1,k,c)*vp1 - 
+     >                 u(3,i,j-1,k,c)*vm1 +
+     >                 (u(5,i,j+1,k,c) - square(i,j+1,k,c) - 
+     >                 u(5,i,j-1,k,c) + square(i,j-1,k,c))
+     >                 *c2)
+                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dy4ty1 * 
+     >                 (u(4,i,j+1,k,c) - 2.0d0*u(4,i,j,k,c) + 
+     >                 u(4,i,j-1,k,c)) +
+     >                 yycon2 * (ws(i,j+1,k,c) - 2.0d0*ws(i,j,k,c) + 
+     >                 ws(i,j-1,k,c)) -
+     >                 ty2 * (u(4,i,j+1,k,c)*vp1 - 
+     >                 u(4,i,j-1,k,c)*vm1)
+                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dy5ty1 * 
+     >                 (u(5,i,j+1,k,c) - 2.0d0*u(5,i,j,k,c) + 
+     >                 u(5,i,j-1,k,c)) +
+     >                 yycon3 * (qs(i,j+1,k,c) - 2.0d0*qs(i,j,k,c) + 
+     >                 qs(i,j-1,k,c)) +
+     >                 yycon4 * (vp1*vp1       - 2.0d0*vijk*vijk + 
+     >                 vm1*vm1) +
+     >                 yycon5 * (u(5,i,j+1,k,c)*rho_i(i,j+1,k,c) - 
+     >                 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+     >                 u(5,i,j-1,k,c)*rho_i(i,j-1,k,c)) -
+     >                 ty2 * ((c1*u(5,i,j+1,k,c) - 
+     >                 c2*square(i,j+1,k,c)) * vp1 -
+     >                 (c1*u(5,i,j-1,k,c) - 
+     >                 c2*square(i,j-1,k,c)) * vm1)
+               enddo
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     add fourth order eta-direction dissipation         
+c---------------------------------------------------------------------
+         if (start(2,c) .gt. 0) then
+            do k = start(3,c), cell_size(3,c)-end(3,c)-1
+               j = 1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
+     >                    ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) +
+     >                    u(m,i,j+2,k,c))
+                  enddo
+               enddo
+
+               j = 2
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (-4.0d0*u(m,i,j-1,k,c) + 6.0d0*u(m,i,j,k,c) -
+     >                    4.0d0*u(m,i,j+1,k,c) + u(m,i,j+2,k,c))
+                  enddo
+               enddo
+            enddo
+         endif
+
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = 3*start(2,c), cell_size(2,c)-3*end(2,c)-1
+               do i = start(1,c),cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (  u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) + 
+     >                    6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) + 
+     >                    u(m,i,j+2,k,c) )
+                  enddo
+               enddo
+            enddo
+         enddo
+         
+         if (end(2,c) .gt. 0) then
+            do k = start(3,c), cell_size(3,c)-end(3,c)-1
+               j = cell_size(2,c)-3
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i,j-2,k,c) - 4.0d0*u(m,i,j-1,k,c) + 
+     >                    6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j+1,k,c) )
+                  enddo
+               enddo
+
+               j = cell_size(2,c)-2
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i,j-2,k,c) - 4.d0*u(m,i,j-1,k,c) +
+     >                    5.d0*u(m,i,j,k,c) )
+                  enddo
+               enddo
+            enddo
+         endif
+
+c---------------------------------------------------------------------
+c     compute zeta-direction fluxes 
+c---------------------------------------------------------------------
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  wijk = ws(i,j,k,c)
+                  wp1  = ws(i,j,k+1,c)
+                  wm1  = ws(i,j,k-1,c)
+
+                  rhs(1,i,j,k,c) = rhs(1,i,j,k,c) + dz1tz1 * 
+     >                 (u(1,i,j,k+1,c) - 2.0d0*u(1,i,j,k,c) + 
+     >                 u(1,i,j,k-1,c)) -
+     >                 tz2 * (u(4,i,j,k+1,c) - u(4,i,j,k-1,c))
+                  rhs(2,i,j,k,c) = rhs(2,i,j,k,c) + dz2tz1 * 
+     >                 (u(2,i,j,k+1,c) - 2.0d0*u(2,i,j,k,c) + 
+     >                 u(2,i,j,k-1,c)) +
+     >                 zzcon2 * (us(i,j,k+1,c) - 2.0d0*us(i,j,k,c) + 
+     >                 us(i,j,k-1,c)) -
+     >                 tz2 * (u(2,i,j,k+1,c)*wp1 - 
+     >                 u(2,i,j,k-1,c)*wm1)
+                  rhs(3,i,j,k,c) = rhs(3,i,j,k,c) + dz3tz1 * 
+     >                 (u(3,i,j,k+1,c) - 2.0d0*u(3,i,j,k,c) + 
+     >                 u(3,i,j,k-1,c)) +
+     >                 zzcon2 * (vs(i,j,k+1,c) - 2.0d0*vs(i,j,k,c) + 
+     >                 vs(i,j,k-1,c)) -
+     >                 tz2 * (u(3,i,j,k+1,c)*wp1 - 
+     >                 u(3,i,j,k-1,c)*wm1)
+                  rhs(4,i,j,k,c) = rhs(4,i,j,k,c) + dz4tz1 * 
+     >                 (u(4,i,j,k+1,c) - 2.0d0*u(4,i,j,k,c) + 
+     >                 u(4,i,j,k-1,c)) +
+     >                 zzcon2*con43 * (wp1 - 2.0d0*wijk + wm1) -
+     >                 tz2 * (u(4,i,j,k+1,c)*wp1 - 
+     >                 u(4,i,j,k-1,c)*wm1 +
+     >                 (u(5,i,j,k+1,c) - square(i,j,k+1,c) - 
+     >                 u(5,i,j,k-1,c) + square(i,j,k-1,c))
+     >                 *c2)
+                  rhs(5,i,j,k,c) = rhs(5,i,j,k,c) + dz5tz1 * 
+     >                 (u(5,i,j,k+1,c) - 2.0d0*u(5,i,j,k,c) + 
+     >                 u(5,i,j,k-1,c)) +
+     >                 zzcon3 * (qs(i,j,k+1,c) - 2.0d0*qs(i,j,k,c) + 
+     >                 qs(i,j,k-1,c)) +
+     >                 zzcon4 * (wp1*wp1 - 2.0d0*wijk*wijk + 
+     >                 wm1*wm1) +
+     >                 zzcon5 * (u(5,i,j,k+1,c)*rho_i(i,j,k+1,c) - 
+     >                 2.0d0*u(5,i,j,k,c)*rho_i(i,j,k,c) +
+     >                 u(5,i,j,k-1,c)*rho_i(i,j,k-1,c)) -
+     >                 tz2 * ( (c1*u(5,i,j,k+1,c) - 
+     >                 c2*square(i,j,k+1,c))*wp1 -
+     >                 (c1*u(5,i,j,k-1,c) - 
+     >                 c2*square(i,j,k-1,c))*wm1)
+               enddo
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     add fourth order zeta-direction dissipation                
+c---------------------------------------------------------------------
+         if (start(3,c) .gt. 0) then
+            k = 1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c)- dssp * 
+     >                    ( 5.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) +
+     >                    u(m,i,j,k+2,c))
+                  enddo
+               enddo
+            enddo
+
+            k = 2
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (-4.0d0*u(m,i,j,k-1,c) + 6.0d0*u(m,i,j,k,c) -
+     >                    4.0d0*u(m,i,j,k+1,c) + u(m,i,j,k+2,c))
+                  enddo
+               enddo
+            enddo
+         endif
+
+         do k = 3*start(3,c), cell_size(3,c)-3*end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c),cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp * 
+     >                    (  u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) + 
+     >                    6.0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) + 
+     >                    u(m,i,j,k+2,c) )
+                  enddo
+               enddo
+            enddo
+         enddo
+         
+         if (end(3,c) .gt. 0) then
+            k = cell_size(3,c)-3
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i,j,k-2,c) - 4.0d0*u(m,i,j,k-1,c) + 
+     >                    6.0d0*u(m,i,j,k,c) - 4.0d0*u(m,i,j,k+1,c) )
+                  enddo
+               enddo
+            enddo
+
+            k = cell_size(3,c)-2
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) - dssp *
+     >                    ( u(m,i,j,k-2,c) - 4.d0*u(m,i,j,k-1,c) +
+     >                    5.d0*u(m,i,j,k,c) )
+                  enddo
+               enddo
+            enddo
+         endif
+
+         do k = start(3,c), cell_size(3,c)-end(3,c)-1
+            do j = start(2,c), cell_size(2,c)-end(2,c)-1
+               do i = start(1,c), cell_size(1,c)-end(1,c)-1
+                  do m = 1, 5
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) * dt
+                  enddo
+               enddo
+            enddo
+         enddo
+
+      enddo
+      
+      return
+      end
+
+
+
+
diff --git a/examples/smpi/NAS/BT/set_constants.f b/examples/smpi/NAS/BT/set_constants.f
new file mode 100644 (file)
index 0000000..81397d4
--- /dev/null
@@ -0,0 +1,202 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine  set_constants
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      
+      ce(1,1)  = 2.0d0
+      ce(1,2)  = 0.0d0
+      ce(1,3)  = 0.0d0
+      ce(1,4)  = 4.0d0
+      ce(1,5)  = 5.0d0
+      ce(1,6)  = 3.0d0
+      ce(1,7)  = 0.5d0
+      ce(1,8)  = 0.02d0
+      ce(1,9)  = 0.01d0
+      ce(1,10) = 0.03d0
+      ce(1,11) = 0.5d0
+      ce(1,12) = 0.4d0
+      ce(1,13) = 0.3d0
+      
+      ce(2,1)  = 1.0d0
+      ce(2,2)  = 0.0d0
+      ce(2,3)  = 0.0d0
+      ce(2,4)  = 0.0d0
+      ce(2,5)  = 1.0d0
+      ce(2,6)  = 2.0d0
+      ce(2,7)  = 3.0d0
+      ce(2,8)  = 0.01d0
+      ce(2,9)  = 0.03d0
+      ce(2,10) = 0.02d0
+      ce(2,11) = 0.4d0
+      ce(2,12) = 0.3d0
+      ce(2,13) = 0.5d0
+
+      ce(3,1)  = 2.0d0
+      ce(3,2)  = 2.0d0
+      ce(3,3)  = 0.0d0
+      ce(3,4)  = 0.0d0
+      ce(3,5)  = 0.0d0
+      ce(3,6)  = 2.0d0
+      ce(3,7)  = 3.0d0
+      ce(3,8)  = 0.04d0
+      ce(3,9)  = 0.03d0
+      ce(3,10) = 0.05d0
+      ce(3,11) = 0.3d0
+      ce(3,12) = 0.5d0
+      ce(3,13) = 0.4d0
+
+      ce(4,1)  = 2.0d0
+      ce(4,2)  = 2.0d0
+      ce(4,3)  = 0.0d0
+      ce(4,4)  = 0.0d0
+      ce(4,5)  = 0.0d0
+      ce(4,6)  = 2.0d0
+      ce(4,7)  = 3.0d0
+      ce(4,8)  = 0.03d0
+      ce(4,9)  = 0.05d0
+      ce(4,10) = 0.04d0
+      ce(4,11) = 0.2d0
+      ce(4,12) = 0.1d0
+      ce(4,13) = 0.3d0
+
+      ce(5,1)  = 5.0d0
+      ce(5,2)  = 4.0d0
+      ce(5,3)  = 3.0d0
+      ce(5,4)  = 2.0d0
+      ce(5,5)  = 0.1d0
+      ce(5,6)  = 0.4d0
+      ce(5,7)  = 0.3d0
+      ce(5,8)  = 0.05d0
+      ce(5,9)  = 0.04d0
+      ce(5,10) = 0.03d0
+      ce(5,11) = 0.1d0
+      ce(5,12) = 0.3d0
+      ce(5,13) = 0.2d0
+
+      c1 = 1.4d0
+      c2 = 0.4d0
+      c3 = 0.1d0
+      c4 = 1.0d0
+      c5 = 1.4d0
+
+      bt = dsqrt(0.5d0)
+
+      dnxm1 = 1.0d0 / dble(grid_points(1)-1)
+      dnym1 = 1.0d0 / dble(grid_points(2)-1)
+      dnzm1 = 1.0d0 / dble(grid_points(3)-1)
+
+      c1c2 = c1 * c2
+      c1c5 = c1 * c5
+      c3c4 = c3 * c4
+      c1345 = c1c5 * c3c4
+
+      conz1 = (1.0d0-c1c5)
+
+      tx1 = 1.0d0 / (dnxm1 * dnxm1)
+      tx2 = 1.0d0 / (2.0d0 * dnxm1)
+      tx3 = 1.0d0 / dnxm1
+
+      ty1 = 1.0d0 / (dnym1 * dnym1)
+      ty2 = 1.0d0 / (2.0d0 * dnym1)
+      ty3 = 1.0d0 / dnym1
+      
+      tz1 = 1.0d0 / (dnzm1 * dnzm1)
+      tz2 = 1.0d0 / (2.0d0 * dnzm1)
+      tz3 = 1.0d0 / dnzm1
+
+      dx1 = 0.75d0
+      dx2 = 0.75d0
+      dx3 = 0.75d0
+      dx4 = 0.75d0
+      dx5 = 0.75d0
+
+      dy1 = 0.75d0
+      dy2 = 0.75d0
+      dy3 = 0.75d0
+      dy4 = 0.75d0
+      dy5 = 0.75d0
+
+      dz1 = 1.0d0
+      dz2 = 1.0d0
+      dz3 = 1.0d0
+      dz4 = 1.0d0
+      dz5 = 1.0d0
+
+      dxmax = dmax1(dx3, dx4)
+      dymax = dmax1(dy2, dy4)
+      dzmax = dmax1(dz2, dz3)
+
+      dssp = 0.25d0 * dmax1(dx1, dmax1(dy1, dz1) )
+
+      c4dssp = 4.0d0 * dssp
+      c5dssp = 5.0d0 * dssp
+
+      dttx1 = dt*tx1
+      dttx2 = dt*tx2
+      dtty1 = dt*ty1
+      dtty2 = dt*ty2
+      dttz1 = dt*tz1
+      dttz2 = dt*tz2
+
+      c2dttx1 = 2.0d0*dttx1
+      c2dtty1 = 2.0d0*dtty1
+      c2dttz1 = 2.0d0*dttz1
+
+      dtdssp = dt*dssp
+
+      comz1  = dtdssp
+      comz4  = 4.0d0*dtdssp
+      comz5  = 5.0d0*dtdssp
+      comz6  = 6.0d0*dtdssp
+
+      c3c4tx3 = c3c4*tx3
+      c3c4ty3 = c3c4*ty3
+      c3c4tz3 = c3c4*tz3
+
+      dx1tx1 = dx1*tx1
+      dx2tx1 = dx2*tx1
+      dx3tx1 = dx3*tx1
+      dx4tx1 = dx4*tx1
+      dx5tx1 = dx5*tx1
+      
+      dy1ty1 = dy1*ty1
+      dy2ty1 = dy2*ty1
+      dy3ty1 = dy3*ty1
+      dy4ty1 = dy4*ty1
+      dy5ty1 = dy5*ty1
+      
+      dz1tz1 = dz1*tz1
+      dz2tz1 = dz2*tz1
+      dz3tz1 = dz3*tz1
+      dz4tz1 = dz4*tz1
+      dz5tz1 = dz5*tz1
+
+      c2iv  = 2.5d0
+      con43 = 4.0d0/3.0d0
+      con16 = 1.0d0/6.0d0
+      
+      xxcon1 = c3c4tx3*con43*tx3
+      xxcon2 = c3c4tx3*tx3
+      xxcon3 = c3c4tx3*conz1*tx3
+      xxcon4 = c3c4tx3*con16*tx3
+      xxcon5 = c3c4tx3*c1c5*tx3
+
+      yycon1 = c3c4ty3*con43*ty3
+      yycon2 = c3c4ty3*ty3
+      yycon3 = c3c4ty3*conz1*ty3
+      yycon4 = c3c4ty3*con16*ty3
+      yycon5 = c3c4ty3*c1c5*ty3
+
+      zzcon1 = c3c4tz3*con43*tz3
+      zzcon2 = c3c4tz3*tz3
+      zzcon3 = c3c4tz3*conz1*tz3
+      zzcon4 = c3c4tz3*con16*tz3
+      zzcon5 = c3c4tz3*c1c5*tz3
+
+      return
+      end
diff --git a/examples/smpi/NAS/BT/setup_mpi.f b/examples/smpi/NAS/BT/setup_mpi.f
new file mode 100644 (file)
index 0000000..987c6bf
--- /dev/null
@@ -0,0 +1,64 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_mpi
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c set up MPI stuff
+c---------------------------------------------------------------------
+
+      implicit none
+      include 'mpinpb.h'
+      include 'npbparams.h'
+      integer error, color, nc
+
+      call mpi_init(error)
+      
+      call mpi_comm_size(MPI_COMM_WORLD, total_nodes, error)
+      call mpi_comm_rank(MPI_COMM_WORLD, node, error)
+
+      if (.not. convertdouble) then
+         dp_type = MPI_DOUBLE_PRECISION
+      else
+         dp_type = MPI_REAL
+      endif
+
+c---------------------------------------------------------------------
+c     compute square root; add small number to allow for roundoff
+c---------------------------------------------------------------------
+      nc = dint(dsqrt(dble(total_nodes) + 0.00001d0))
+
+c---------------------------------------------------------------------
+c We handle a non-square number of nodes by making the excess nodes
+c inactive. However, we can never handle more cells than were compiled
+c in. 
+c---------------------------------------------------------------------
+
+      if (nc .gt. maxcells) nc = maxcells
+      if (node .ge. nc*nc) then
+         active = .false.
+         color = 1
+      else
+         active = .true.
+         color = 0
+      end if
+      
+      call mpi_comm_split(MPI_COMM_WORLD,color,node,comm_setup,error)
+      if (.not. active) return
+
+      call mpi_comm_size(comm_setup, no_nodes, error)
+      call mpi_comm_dup(comm_setup, comm_solve, error)
+      call mpi_comm_dup(comm_setup, comm_rhs, error)
+      
+c---------------------------------------------------------------------
+c     let node 0 be the root for the group (there is only one)
+c---------------------------------------------------------------------
+      root = 0
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/simple_mpiio.f b/examples/smpi/NAS/BT/simple_mpiio.f
new file mode 100644 (file)
index 0000000..02e2700
--- /dev/null
@@ -0,0 +1,213 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine setup_btio
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer m, ierr
+
+      iseek=0
+
+      if (node .eq. root) then
+          call MPI_File_delete(filenm, MPI_INFO_NULL, ierr)
+      endif
+
+      call MPI_Barrier(comm_solve, ierr)
+
+      call MPI_File_open(comm_solve,
+     $          filenm,
+     $          MPI_MODE_RDWR + MPI_MODE_CREATE,
+     $          MPI_INFO_NULL,
+     $          fp,
+     $          ierr)
+
+      call MPI_File_set_view(fp,
+     $          iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION,
+     $          'native', MPI_INFO_NULL, ierr)
+
+      if (ierr .ne. MPI_SUCCESS) then
+          print *, 'Error opening file'
+          stop
+      endif
+
+      do m = 1, 5
+         xce_sub(m) = 0.d0
+      end do
+
+      idump_sub = 0
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine output_timestep
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer count, jio, kio, cio, aio
+      integer ierr
+      integer mstatus(MPI_STATUS_SIZE)
+
+      do cio=1,ncells
+          do kio=0, cell_size(3,cio)-1
+              do jio=0, cell_size(2,cio)-1
+                  iseek=5*(cell_low(1,cio) +
+     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+     $                   PROBLEM_SIZE*idump_sub)))
+
+                  count=5*cell_size(1,cio)
+
+                  call MPI_File_write_at(fp, iseek,
+     $                  u(1,0,jio,kio,cio),
+     $                  count, MPI_DOUBLE_PRECISION,
+     $                  mstatus, ierr)
+
+                  if (ierr .ne. MPI_SUCCESS) then
+                      print *, 'Error writing to file'
+                      stop
+                  endif
+              enddo
+          enddo
+      enddo
+
+      idump_sub = idump_sub + 1
+      if (rd_interval .gt. 0) then
+         if (idump_sub .ge. rd_interval) then
+
+            call acc_sub_norms(idump+1)
+
+            idump_sub = 0
+         endif
+      endif
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine acc_sub_norms(idump_cur)
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer idump_cur
+
+      integer count, jio, kio, cio, ii, m, ichunk
+      integer ierr
+      integer mstatus(MPI_STATUS_SIZE)
+      double precision xce_single(5)
+
+      ichunk = idump_cur - idump_sub + 1
+      do ii=0, idump_sub-1
+        do cio=1,ncells
+          do kio=0, cell_size(3,cio)-1
+              do jio=0, cell_size(2,cio)-1
+                  iseek=5*(cell_low(1,cio) +
+     $                   PROBLEM_SIZE*((cell_low(2,cio)+jio) +
+     $                   PROBLEM_SIZE*((cell_low(3,cio)+kio) +
+     $                   PROBLEM_SIZE*ii)))
+
+                  count=5*cell_size(1,cio)
+
+                  call MPI_File_read_at(fp, iseek,
+     $                  u(1,0,jio,kio,cio),
+     $                  count, MPI_DOUBLE_PRECISION,
+     $                  mstatus, ierr)
+
+                  if (ierr .ne. MPI_SUCCESS) then
+                      print *, 'Error reading back file'
+                      call MPI_File_close(fp, ierr)
+                      stop
+                  endif
+              enddo
+          enddo
+        enddo
+
+        if (node .eq. root) print *, 'Reading data set ', ii+ichunk
+
+        call error_norm(xce_single)
+        do m = 1, 5
+           xce_sub(m) = xce_sub(m) + xce_single(m)
+        end do
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine btio_cleanup
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ierr
+
+      call MPI_File_close(fp, ierr)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine accumulate_norms(xce_acc)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      double precision xce_acc(5)
+      integer m, ierr
+
+      if (rd_interval .gt. 0) goto 20
+
+      call MPI_File_open(comm_solve,
+     $          filenm,
+     $          MPI_MODE_RDONLY,
+     $          MPI_INFO_NULL,
+     $          fp,
+     $          ierr)
+
+      iseek = 0
+      call MPI_File_set_view(fp,
+     $          iseek, MPI_DOUBLE_PRECISION, MPI_DOUBLE_PRECISION,
+     $          'native', MPI_INFO_NULL, ierr)
+
+c     clear the last time step
+
+      call clear_timestep
+
+c     read back the time steps and accumulate norms
+
+      call acc_sub_norms(idump)
+
+      call MPI_File_close(fp, ierr)
+
+ 20   continue
+      do m = 1, 5
+         xce_acc(m) = xce_sub(m) / dble(idump)
+      end do
+
+      return
+      end
+
diff --git a/examples/smpi/NAS/BT/solve_subs.f b/examples/smpi/NAS/BT/solve_subs.f
new file mode 100644 (file)
index 0000000..351489a
--- /dev/null
@@ -0,0 +1,642 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine matvec_sub(ablock,avec,bvec)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     subtracts bvec=bvec - ablock*avec
+c---------------------------------------------------------------------
+
+      implicit none
+
+      double precision ablock,avec,bvec
+      dimension ablock(5,5),avec(5),bvec(5)
+
+c---------------------------------------------------------------------
+c            rhs(i,ic,jc,kc,ccell) = rhs(i,ic,jc,kc,ccell) 
+c     $           - lhs(i,1,ablock,ia,ja,ka,acell)*
+c---------------------------------------------------------------------
+         bvec(1) = bvec(1) - ablock(1,1)*avec(1)
+     >                     - ablock(1,2)*avec(2)
+     >                     - ablock(1,3)*avec(3)
+     >                     - ablock(1,4)*avec(4)
+     >                     - ablock(1,5)*avec(5)
+         bvec(2) = bvec(2) - ablock(2,1)*avec(1)
+     >                     - ablock(2,2)*avec(2)
+     >                     - ablock(2,3)*avec(3)
+     >                     - ablock(2,4)*avec(4)
+     >                     - ablock(2,5)*avec(5)
+         bvec(3) = bvec(3) - ablock(3,1)*avec(1)
+     >                     - ablock(3,2)*avec(2)
+     >                     - ablock(3,3)*avec(3)
+     >                     - ablock(3,4)*avec(4)
+     >                     - ablock(3,5)*avec(5)
+         bvec(4) = bvec(4) - ablock(4,1)*avec(1)
+     >                     - ablock(4,2)*avec(2)
+     >                     - ablock(4,3)*avec(3)
+     >                     - ablock(4,4)*avec(4)
+     >                     - ablock(4,5)*avec(5)
+         bvec(5) = bvec(5) - ablock(5,1)*avec(1)
+     >                     - ablock(5,2)*avec(2)
+     >                     - ablock(5,3)*avec(3)
+     >                     - ablock(5,4)*avec(4)
+     >                     - ablock(5,5)*avec(5)
+
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine matmul_sub(ablock, bblock, cblock)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     subtracts a(i,j,k) X b(i,j,k) from c(i,j,k)
+c---------------------------------------------------------------------
+
+      implicit none
+
+      double precision ablock, bblock, cblock
+      dimension ablock(5,5), bblock(5,5), cblock(5,5)
+
+
+         cblock(1,1) = cblock(1,1) - ablock(1,1)*bblock(1,1)
+     >                             - ablock(1,2)*bblock(2,1)
+     >                             - ablock(1,3)*bblock(3,1)
+     >                             - ablock(1,4)*bblock(4,1)
+     >                             - ablock(1,5)*bblock(5,1)
+         cblock(2,1) = cblock(2,1) - ablock(2,1)*bblock(1,1)
+     >                             - ablock(2,2)*bblock(2,1)
+     >                             - ablock(2,3)*bblock(3,1)
+     >                             - ablock(2,4)*bblock(4,1)
+     >                             - ablock(2,5)*bblock(5,1)
+         cblock(3,1) = cblock(3,1) - ablock(3,1)*bblock(1,1)
+     >                             - ablock(3,2)*bblock(2,1)
+     >                             - ablock(3,3)*bblock(3,1)
+     >                             - ablock(3,4)*bblock(4,1)
+     >                             - ablock(3,5)*bblock(5,1)
+         cblock(4,1) = cblock(4,1) - ablock(4,1)*bblock(1,1)
+     >                             - ablock(4,2)*bblock(2,1)
+     >                             - ablock(4,3)*bblock(3,1)
+     >                             - ablock(4,4)*bblock(4,1)
+     >                             - ablock(4,5)*bblock(5,1)
+         cblock(5,1) = cblock(5,1) - ablock(5,1)*bblock(1,1)
+     >                             - ablock(5,2)*bblock(2,1)
+     >                             - ablock(5,3)*bblock(3,1)
+     >                             - ablock(5,4)*bblock(4,1)
+     >                             - ablock(5,5)*bblock(5,1)
+         cblock(1,2) = cblock(1,2) - ablock(1,1)*bblock(1,2)
+     >                             - ablock(1,2)*bblock(2,2)
+     >                             - ablock(1,3)*bblock(3,2)
+     >                             - ablock(1,4)*bblock(4,2)
+     >                             - ablock(1,5)*bblock(5,2)
+         cblock(2,2) = cblock(2,2) - ablock(2,1)*bblock(1,2)
+     >                             - ablock(2,2)*bblock(2,2)
+     >                             - ablock(2,3)*bblock(3,2)
+     >                             - ablock(2,4)*bblock(4,2)
+     >                             - ablock(2,5)*bblock(5,2)
+         cblock(3,2) = cblock(3,2) - ablock(3,1)*bblock(1,2)
+     >                             - ablock(3,2)*bblock(2,2)
+     >                             - ablock(3,3)*bblock(3,2)
+     >                             - ablock(3,4)*bblock(4,2)
+     >                             - ablock(3,5)*bblock(5,2)
+         cblock(4,2) = cblock(4,2) - ablock(4,1)*bblock(1,2)
+     >                             - ablock(4,2)*bblock(2,2)
+     >                             - ablock(4,3)*bblock(3,2)
+     >                             - ablock(4,4)*bblock(4,2)
+     >                             - ablock(4,5)*bblock(5,2)
+         cblock(5,2) = cblock(5,2) - ablock(5,1)*bblock(1,2)
+     >                             - ablock(5,2)*bblock(2,2)
+     >                             - ablock(5,3)*bblock(3,2)
+     >                             - ablock(5,4)*bblock(4,2)
+     >                             - ablock(5,5)*bblock(5,2)
+         cblock(1,3) = cblock(1,3) - ablock(1,1)*bblock(1,3)
+     >                             - ablock(1,2)*bblock(2,3)
+     >                             - ablock(1,3)*bblock(3,3)
+     >                             - ablock(1,4)*bblock(4,3)
+     >                             - ablock(1,5)*bblock(5,3)
+         cblock(2,3) = cblock(2,3) - ablock(2,1)*bblock(1,3)
+     >                             - ablock(2,2)*bblock(2,3)
+     >                             - ablock(2,3)*bblock(3,3)
+     >                             - ablock(2,4)*bblock(4,3)
+     >                             - ablock(2,5)*bblock(5,3)
+         cblock(3,3) = cblock(3,3) - ablock(3,1)*bblock(1,3)
+     >                             - ablock(3,2)*bblock(2,3)
+     >                             - ablock(3,3)*bblock(3,3)
+     >                             - ablock(3,4)*bblock(4,3)
+     >                             - ablock(3,5)*bblock(5,3)
+         cblock(4,3) = cblock(4,3) - ablock(4,1)*bblock(1,3)
+     >                             - ablock(4,2)*bblock(2,3)
+     >                             - ablock(4,3)*bblock(3,3)
+     >                             - ablock(4,4)*bblock(4,3)
+     >                             - ablock(4,5)*bblock(5,3)
+         cblock(5,3) = cblock(5,3) - ablock(5,1)*bblock(1,3)
+     >                             - ablock(5,2)*bblock(2,3)
+     >                             - ablock(5,3)*bblock(3,3)
+     >                             - ablock(5,4)*bblock(4,3)
+     >                             - ablock(5,5)*bblock(5,3)
+         cblock(1,4) = cblock(1,4) - ablock(1,1)*bblock(1,4)
+     >                             - ablock(1,2)*bblock(2,4)
+     >                             - ablock(1,3)*bblock(3,4)
+     >                             - ablock(1,4)*bblock(4,4)
+     >                             - ablock(1,5)*bblock(5,4)
+         cblock(2,4) = cblock(2,4) - ablock(2,1)*bblock(1,4)
+     >                             - ablock(2,2)*bblock(2,4)
+     >                             - ablock(2,3)*bblock(3,4)
+     >                             - ablock(2,4)*bblock(4,4)
+     >                             - ablock(2,5)*bblock(5,4)
+         cblock(3,4) = cblock(3,4) - ablock(3,1)*bblock(1,4)
+     >                             - ablock(3,2)*bblock(2,4)
+     >                             - ablock(3,3)*bblock(3,4)
+     >                             - ablock(3,4)*bblock(4,4)
+     >                             - ablock(3,5)*bblock(5,4)
+         cblock(4,4) = cblock(4,4) - ablock(4,1)*bblock(1,4)
+     >                             - ablock(4,2)*bblock(2,4)
+     >                             - ablock(4,3)*bblock(3,4)
+     >                             - ablock(4,4)*bblock(4,4)
+     >                             - ablock(4,5)*bblock(5,4)
+         cblock(5,4) = cblock(5,4) - ablock(5,1)*bblock(1,4)
+     >                             - ablock(5,2)*bblock(2,4)
+     >                             - ablock(5,3)*bblock(3,4)
+     >                             - ablock(5,4)*bblock(4,4)
+     >                             - ablock(5,5)*bblock(5,4)
+         cblock(1,5) = cblock(1,5) - ablock(1,1)*bblock(1,5)
+     >                             - ablock(1,2)*bblock(2,5)
+     >                             - ablock(1,3)*bblock(3,5)
+     >                             - ablock(1,4)*bblock(4,5)
+     >                             - ablock(1,5)*bblock(5,5)
+         cblock(2,5) = cblock(2,5) - ablock(2,1)*bblock(1,5)
+     >                             - ablock(2,2)*bblock(2,5)
+     >                             - ablock(2,3)*bblock(3,5)
+     >                             - ablock(2,4)*bblock(4,5)
+     >                             - ablock(2,5)*bblock(5,5)
+         cblock(3,5) = cblock(3,5) - ablock(3,1)*bblock(1,5)
+     >                             - ablock(3,2)*bblock(2,5)
+     >                             - ablock(3,3)*bblock(3,5)
+     >                             - ablock(3,4)*bblock(4,5)
+     >                             - ablock(3,5)*bblock(5,5)
+         cblock(4,5) = cblock(4,5) - ablock(4,1)*bblock(1,5)
+     >                             - ablock(4,2)*bblock(2,5)
+     >                             - ablock(4,3)*bblock(3,5)
+     >                             - ablock(4,4)*bblock(4,5)
+     >                             - ablock(4,5)*bblock(5,5)
+         cblock(5,5) = cblock(5,5) - ablock(5,1)*bblock(1,5)
+     >                             - ablock(5,2)*bblock(2,5)
+     >                             - ablock(5,3)*bblock(3,5)
+     >                             - ablock(5,4)*bblock(4,5)
+     >                             - ablock(5,5)*bblock(5,5)
+
+              
+      return
+      end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine binvcrhs( lhs,c,r )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+
+      implicit none
+
+      double precision pivot, coeff, lhs
+      dimension lhs(5,5)
+      double precision c(5,5), r(5)
+
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+
+      pivot = 1.00d0/lhs(1,1)
+      lhs(1,2) = lhs(1,2)*pivot
+      lhs(1,3) = lhs(1,3)*pivot
+      lhs(1,4) = lhs(1,4)*pivot
+      lhs(1,5) = lhs(1,5)*pivot
+      c(1,1) = c(1,1)*pivot
+      c(1,2) = c(1,2)*pivot
+      c(1,3) = c(1,3)*pivot
+      c(1,4) = c(1,4)*pivot
+      c(1,5) = c(1,5)*pivot
+      r(1)   = r(1)  *pivot
+
+      coeff = lhs(2,1)
+      lhs(2,2)= lhs(2,2) - coeff*lhs(1,2)
+      lhs(2,3)= lhs(2,3) - coeff*lhs(1,3)
+      lhs(2,4)= lhs(2,4) - coeff*lhs(1,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(1,5)
+      c(2,1) = c(2,1) - coeff*c(1,1)
+      c(2,2) = c(2,2) - coeff*c(1,2)
+      c(2,3) = c(2,3) - coeff*c(1,3)
+      c(2,4) = c(2,4) - coeff*c(1,4)
+      c(2,5) = c(2,5) - coeff*c(1,5)
+      r(2)   = r(2)   - coeff*r(1)
+
+      coeff = lhs(3,1)
+      lhs(3,2)= lhs(3,2) - coeff*lhs(1,2)
+      lhs(3,3)= lhs(3,3) - coeff*lhs(1,3)
+      lhs(3,4)= lhs(3,4) - coeff*lhs(1,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(1,5)
+      c(3,1) = c(3,1) - coeff*c(1,1)
+      c(3,2) = c(3,2) - coeff*c(1,2)
+      c(3,3) = c(3,3) - coeff*c(1,3)
+      c(3,4) = c(3,4) - coeff*c(1,4)
+      c(3,5) = c(3,5) - coeff*c(1,5)
+      r(3)   = r(3)   - coeff*r(1)
+
+      coeff = lhs(4,1)
+      lhs(4,2)= lhs(4,2) - coeff*lhs(1,2)
+      lhs(4,3)= lhs(4,3) - coeff*lhs(1,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(1,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(1,5)
+      c(4,1) = c(4,1) - coeff*c(1,1)
+      c(4,2) = c(4,2) - coeff*c(1,2)
+      c(4,3) = c(4,3) - coeff*c(1,3)
+      c(4,4) = c(4,4) - coeff*c(1,4)
+      c(4,5) = c(4,5) - coeff*c(1,5)
+      r(4)   = r(4)   - coeff*r(1)
+
+      coeff = lhs(5,1)
+      lhs(5,2)= lhs(5,2) - coeff*lhs(1,2)
+      lhs(5,3)= lhs(5,3) - coeff*lhs(1,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(1,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(1,5)
+      c(5,1) = c(5,1) - coeff*c(1,1)
+      c(5,2) = c(5,2) - coeff*c(1,2)
+      c(5,3) = c(5,3) - coeff*c(1,3)
+      c(5,4) = c(5,4) - coeff*c(1,4)
+      c(5,5) = c(5,5) - coeff*c(1,5)
+      r(5)   = r(5)   - coeff*r(1)
+
+
+      pivot = 1.00d0/lhs(2,2)
+      lhs(2,3) = lhs(2,3)*pivot
+      lhs(2,4) = lhs(2,4)*pivot
+      lhs(2,5) = lhs(2,5)*pivot
+      c(2,1) = c(2,1)*pivot
+      c(2,2) = c(2,2)*pivot
+      c(2,3) = c(2,3)*pivot
+      c(2,4) = c(2,4)*pivot
+      c(2,5) = c(2,5)*pivot
+      r(2)   = r(2)  *pivot
+
+      coeff = lhs(1,2)
+      lhs(1,3)= lhs(1,3) - coeff*lhs(2,3)
+      lhs(1,4)= lhs(1,4) - coeff*lhs(2,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(2,5)
+      c(1,1) = c(1,1) - coeff*c(2,1)
+      c(1,2) = c(1,2) - coeff*c(2,2)
+      c(1,3) = c(1,3) - coeff*c(2,3)
+      c(1,4) = c(1,4) - coeff*c(2,4)
+      c(1,5) = c(1,5) - coeff*c(2,5)
+      r(1)   = r(1)   - coeff*r(2)
+
+      coeff = lhs(3,2)
+      lhs(3,3)= lhs(3,3) - coeff*lhs(2,3)
+      lhs(3,4)= lhs(3,4) - coeff*lhs(2,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(2,5)
+      c(3,1) = c(3,1) - coeff*c(2,1)
+      c(3,2) = c(3,2) - coeff*c(2,2)
+      c(3,3) = c(3,3) - coeff*c(2,3)
+      c(3,4) = c(3,4) - coeff*c(2,4)
+      c(3,5) = c(3,5) - coeff*c(2,5)
+      r(3)   = r(3)   - coeff*r(2)
+
+      coeff = lhs(4,2)
+      lhs(4,3)= lhs(4,3) - coeff*lhs(2,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(2,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(2,5)
+      c(4,1) = c(4,1) - coeff*c(2,1)
+      c(4,2) = c(4,2) - coeff*c(2,2)
+      c(4,3) = c(4,3) - coeff*c(2,3)
+      c(4,4) = c(4,4) - coeff*c(2,4)
+      c(4,5) = c(4,5) - coeff*c(2,5)
+      r(4)   = r(4)   - coeff*r(2)
+
+      coeff = lhs(5,2)
+      lhs(5,3)= lhs(5,3) - coeff*lhs(2,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(2,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(2,5)
+      c(5,1) = c(5,1) - coeff*c(2,1)
+      c(5,2) = c(5,2) - coeff*c(2,2)
+      c(5,3) = c(5,3) - coeff*c(2,3)
+      c(5,4) = c(5,4) - coeff*c(2,4)
+      c(5,5) = c(5,5) - coeff*c(2,5)
+      r(5)   = r(5)   - coeff*r(2)
+
+
+      pivot = 1.00d0/lhs(3,3)
+      lhs(3,4) = lhs(3,4)*pivot
+      lhs(3,5) = lhs(3,5)*pivot
+      c(3,1) = c(3,1)*pivot
+      c(3,2) = c(3,2)*pivot
+      c(3,3) = c(3,3)*pivot
+      c(3,4) = c(3,4)*pivot
+      c(3,5) = c(3,5)*pivot
+      r(3)   = r(3)  *pivot
+
+      coeff = lhs(1,3)
+      lhs(1,4)= lhs(1,4) - coeff*lhs(3,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(3,5)
+      c(1,1) = c(1,1) - coeff*c(3,1)
+      c(1,2) = c(1,2) - coeff*c(3,2)
+      c(1,3) = c(1,3) - coeff*c(3,3)
+      c(1,4) = c(1,4) - coeff*c(3,4)
+      c(1,5) = c(1,5) - coeff*c(3,5)
+      r(1)   = r(1)   - coeff*r(3)
+
+      coeff = lhs(2,3)
+      lhs(2,4)= lhs(2,4) - coeff*lhs(3,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(3,5)
+      c(2,1) = c(2,1) - coeff*c(3,1)
+      c(2,2) = c(2,2) - coeff*c(3,2)
+      c(2,3) = c(2,3) - coeff*c(3,3)
+      c(2,4) = c(2,4) - coeff*c(3,4)
+      c(2,5) = c(2,5) - coeff*c(3,5)
+      r(2)   = r(2)   - coeff*r(3)
+
+      coeff = lhs(4,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(3,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(3,5)
+      c(4,1) = c(4,1) - coeff*c(3,1)
+      c(4,2) = c(4,2) - coeff*c(3,2)
+      c(4,3) = c(4,3) - coeff*c(3,3)
+      c(4,4) = c(4,4) - coeff*c(3,4)
+      c(4,5) = c(4,5) - coeff*c(3,5)
+      r(4)   = r(4)   - coeff*r(3)
+
+      coeff = lhs(5,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(3,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(3,5)
+      c(5,1) = c(5,1) - coeff*c(3,1)
+      c(5,2) = c(5,2) - coeff*c(3,2)
+      c(5,3) = c(5,3) - coeff*c(3,3)
+      c(5,4) = c(5,4) - coeff*c(3,4)
+      c(5,5) = c(5,5) - coeff*c(3,5)
+      r(5)   = r(5)   - coeff*r(3)
+
+
+      pivot = 1.00d0/lhs(4,4)
+      lhs(4,5) = lhs(4,5)*pivot
+      c(4,1) = c(4,1)*pivot
+      c(4,2) = c(4,2)*pivot
+      c(4,3) = c(4,3)*pivot
+      c(4,4) = c(4,4)*pivot
+      c(4,5) = c(4,5)*pivot
+      r(4)   = r(4)  *pivot
+
+      coeff = lhs(1,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(4,5)
+      c(1,1) = c(1,1) - coeff*c(4,1)
+      c(1,2) = c(1,2) - coeff*c(4,2)
+      c(1,3) = c(1,3) - coeff*c(4,3)
+      c(1,4) = c(1,4) - coeff*c(4,4)
+      c(1,5) = c(1,5) - coeff*c(4,5)
+      r(1)   = r(1)   - coeff*r(4)
+
+      coeff = lhs(2,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(4,5)
+      c(2,1) = c(2,1) - coeff*c(4,1)
+      c(2,2) = c(2,2) - coeff*c(4,2)
+      c(2,3) = c(2,3) - coeff*c(4,3)
+      c(2,4) = c(2,4) - coeff*c(4,4)
+      c(2,5) = c(2,5) - coeff*c(4,5)
+      r(2)   = r(2)   - coeff*r(4)
+
+      coeff = lhs(3,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(4,5)
+      c(3,1) = c(3,1) - coeff*c(4,1)
+      c(3,2) = c(3,2) - coeff*c(4,2)
+      c(3,3) = c(3,3) - coeff*c(4,3)
+      c(3,4) = c(3,4) - coeff*c(4,4)
+      c(3,5) = c(3,5) - coeff*c(4,5)
+      r(3)   = r(3)   - coeff*r(4)
+
+      coeff = lhs(5,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(4,5)
+      c(5,1) = c(5,1) - coeff*c(4,1)
+      c(5,2) = c(5,2) - coeff*c(4,2)
+      c(5,3) = c(5,3) - coeff*c(4,3)
+      c(5,4) = c(5,4) - coeff*c(4,4)
+      c(5,5) = c(5,5) - coeff*c(4,5)
+      r(5)   = r(5)   - coeff*r(4)
+
+
+      pivot = 1.00d0/lhs(5,5)
+      c(5,1) = c(5,1)*pivot
+      c(5,2) = c(5,2)*pivot
+      c(5,3) = c(5,3)*pivot
+      c(5,4) = c(5,4)*pivot
+      c(5,5) = c(5,5)*pivot
+      r(5)   = r(5)  *pivot
+
+      coeff = lhs(1,5)
+      c(1,1) = c(1,1) - coeff*c(5,1)
+      c(1,2) = c(1,2) - coeff*c(5,2)
+      c(1,3) = c(1,3) - coeff*c(5,3)
+      c(1,4) = c(1,4) - coeff*c(5,4)
+      c(1,5) = c(1,5) - coeff*c(5,5)
+      r(1)   = r(1)   - coeff*r(5)
+
+      coeff = lhs(2,5)
+      c(2,1) = c(2,1) - coeff*c(5,1)
+      c(2,2) = c(2,2) - coeff*c(5,2)
+      c(2,3) = c(2,3) - coeff*c(5,3)
+      c(2,4) = c(2,4) - coeff*c(5,4)
+      c(2,5) = c(2,5) - coeff*c(5,5)
+      r(2)   = r(2)   - coeff*r(5)
+
+      coeff = lhs(3,5)
+      c(3,1) = c(3,1) - coeff*c(5,1)
+      c(3,2) = c(3,2) - coeff*c(5,2)
+      c(3,3) = c(3,3) - coeff*c(5,3)
+      c(3,4) = c(3,4) - coeff*c(5,4)
+      c(3,5) = c(3,5) - coeff*c(5,5)
+      r(3)   = r(3)   - coeff*r(5)
+
+      coeff = lhs(4,5)
+      c(4,1) = c(4,1) - coeff*c(5,1)
+      c(4,2) = c(4,2) - coeff*c(5,2)
+      c(4,3) = c(4,3) - coeff*c(5,3)
+      c(4,4) = c(4,4) - coeff*c(5,4)
+      c(4,5) = c(4,5) - coeff*c(5,5)
+      r(4)   = r(4)   - coeff*r(5)
+
+
+      return
+      end
+
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine binvrhs( lhs,r )
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+
+      implicit none
+
+      double precision pivot, coeff, lhs
+      dimension lhs(5,5)
+      double precision r(5)
+
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+
+
+      pivot = 1.00d0/lhs(1,1)
+      lhs(1,2) = lhs(1,2)*pivot
+      lhs(1,3) = lhs(1,3)*pivot
+      lhs(1,4) = lhs(1,4)*pivot
+      lhs(1,5) = lhs(1,5)*pivot
+      r(1)   = r(1)  *pivot
+
+      coeff = lhs(2,1)
+      lhs(2,2)= lhs(2,2) - coeff*lhs(1,2)
+      lhs(2,3)= lhs(2,3) - coeff*lhs(1,3)
+      lhs(2,4)= lhs(2,4) - coeff*lhs(1,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(1,5)
+      r(2)   = r(2)   - coeff*r(1)
+
+      coeff = lhs(3,1)
+      lhs(3,2)= lhs(3,2) - coeff*lhs(1,2)
+      lhs(3,3)= lhs(3,3) - coeff*lhs(1,3)
+      lhs(3,4)= lhs(3,4) - coeff*lhs(1,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(1,5)
+      r(3)   = r(3)   - coeff*r(1)
+
+      coeff = lhs(4,1)
+      lhs(4,2)= lhs(4,2) - coeff*lhs(1,2)
+      lhs(4,3)= lhs(4,3) - coeff*lhs(1,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(1,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(1,5)
+      r(4)   = r(4)   - coeff*r(1)
+
+      coeff = lhs(5,1)
+      lhs(5,2)= lhs(5,2) - coeff*lhs(1,2)
+      lhs(5,3)= lhs(5,3) - coeff*lhs(1,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(1,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(1,5)
+      r(5)   = r(5)   - coeff*r(1)
+
+
+      pivot = 1.00d0/lhs(2,2)
+      lhs(2,3) = lhs(2,3)*pivot
+      lhs(2,4) = lhs(2,4)*pivot
+      lhs(2,5) = lhs(2,5)*pivot
+      r(2)   = r(2)  *pivot
+
+      coeff = lhs(1,2)
+      lhs(1,3)= lhs(1,3) - coeff*lhs(2,3)
+      lhs(1,4)= lhs(1,4) - coeff*lhs(2,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(2,5)
+      r(1)   = r(1)   - coeff*r(2)
+
+      coeff = lhs(3,2)
+      lhs(3,3)= lhs(3,3) - coeff*lhs(2,3)
+      lhs(3,4)= lhs(3,4) - coeff*lhs(2,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(2,5)
+      r(3)   = r(3)   - coeff*r(2)
+
+      coeff = lhs(4,2)
+      lhs(4,3)= lhs(4,3) - coeff*lhs(2,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(2,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(2,5)
+      r(4)   = r(4)   - coeff*r(2)
+
+      coeff = lhs(5,2)
+      lhs(5,3)= lhs(5,3) - coeff*lhs(2,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(2,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(2,5)
+      r(5)   = r(5)   - coeff*r(2)
+
+
+      pivot = 1.00d0/lhs(3,3)
+      lhs(3,4) = lhs(3,4)*pivot
+      lhs(3,5) = lhs(3,5)*pivot
+      r(3)   = r(3)  *pivot
+
+      coeff = lhs(1,3)
+      lhs(1,4)= lhs(1,4) - coeff*lhs(3,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(3,5)
+      r(1)   = r(1)   - coeff*r(3)
+
+      coeff = lhs(2,3)
+      lhs(2,4)= lhs(2,4) - coeff*lhs(3,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(3,5)
+      r(2)   = r(2)   - coeff*r(3)
+
+      coeff = lhs(4,3)
+      lhs(4,4)= lhs(4,4) - coeff*lhs(3,4)
+      lhs(4,5)= lhs(4,5) - coeff*lhs(3,5)
+      r(4)   = r(4)   - coeff*r(3)
+
+      coeff = lhs(5,3)
+      lhs(5,4)= lhs(5,4) - coeff*lhs(3,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(3,5)
+      r(5)   = r(5)   - coeff*r(3)
+
+
+      pivot = 1.00d0/lhs(4,4)
+      lhs(4,5) = lhs(4,5)*pivot
+      r(4)   = r(4)  *pivot
+
+      coeff = lhs(1,4)
+      lhs(1,5)= lhs(1,5) - coeff*lhs(4,5)
+      r(1)   = r(1)   - coeff*r(4)
+
+      coeff = lhs(2,4)
+      lhs(2,5)= lhs(2,5) - coeff*lhs(4,5)
+      r(2)   = r(2)   - coeff*r(4)
+
+      coeff = lhs(3,4)
+      lhs(3,5)= lhs(3,5) - coeff*lhs(4,5)
+      r(3)   = r(3)   - coeff*r(4)
+
+      coeff = lhs(5,4)
+      lhs(5,5)= lhs(5,5) - coeff*lhs(4,5)
+      r(5)   = r(5)   - coeff*r(4)
+
+
+      pivot = 1.00d0/lhs(5,5)
+      r(5)   = r(5)  *pivot
+
+      coeff = lhs(1,5)
+      r(1)   = r(1)   - coeff*r(5)
+
+      coeff = lhs(2,5)
+      r(2)   = r(2)   - coeff*r(5)
+
+      coeff = lhs(3,5)
+      r(3)   = r(3)   - coeff*r(5)
+
+      coeff = lhs(4,5)
+      r(4)   = r(4)   - coeff*r(5)
+
+
+      return
+      end
+
+
+
diff --git a/examples/smpi/NAS/BT/verify.f b/examples/smpi/NAS/BT/verify.f
new file mode 100644 (file)
index 0000000..7dbc8a9
--- /dev/null
@@ -0,0 +1,435 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+        subroutine verify(no_time_steps, class, verified)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c  verification routine                         
+c---------------------------------------------------------------------
+
+        include 'header.h'
+        include 'mpinpb.h'
+
+        double precision xcrref(5),xceref(5),xcrdif(5),xcedif(5), 
+     >                   epsilon, xce(5), xcr(5), dtref
+        integer m, no_time_steps
+        character class
+        logical verified
+
+c---------------------------------------------------------------------
+c   tolerance level
+c---------------------------------------------------------------------
+        epsilon = 1.0d-08
+        verified = .true.
+
+c---------------------------------------------------------------------
+c   compute the error norm and the residual norm, and exit if not printing
+c---------------------------------------------------------------------
+
+        if (iotype .ne. 0) then
+           call accumulate_norms(xce)
+        else
+           call error_norm(xce)
+        endif
+
+        call copy_faces
+
+        call rhs_norm(xcr)
+
+        do m = 1, 5
+           xcr(m) = xcr(m) / dt
+        enddo
+
+        if (node .ne. 0) return
+
+        class = 'U'
+
+        do m = 1,5
+           xcrref(m) = 1.0
+           xceref(m) = 1.0
+        end do
+
+c---------------------------------------------------------------------
+c    reference data for 12X12X12 grids after 60 time steps, with DT = 1.0d-02
+c---------------------------------------------------------------------
+        if ( (grid_points(1)  .eq. 12     ) .and. 
+     >       (grid_points(2)  .eq. 12     ) .and.
+     >       (grid_points(3)  .eq. 12     ) .and.
+     >       (no_time_steps   .eq. 60    ))  then
+
+           class = 'S'
+           dtref = 1.0d-2
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 1.7034283709541311d-01
+         xcrref(2) = 1.2975252070034097d-02
+         xcrref(3) = 3.2527926989486055d-02
+         xcrref(4) = 2.6436421275166801d-02
+         xcrref(5) = 1.9211784131744430d-01
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 4.9976913345811579d-04
+           xceref(2) = 4.5195666782961927d-05
+           xceref(3) = 7.3973765172921357d-05
+           xceref(4) = 7.3821238632439731d-05
+           xceref(5) = 8.9269630987491446d-04
+         else
+           xceref(1) = 0.1149036328945d+02
+           xceref(2) = 0.9156788904727d+00
+           xceref(3) = 0.2857899428614d+01
+           xceref(4) = 0.2598273346734d+01
+           xceref(5) = 0.2652795397547d+02
+         endif
+
+c---------------------------------------------------------------------
+c    reference data for 24X24X24 grids after 200 time steps, with DT = 0.8d-3
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 24) .and. 
+     >           (grid_points(2) .eq. 24) .and.
+     >           (grid_points(3) .eq. 24) .and.
+     >           (no_time_steps . eq. 200) ) then
+
+           class = 'W'
+           dtref = 0.8d-3
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+           xcrref(1) = 0.1125590409344d+03
+           xcrref(2) = 0.1180007595731d+02
+           xcrref(3) = 0.2710329767846d+02
+           xcrref(4) = 0.2469174937669d+02
+           xcrref(5) = 0.2638427874317d+03
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 0.4419655736008d+01
+           xceref(2) = 0.4638531260002d+00
+           xceref(3) = 0.1011551749967d+01
+           xceref(4) = 0.9235878729944d+00
+           xceref(5) = 0.1018045837718d+02
+         else
+           xceref(1) = 0.6729594398612d+02
+           xceref(2) = 0.5264523081690d+01
+           xceref(3) = 0.1677107142637d+02
+           xceref(4) = 0.1508721463436d+02
+           xceref(5) = 0.1477018363393d+03
+         endif
+
+
+c---------------------------------------------------------------------
+c    reference data for 64X64X64 grids after 200 time steps, with DT = 0.8d-3
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 64) .and. 
+     >           (grid_points(2) .eq. 64) .and.
+     >           (grid_points(3) .eq. 64) .and.
+     >           (no_time_steps . eq. 200) ) then
+
+           class = 'A'
+           dtref = 0.8d-3
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 1.0806346714637264d+02
+         xcrref(2) = 1.1319730901220813d+01
+         xcrref(3) = 2.5974354511582465d+01
+         xcrref(4) = 2.3665622544678910d+01
+         xcrref(5) = 2.5278963211748344d+02
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 4.2348416040525025d+00
+           xceref(2) = 4.4390282496995698d-01
+           xceref(3) = 9.6692480136345650d-01
+           xceref(4) = 8.8302063039765474d-01
+           xceref(5) = 9.7379901770829278d+00
+         else
+           xceref(1) = 0.6482218724961d+02
+           xceref(2) = 0.5066461714527d+01
+           xceref(3) = 0.1613931961359d+02
+           xceref(4) = 0.1452010201481d+02
+           xceref(5) = 0.1420099377681d+03
+         endif
+
+c---------------------------------------------------------------------
+c    reference data for 102X102X102 grids after 200 time steps,
+c    with DT = 3.0d-04
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 102) .and. 
+     >           (grid_points(2) .eq. 102) .and.
+     >           (grid_points(3) .eq. 102) .and.
+     >           (no_time_steps . eq. 200) ) then
+
+           class = 'B'
+           dtref = 3.0d-4
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 1.4233597229287254d+03
+         xcrref(2) = 9.9330522590150238d+01
+         xcrref(3) = 3.5646025644535285d+02
+         xcrref(4) = 3.2485447959084092d+02
+         xcrref(5) = 3.2707541254659363d+03
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 5.2969847140936856d+01
+           xceref(2) = 4.4632896115670668d+00
+           xceref(3) = 1.3122573342210174d+01
+           xceref(4) = 1.2006925323559144d+01
+           xceref(5) = 1.2459576151035986d+02
+         else
+           xceref(1) = 0.1477545106464d+03
+           xceref(2) = 0.1108895555053d+02
+           xceref(3) = 0.3698065590331d+02
+           xceref(4) = 0.3310505581440d+02
+           xceref(5) = 0.3157928282563d+03
+         endif
+
+c---------------------------------------------------------------------
+c    reference data for 162X162X162 grids after 200 time steps,
+c    with DT = 1.0d-04
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 162) .and. 
+     >           (grid_points(2) .eq. 162) .and.
+     >           (grid_points(3) .eq. 162) .and.
+     >           (no_time_steps . eq. 200) ) then
+
+           class = 'C'
+           dtref = 1.0d-4
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 0.62398116551764615d+04
+         xcrref(2) = 0.50793239190423964d+03
+         xcrref(3) = 0.15423530093013596d+04
+         xcrref(4) = 0.13302387929291190d+04
+         xcrref(5) = 0.11604087428436455d+05
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 0.16462008369091265d+03
+           xceref(2) = 0.11497107903824313d+02
+           xceref(3) = 0.41207446207461508d+02
+           xceref(4) = 0.37087651059694167d+02
+           xceref(5) = 0.36211053051841265d+03
+         else
+           xceref(1) = 0.2597156483475d+03
+           xceref(2) = 0.1985384289495d+02
+           xceref(3) = 0.6517950485788d+02
+           xceref(4) = 0.5757235541520d+02
+           xceref(5) = 0.5215668188726d+03
+         endif 
+
+
+c---------------------------------------------------------------------
+c    reference data for 408x408x408 grids after 250 time steps,
+c    with DT = 0.2d-04
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 408) .and. 
+     >           (grid_points(2) .eq. 408) .and.
+     >           (grid_points(3) .eq. 408) .and.
+     >           (no_time_steps . eq. 250) ) then
+
+           class = 'D'
+           dtref = 0.2d-4
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 0.2533188551738d+05
+         xcrref(2) = 0.2346393716980d+04
+         xcrref(3) = 0.6294554366904d+04
+         xcrref(4) = 0.5352565376030d+04
+         xcrref(5) = 0.3905864038618d+05
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 0.3100009377557d+03
+           xceref(2) = 0.2424086324913d+02
+           xceref(3) = 0.7782212022645d+02
+           xceref(4) = 0.6835623860116d+02
+           xceref(5) = 0.6065737200368d+03
+         else
+           xceref(1) = 0.3813781566713d+03
+           xceref(2) = 0.3160872966198d+02
+           xceref(3) = 0.9593576357290d+02
+           xceref(4) = 0.8363391989815d+02
+           xceref(5) = 0.7063466087423d+03
+         endif
+
+
+c---------------------------------------------------------------------
+c    reference data for 1020x1020x1020 grids after 250 time steps,
+c    with DT = 0.4d-05
+c---------------------------------------------------------------------
+        elseif ( (grid_points(1) .eq. 1020) .and. 
+     >           (grid_points(2) .eq. 1020) .and.
+     >           (grid_points(3) .eq. 1020) .and.
+     >           (no_time_steps . eq. 250) ) then
+
+           class = 'E'
+           dtref = 0.4d-5
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of residual.
+c---------------------------------------------------------------------
+         xcrref(1) = 0.9795372484517d+05
+         xcrref(2) = 0.9739814511521d+04
+         xcrref(3) = 0.2467606342965d+05
+         xcrref(4) = 0.2092419572860d+05
+         xcrref(5) = 0.1392138856939d+06
+
+c---------------------------------------------------------------------
+c  Reference values of RMS-norms of solution error.
+c---------------------------------------------------------------------
+
+         if (iotype .eq. 0) then
+           xceref(1) = 0.4327562208414d+03
+           xceref(2) = 0.3699051964887d+02
+           xceref(3) = 0.1089845040954d+03
+           xceref(4) = 0.9462517622043d+02
+           xceref(5) = 0.7765512765309d+03
+         else
+c  wr_interval = 5
+           xceref(1) = 0.4729898413058d+03
+           xceref(2) = 0.4145899331704d+02
+           xceref(3) = 0.1192850917138d+03
+           xceref(4) = 0.1032746026932d+03
+           xceref(5) = 0.8270322177634d+03
+c  wr_interval = 10
+c          xceref(1) = 0.4718135916251d+03
+c          xceref(2) = 0.4132620259096d+02
+c          xceref(3) = 0.1189831133503d+03
+c          xceref(4) = 0.1030212798803d+03
+c          xceref(5) = 0.8255924078458d+03
+        endif
+
+        else
+           verified = .false.
+        endif
+
+c---------------------------------------------------------------------
+c    verification test for residuals if gridsize is one of 
+c    the defined grid sizes above (class .ne. 'U')
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c    Compute the difference of solution values and the known reference 
+c    values.
+c---------------------------------------------------------------------
+        do m = 1, 5
+           
+           xcrdif(m) = dabs((xcr(m)-xcrref(m))/xcrref(m)) 
+           xcedif(m) = dabs((xce(m)-xceref(m))/xceref(m))
+           
+        enddo
+
+c---------------------------------------------------------------------
+c    Output the comparison of computed results to known cases.
+c---------------------------------------------------------------------
+
+        if (class .ne. 'U') then
+           write(*, 1990) class
+ 1990      format(' Verification being performed for class ', a)
+           write (*,2000) epsilon
+ 2000      format(' accuracy setting for epsilon = ', E20.13)
+           verified = (dabs(dt-dtref) .le. epsilon)
+           if (.not.verified) then  
+              verified = .false.
+              class = 'U'
+              write (*,1000) dtref
+ 1000         format(' DT does not match the reference value of ', 
+     >                 E15.8)
+           endif
+        else 
+           write(*, 1995)
+ 1995      format(' Unknown class')
+        endif
+
+
+        if (class .ne. 'U') then
+           write (*,2001) 
+        else
+           write (*, 2005)
+        endif
+
+ 2001   format(' Comparison of RMS-norms of residual')
+ 2005   format(' RMS-norms of residual')
+        do m = 1, 5
+           if (class .eq. 'U') then
+              write(*, 2015) m, xcr(m)
+           else if (xcrdif(m) .le. epsilon) then
+              write (*,2011) m,xcr(m),xcrref(m),xcrdif(m)
+           else 
+              verified = .false.
+              write (*,2010) m,xcr(m),xcrref(m),xcrdif(m)
+           endif
+        enddo
+
+        if (class .ne. 'U') then
+           write (*,2002)
+        else
+           write (*,2006)
+        endif
+ 2002   format(' Comparison of RMS-norms of solution error')
+ 2006   format(' RMS-norms of solution error')
+        
+        do m = 1, 5
+           if (class .eq. 'U') then
+              write(*, 2015) m, xce(m)
+           else if (xcedif(m) .le. epsilon) then
+              write (*,2011) m,xce(m),xceref(m),xcedif(m)
+           else
+              verified = .false.
+              write (*,2010) m,xce(m),xceref(m),xcedif(m)
+           endif
+        enddo
+        
+ 2010   format(' FAILURE: ', i2, E20.13, E20.13, E20.13)
+ 2011   format('          ', i2, E20.13, E20.13, E20.13)
+ 2015   format('          ', i2, E20.13)
+        
+        if (class .eq. 'U') then
+           write(*, 2022)
+           write(*, 2023)
+ 2022      format(' No reference values provided')
+ 2023      format(' No verification performed')
+        else if (verified) then
+           write(*, 2020)
+ 2020      format(' Verification Successful')
+        else
+           write(*, 2021)
+ 2021      format(' Verification failed')
+        endif
+
+        return
+
+
+        end
diff --git a/examples/smpi/NAS/BT/work_lhs.h b/examples/smpi/NAS/BT/work_lhs.h
new file mode 100644 (file)
index 0000000..d9bc9e4
--- /dev/null
@@ -0,0 +1,14 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+c
+c  work_lhs.h
+c
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      double precision fjac(5, 5, -2:MAX_CELL_DIM+1),
+     >                 njac(5, 5, -2:MAX_CELL_DIM+1),
+     >                 lhsa(5, 5, -1:MAX_CELL_DIM),
+     >                 lhsb(5, 5, -1:MAX_CELL_DIM),
+     >                 tmp1, tmp2, tmp3
+      common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3
diff --git a/examples/smpi/NAS/BT/work_lhs_vec.h b/examples/smpi/NAS/BT/work_lhs_vec.h
new file mode 100644 (file)
index 0000000..a97054f
--- /dev/null
@@ -0,0 +1,14 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+c
+c  work_lhs_vec.h
+c
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      double precision fjac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1),
+     >                 njac(5, 5, -2:MAX_CELL_DIM+1, -2:MAX_CELL_DIM+1),
+     >                 lhsa(5, 5, -1:MAX_CELL_DIM,   -1:MAX_CELL_DIM),
+     >                 lhsb(5, 5, -1:MAX_CELL_DIM,   -1:MAX_CELL_DIM),
+     >                 tmp1, tmp2, tmp3
+      common /work_lhs/ fjac, njac, lhsa, lhsb, tmp1, tmp2, tmp3
diff --git a/examples/smpi/NAS/BT/x_solve.f b/examples/smpi/NAS/BT/x_solve.f
new file mode 100644 (file)
index 0000000..5386732
--- /dev/null
@@ -0,0 +1,761 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     
+c     Performs line solves in X direction by first factoring
+c     the block-tridiagonal matrix into an upper triangular matrix, 
+c     and then performing back substitution to solve for the unknow
+c     vectors of each line.  
+c     
+c     Make sure we treat elements zero to cell_size in the direction
+c     of the sweep.
+c     
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+      integer  c, istart, stage,
+     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+     >     isize,jsize,ksize,send_id
+
+      istart = 0
+
+c---------------------------------------------------------------------
+c     in our terminology stage is the number of the cell in the x-direction
+c     i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+      do stage = 1,ncells
+         c = slice(1,stage)
+         isize = cell_size(1,c) - 1
+         jsize = cell_size(2,c) - 1
+         ksize = cell_size(3,c) - 1
+         
+c---------------------------------------------------------------------
+c     set last-cell flag
+c---------------------------------------------------------------------
+         if (stage .eq. ncells) then
+            last = 1
+         else
+            last = 0
+         endif
+
+         if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c     This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+            first = 1
+c            call lhsx(c)
+            call x_solve_cell(first,last,c)
+         else
+c---------------------------------------------------------------------
+c     Not the first cell of this line, so receive info from
+c     processor working on preceeding cell
+c---------------------------------------------------------------------
+            first = 0
+            call x_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c     overlap computations and communications
+c---------------------------------------------------------------------
+c            call lhsx(c)
+c---------------------------------------------------------------------
+c     wait for completion
+c---------------------------------------------------------------------
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c     install C'(istart) and rhs'(istart) to be used in this cell
+c---------------------------------------------------------------------
+            call x_unpack_solve_info(c)
+            call x_solve_cell(first,last,c)
+         endif
+
+         if (last .eq. 0) call x_send_solve_info(send_id,c)
+      enddo
+
+c---------------------------------------------------------------------
+c     now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+      do stage = ncells, 1, -1
+         c = slice(1,stage)
+         first = 0
+         last = 0
+         if (stage .eq. 1) first = 1
+         if (stage .eq. ncells) then
+            last = 1
+c---------------------------------------------------------------------
+c     last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+            call x_backsubstitute(first, last,c)
+         else
+            call x_receive_backsub_info(recv_id,c)
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+            call x_unpack_backsub_info(c)
+            call x_backsubstitute(first,last,c)
+         endif
+         if (first .eq. 0) call x_send_backsub_info(send_id,c)
+      enddo
+
+
+      return
+      end
+      
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack C'(-1) and rhs'(-1) for
+c     all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      integer j,k,m,n,ptr,c,istart 
+
+      istart = 0
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               rhs(n,istart-1,j,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine x_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send C'(iend) and rhs'(iend) for
+c     all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer j,k,m,n,isize,ptr,c,jp,kp
+      integer error,send_id,buffer_size 
+
+      isize = cell_size(1,c)-1
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c     pack up buffer
+c---------------------------------------------------------------------
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,isize,j,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     send buffer 
+c---------------------------------------------------------------------
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, successor(1),
+     >     WEST+jp+kp*NCELLS, comm_solve,
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send U(istart) for all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer j,k,n,ptr,c,istart,jp,kp
+      integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c     Send element 0 to previous processor
+c---------------------------------------------------------------------
+      istart = 0
+      jp = cell_coord(2,c)-1
+      kp = cell_coord(3,c)-1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,istart,j,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, predecessor(1), 
+     >     EAST+jp+kp*NCELLS, comm_solve, 
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack U(isize) for all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      integer j,k,n,ptr,c
+
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do n=1,BLOCK_SIZE
+               backsub_info(n,j,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer error,recv_id,jp,kp,c,buffer_size
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, successor(1), 
+     >     EAST+jp+kp*NCELLS, comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer jp,kp,recv_id,error,c,buffer_size
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, predecessor(1), 
+     >     WEST+jp+kp*NCELLS,  comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine x_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     back solve: if last cell, then generate U(isize)=rhs(isize)
+c     else assume U(isize) is loaded in un pack backsub_info
+c     so just use it
+c     after call u(istart) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer first, last, c, i, j, k
+      integer m,n,isize,jsize,ksize,istart
+      
+      istart = 0
+      isize = cell_size(1,c)-1
+      jsize = cell_size(2,c)-end(2,c)-1      
+      ksize = cell_size(3,c)-end(3,c)-1
+      if (last .eq. 0) then
+         do k=start(3,c),ksize
+            do j=start(2,c),jsize
+c---------------------------------------------------------------------
+c     U(isize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
+     >                    - lhsc(m,n,isize,j,k,c)*
+     >                    backsub_info(n,j,k,c)
+c---------------------------------------------------------------------
+c     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
+c     $                    - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c)
+c---------------------------------------------------------------------
+                  enddo
+               enddo
+            enddo
+         enddo
+      endif
+      do k=start(3,c),ksize
+         do j=start(2,c),jsize
+            do i=isize-1,istart,-1
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
+     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     performs guaussian elimination on this cell.
+c     
+c     assumes that unpacking routines for non-first cells 
+c     preload C' and rhs' from previous cell.
+c     
+c     assumed send happens outside this routine, but that
+c     c'(IMAX) and rhs'(IMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'work_lhs.h'
+
+      integer first,last,c
+      integer i,j,k,isize,ksize,jsize,istart
+
+      istart = 0
+      isize = cell_size(1,c)-1
+      jsize = cell_size(2,c)-end(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+
+      call lhsabinit(lhsa, lhsb, isize)
+
+      do k=start(3,c),ksize 
+         do j=start(2,c),jsize
+
+c---------------------------------------------------------------------
+c     This function computes the left hand side in the xi-direction
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+            do i = start(1,c)-1, cell_size(1,c) - end(1,c)
+
+               tmp1 = rho_i(i,j,k,c)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+               fjac(1,1,i) = 0.0d+00
+               fjac(1,2,i) = 1.0d+00
+               fjac(1,3,i) = 0.0d+00
+               fjac(1,4,i) = 0.0d+00
+               fjac(1,5,i) = 0.0d+00
+
+               fjac(2,1,i) = -(u(2,i,j,k,c) * tmp2 * 
+     >              u(2,i,j,k,c))
+     >              + c2 * qs(i,j,k,c)
+               fjac(2,2,i) = ( 2.0d+00 - c2 )
+     >              * ( u(2,i,j,k,c) * tmp1 )
+               fjac(2,3,i) = - c2 * ( u(3,i,j,k,c) * tmp1 )
+               fjac(2,4,i) = - c2 * ( u(4,i,j,k,c) * tmp1 )
+               fjac(2,5,i) = c2
+
+               fjac(3,1,i) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2
+               fjac(3,2,i) = u(3,i,j,k,c) * tmp1
+               fjac(3,3,i) = u(2,i,j,k,c) * tmp1
+               fjac(3,4,i) = 0.0d+00
+               fjac(3,5,i) = 0.0d+00
+
+               fjac(4,1,i) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2
+               fjac(4,2,i) = u(4,i,j,k,c) * tmp1
+               fjac(4,3,i) = 0.0d+00
+               fjac(4,4,i) = u(2,i,j,k,c) * tmp1
+               fjac(4,5,i) = 0.0d+00
+
+               fjac(5,1,i) = ( c2 * 2.0d0 * qs(i,j,k,c)
+     >              - c1 * ( u(5,i,j,k,c) * tmp1 ) )
+     >              * ( u(2,i,j,k,c) * tmp1 )
+               fjac(5,2,i) = c1 *  u(5,i,j,k,c) * tmp1 
+     >              - c2
+     >              * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2
+     >              + qs(i,j,k,c) )
+               fjac(5,3,i) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) )
+     >              * tmp2
+               fjac(5,4,i) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) )
+     >              * tmp2
+               fjac(5,5,i) = c1 * ( u(2,i,j,k,c) * tmp1 )
+
+               njac(1,1,i) = 0.0d+00
+               njac(1,2,i) = 0.0d+00
+               njac(1,3,i) = 0.0d+00
+               njac(1,4,i) = 0.0d+00
+               njac(1,5,i) = 0.0d+00
+
+               njac(2,1,i) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c)
+               njac(2,2,i) =   con43 * c3c4 * tmp1
+               njac(2,3,i) =   0.0d+00
+               njac(2,4,i) =   0.0d+00
+               njac(2,5,i) =   0.0d+00
+
+               njac(3,1,i) = - c3c4 * tmp2 * u(3,i,j,k,c)
+               njac(3,2,i) =   0.0d+00
+               njac(3,3,i) =   c3c4 * tmp1
+               njac(3,4,i) =   0.0d+00
+               njac(3,5,i) =   0.0d+00
+
+               njac(4,1,i) = - c3c4 * tmp2 * u(4,i,j,k,c)
+               njac(4,2,i) =   0.0d+00 
+               njac(4,3,i) =   0.0d+00
+               njac(4,4,i) =   c3c4 * tmp1
+               njac(4,5,i) =   0.0d+00
+
+               njac(5,1,i) = - ( con43 * c3c4
+     >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
+     >              - c1345 * tmp2 * u(5,i,j,k,c)
+
+               njac(5,2,i) = ( con43 * c3c4
+     >              - c1345 ) * tmp2 * u(2,i,j,k,c)
+               njac(5,3,i) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
+               njac(5,4,i) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
+               njac(5,5,i) = ( c1345 ) * tmp1
+
+            enddo
+c---------------------------------------------------------------------
+c     now jacobians set, so form left hand side in x direction
+c---------------------------------------------------------------------
+            do i = start(1,c), isize - end(1,c)
+
+               tmp1 = dt * tx1
+               tmp2 = dt * tx2
+
+               lhsa(1,1,i) = - tmp2 * fjac(1,1,i-1)
+     >              - tmp1 * njac(1,1,i-1)
+     >              - tmp1 * dx1 
+               lhsa(1,2,i) = - tmp2 * fjac(1,2,i-1)
+     >              - tmp1 * njac(1,2,i-1)
+               lhsa(1,3,i) = - tmp2 * fjac(1,3,i-1)
+     >              - tmp1 * njac(1,3,i-1)
+               lhsa(1,4,i) = - tmp2 * fjac(1,4,i-1)
+     >              - tmp1 * njac(1,4,i-1)
+               lhsa(1,5,i) = - tmp2 * fjac(1,5,i-1)
+     >              - tmp1 * njac(1,5,i-1)
+
+               lhsa(2,1,i) = - tmp2 * fjac(2,1,i-1)
+     >              - tmp1 * njac(2,1,i-1)
+               lhsa(2,2,i) = - tmp2 * fjac(2,2,i-1)
+     >              - tmp1 * njac(2,2,i-1)
+     >              - tmp1 * dx2
+               lhsa(2,3,i) = - tmp2 * fjac(2,3,i-1)
+     >              - tmp1 * njac(2,3,i-1)
+               lhsa(2,4,i) = - tmp2 * fjac(2,4,i-1)
+     >              - tmp1 * njac(2,4,i-1)
+               lhsa(2,5,i) = - tmp2 * fjac(2,5,i-1)
+     >              - tmp1 * njac(2,5,i-1)
+
+               lhsa(3,1,i) = - tmp2 * fjac(3,1,i-1)
+     >              - tmp1 * njac(3,1,i-1)
+               lhsa(3,2,i) = - tmp2 * fjac(3,2,i-1)
+     >              - tmp1 * njac(3,2,i-1)
+               lhsa(3,3,i) = - tmp2 * fjac(3,3,i-1)
+     >              - tmp1 * njac(3,3,i-1)
+     >              - tmp1 * dx3 
+               lhsa(3,4,i) = - tmp2 * fjac(3,4,i-1)
+     >              - tmp1 * njac(3,4,i-1)
+               lhsa(3,5,i) = - tmp2 * fjac(3,5,i-1)
+     >              - tmp1 * njac(3,5,i-1)
+
+               lhsa(4,1,i) = - tmp2 * fjac(4,1,i-1)
+     >              - tmp1 * njac(4,1,i-1)
+               lhsa(4,2,i) = - tmp2 * fjac(4,2,i-1)
+     >              - tmp1 * njac(4,2,i-1)
+               lhsa(4,3,i) = - tmp2 * fjac(4,3,i-1)
+     >              - tmp1 * njac(4,3,i-1)
+               lhsa(4,4,i) = - tmp2 * fjac(4,4,i-1)
+     >              - tmp1 * njac(4,4,i-1)
+     >              - tmp1 * dx4
+               lhsa(4,5,i) = - tmp2 * fjac(4,5,i-1)
+     >              - tmp1 * njac(4,5,i-1)
+
+               lhsa(5,1,i) = - tmp2 * fjac(5,1,i-1)
+     >              - tmp1 * njac(5,1,i-1)
+               lhsa(5,2,i) = - tmp2 * fjac(5,2,i-1)
+     >              - tmp1 * njac(5,2,i-1)
+               lhsa(5,3,i) = - tmp2 * fjac(5,3,i-1)
+     >              - tmp1 * njac(5,3,i-1)
+               lhsa(5,4,i) = - tmp2 * fjac(5,4,i-1)
+     >              - tmp1 * njac(5,4,i-1)
+               lhsa(5,5,i) = - tmp2 * fjac(5,5,i-1)
+     >              - tmp1 * njac(5,5,i-1)
+     >              - tmp1 * dx5
+
+               lhsb(1,1,i) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(1,1,i)
+     >              + tmp1 * 2.0d+00 * dx1
+               lhsb(1,2,i) = tmp1 * 2.0d+00 * njac(1,2,i)
+               lhsb(1,3,i) = tmp1 * 2.0d+00 * njac(1,3,i)
+               lhsb(1,4,i) = tmp1 * 2.0d+00 * njac(1,4,i)
+               lhsb(1,5,i) = tmp1 * 2.0d+00 * njac(1,5,i)
+
+               lhsb(2,1,i) = tmp1 * 2.0d+00 * njac(2,1,i)
+               lhsb(2,2,i) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(2,2,i)
+     >              + tmp1 * 2.0d+00 * dx2
+               lhsb(2,3,i) = tmp1 * 2.0d+00 * njac(2,3,i)
+               lhsb(2,4,i) = tmp1 * 2.0d+00 * njac(2,4,i)
+               lhsb(2,5,i) = tmp1 * 2.0d+00 * njac(2,5,i)
+
+               lhsb(3,1,i) = tmp1 * 2.0d+00 * njac(3,1,i)
+               lhsb(3,2,i) = tmp1 * 2.0d+00 * njac(3,2,i)
+               lhsb(3,3,i) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(3,3,i)
+     >              + tmp1 * 2.0d+00 * dx3
+               lhsb(3,4,i) = tmp1 * 2.0d+00 * njac(3,4,i)
+               lhsb(3,5,i) = tmp1 * 2.0d+00 * njac(3,5,i)
+
+               lhsb(4,1,i) = tmp1 * 2.0d+00 * njac(4,1,i)
+               lhsb(4,2,i) = tmp1 * 2.0d+00 * njac(4,2,i)
+               lhsb(4,3,i) = tmp1 * 2.0d+00 * njac(4,3,i)
+               lhsb(4,4,i) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(4,4,i)
+     >              + tmp1 * 2.0d+00 * dx4
+               lhsb(4,5,i) = tmp1 * 2.0d+00 * njac(4,5,i)
+
+               lhsb(5,1,i) = tmp1 * 2.0d+00 * njac(5,1,i)
+               lhsb(5,2,i) = tmp1 * 2.0d+00 * njac(5,2,i)
+               lhsb(5,3,i) = tmp1 * 2.0d+00 * njac(5,3,i)
+               lhsb(5,4,i) = tmp1 * 2.0d+00 * njac(5,4,i)
+               lhsb(5,5,i) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(5,5,i)
+     >              + tmp1 * 2.0d+00 * dx5
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i+1)
+     >              - tmp1 * njac(1,1,i+1)
+     >              - tmp1 * dx1
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i+1)
+     >              - tmp1 * njac(1,2,i+1)
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i+1)
+     >              - tmp1 * njac(1,3,i+1)
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i+1)
+     >              - tmp1 * njac(1,4,i+1)
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i+1)
+     >              - tmp1 * njac(1,5,i+1)
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i+1)
+     >              - tmp1 * njac(2,1,i+1)
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i+1)
+     >              - tmp1 * njac(2,2,i+1)
+     >              - tmp1 * dx2
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i+1)
+     >              - tmp1 * njac(2,3,i+1)
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i+1)
+     >              - tmp1 * njac(2,4,i+1)
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i+1)
+     >              - tmp1 * njac(2,5,i+1)
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i+1)
+     >              - tmp1 * njac(3,1,i+1)
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i+1)
+     >              - tmp1 * njac(3,2,i+1)
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i+1)
+     >              - tmp1 * njac(3,3,i+1)
+     >              - tmp1 * dx3
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i+1)
+     >              - tmp1 * njac(3,4,i+1)
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i+1)
+     >              - tmp1 * njac(3,5,i+1)
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i+1)
+     >              - tmp1 * njac(4,1,i+1)
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i+1)
+     >              - tmp1 * njac(4,2,i+1)
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i+1)
+     >              - tmp1 * njac(4,3,i+1)
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i+1)
+     >              - tmp1 * njac(4,4,i+1)
+     >              - tmp1 * dx4
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i+1)
+     >              - tmp1 * njac(4,5,i+1)
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i+1)
+     >              - tmp1 * njac(5,1,i+1)
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i+1)
+     >              - tmp1 * njac(5,2,i+1)
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i+1)
+     >              - tmp1 * njac(5,3,i+1)
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i+1)
+     >              - tmp1 * njac(5,4,i+1)
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i+1)
+     >              - tmp1 * njac(5,5,i+1)
+     >              - tmp1 * dx5
+
+            enddo
+
+
+c---------------------------------------------------------------------
+c     outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+            if (first .eq. 1) then 
+
+c---------------------------------------------------------------------
+c     multiply c(istart,j,k) by b_inverse and copy back to c
+c     multiply rhs(istart) by b_inverse(istart) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,istart),
+     >                        lhsc(1,1,istart,j,k,c),
+     >                        rhs(1,istart,j,k,c) )
+
+            endif
+
+c---------------------------------------------------------------------
+c     begin inner most do loop
+c     do all the elements of the cell unless last 
+c---------------------------------------------------------------------
+            do i=istart+first,isize-last
+
+c---------------------------------------------------------------------
+c     rhs(i) = rhs(i) - A*rhs(i-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,i),
+     >                         rhs(1,i-1,j,k,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(i) = B(i) - C(i-1)*A(i)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,i),
+     >                         lhsc(1,1,i-1,j,k,c),
+     >                         lhsb(1,1,i))
+
+
+c---------------------------------------------------------------------
+c     multiply c(i,j,k) by b_inverse and copy back to c
+c     multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,i),
+     >                        lhsc(1,1,i,j,k,c),
+     >                        rhs(1,i,j,k,c) )
+
+            enddo
+
+c---------------------------------------------------------------------
+c     Now finish up special cases for last cell
+c---------------------------------------------------------------------
+            if (last .eq. 1) then
+
+c---------------------------------------------------------------------
+c     rhs(isize) = rhs(isize) - A*rhs(isize-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,isize),
+     >                         rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(isize) = B(isize) - C(isize-1)*A(isize)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,isize),
+     >                         lhsc(1,1,isize-1,j,k,c),
+     >                         lhsb(1,1,isize))
+
+c---------------------------------------------------------------------
+c     multiply rhs() by b_inverse() and copy to rhs
+c---------------------------------------------------------------------
+               call binvrhs( lhsb(1,1,isize),
+     >                       rhs(1,isize,j,k,c) )
+
+            endif
+         enddo
+      enddo
+
+
+      return
+      end
+      
diff --git a/examples/smpi/NAS/BT/x_solve_vec.f b/examples/smpi/NAS/BT/x_solve_vec.f
new file mode 100644 (file)
index 0000000..8f1c137
--- /dev/null
@@ -0,0 +1,789 @@
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     
+c     Performs line solves in X direction by first factoring
+c     the block-tridiagonal matrix into an upper triangular matrix, 
+c     and then performing back substitution to solve for the unknow
+c     vectors of each line.  
+c     
+c     Make sure we treat elements zero to cell_size in the direction
+c     of the sweep.
+c     
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+      integer  c, istart, stage,
+     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+     >     isize,jsize,ksize,send_id
+
+      istart = 0
+
+c---------------------------------------------------------------------
+c     in our terminology stage is the number of the cell in the x-direct
+c     i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+      do stage = 1,ncells
+         c = slice(1,stage)
+         isize = cell_size(1,c) - 1
+         jsize = cell_size(2,c) - 1
+         ksize = cell_size(3,c) - 1
+         
+c---------------------------------------------------------------------
+c     set last-cell flag
+c---------------------------------------------------------------------
+         if (stage .eq. ncells) then
+            last = 1
+         else
+            last = 0
+         endif
+
+         if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c     This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+            first = 1
+c            call lhsx(c)
+            call x_solve_cell(first,last,c)
+         else
+c---------------------------------------------------------------------
+c     Not the first cell of this line, so receive info from
+c     processor working on preceeding cell
+c---------------------------------------------------------------------
+            first = 0
+            call x_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c     overlap computations and communications
+c---------------------------------------------------------------------
+c            call lhsx(c)
+c---------------------------------------------------------------------
+c     wait for completion
+c---------------------------------------------------------------------
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c     install C'(istart) and rhs'(istart) to be used in this cell
+c---------------------------------------------------------------------
+            call x_unpack_solve_info(c)
+            call x_solve_cell(first,last,c)
+         endif
+
+         if (last .eq. 0) call x_send_solve_info(send_id,c)
+      enddo
+
+c---------------------------------------------------------------------
+c     now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+      do stage = ncells, 1, -1
+         c = slice(1,stage)
+         first = 0
+         last = 0
+         if (stage .eq. 1) first = 1
+         if (stage .eq. ncells) then
+            last = 1
+c---------------------------------------------------------------------
+c     last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+            call x_backsubstitute(first, last,c)
+         else
+            call x_receive_backsub_info(recv_id,c)
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+            call x_unpack_backsub_info(c)
+            call x_backsubstitute(first,last,c)
+         endif
+         if (first .eq. 0) call x_send_backsub_info(send_id,c)
+      enddo
+
+
+      return
+      end
+      
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack C'(-1) and rhs'(-1) for
+c     all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      integer j,k,m,n,ptr,c,istart 
+
+      istart = 0
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  lhsc(m,n,istart-1,j,k,c) = out_buffer(ptr+n)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               rhs(n,istart-1,j,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine x_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send C'(iend) and rhs'(iend) for
+c     all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer j,k,m,n,isize,ptr,c,jp,kp
+      integer error,send_id,buffer_size 
+
+      isize = cell_size(1,c)-1
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c     pack up buffer
+c---------------------------------------------------------------------
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  in_buffer(ptr+n) = lhsc(m,n,isize,j,k,c)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,isize,j,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     send buffer 
+c---------------------------------------------------------------------
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, successor(1),
+     >     WEST+jp+kp*NCELLS, comm_solve,
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send U(istart) for all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer j,k,n,ptr,c,istart,jp,kp
+      integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c     Send element 0 to previous processor
+c---------------------------------------------------------------------
+      istart = 0
+      jp = cell_coord(2,c)-1
+      kp = cell_coord(3,c)-1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,istart,j,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, predecessor(1), 
+     >     EAST+jp+kp*NCELLS, comm_solve, 
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack U(isize) for all j and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      integer j,k,n,ptr,c
+
+      ptr = 0
+      do k=0,KMAX-1
+         do j=0,JMAX-1
+            do n=1,BLOCK_SIZE
+               backsub_info(n,j,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer error,recv_id,jp,kp,c,buffer_size
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, successor(1), 
+     >     EAST+jp+kp*NCELLS, comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer jp,kp,recv_id,error,c,buffer_size
+      jp = cell_coord(2,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, predecessor(1), 
+     >     WEST+jp+kp*NCELLS,  comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine x_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     back solve: if last cell, then generate U(isize)=rhs(isize)
+c     else assume U(isize) is loaded in un pack backsub_info
+c     so just use it
+c     after call u(istart) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer first, last, c, i, j, k
+      integer m,n,isize,jsize,ksize,istart
+      
+      istart = 0
+      isize = cell_size(1,c)-1
+      jsize = cell_size(2,c)-end(2,c)-1      
+      ksize = cell_size(3,c)-end(3,c)-1
+      if (last .eq. 0) then
+         do k=start(3,c),ksize
+            do j=start(2,c),jsize
+c---------------------------------------------------------------------
+c     U(isize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
+     >                    - lhsc(m,n,isize,j,k,c)*
+     >                    backsub_info(n,j,k,c)
+c---------------------------------------------------------------------
+c     rhs(m,isize,j,k,c) = rhs(m,isize,j,k,c) 
+c     $                    - lhsc(m,n,isize,j,k,c)*rhs(n,isize+1,j,k,c)
+c---------------------------------------------------------------------
+                  enddo
+               enddo
+            enddo
+         enddo
+      endif
+      do k=start(3,c),ksize
+         do j=start(2,c),jsize
+            do i=isize-1,istart,-1
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
+     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i+1,j,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
+
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine x_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     performs guaussian elimination on this cell.
+c     
+c     assumes that unpacking routines for non-first cells 
+c     preload C' and rhs' from previous cell.
+c     
+c     assumed send happens outside this routine, but that
+c     c'(IMAX) and rhs'(IMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'work_lhs_vec.h'
+
+      integer first,last,c
+      integer i,j,k,m,n,isize,ksize,jsize,istart
+
+      istart = 0
+      isize = cell_size(1,c)-1
+      jsize = cell_size(2,c)-end(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+
+c---------------------------------------------------------------------
+c     zero the left hand side for starters
+c     set diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+      do j = 0, jsize
+         do m = 1, 5
+            do n = 1, 5
+               lhsa(m,n,0,j) = 0.0d0
+               lhsb(m,n,0,j) = 0.0d0
+               lhsa(m,n,isize,j) = 0.0d0
+               lhsb(m,n,isize,j) = 0.0d0
+            enddo
+            lhsb(m,m,0,j) = 1.0d0
+            lhsb(m,m,isize,j) = 1.0d0
+         enddo
+      enddo
+
+      do k=start(3,c),ksize 
+
+c---------------------------------------------------------------------
+c     This function computes the left hand side in the xi-direction
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+         do j=start(2,c),jsize
+            do i = start(1,c)-1, cell_size(1,c) - end(1,c)
+
+               tmp1 = rho_i(i,j,k,c)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+c---------------------------------------------------------------------
+c     
+c---------------------------------------------------------------------
+               fjac(1,1,i,j) = 0.0d+00
+               fjac(1,2,i,j) = 1.0d+00
+               fjac(1,3,i,j) = 0.0d+00
+               fjac(1,4,i,j) = 0.0d+00
+               fjac(1,5,i,j) = 0.0d+00
+
+               fjac(2,1,i,j) = -(u(2,i,j,k,c) * tmp2 * 
+     >              u(2,i,j,k,c))
+     >              + c2 * qs(i,j,k,c)
+               fjac(2,2,i,j) = ( 2.0d+00 - c2 )
+     >              * ( u(2,i,j,k,c) * tmp1 )
+               fjac(2,3,i,j) = - c2 * ( u(3,i,j,k,c) * tmp1 )
+               fjac(2,4,i,j) = - c2 * ( u(4,i,j,k,c) * tmp1 )
+               fjac(2,5,i,j) = c2
+
+               fjac(3,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) ) * tmp2
+               fjac(3,2,i,j) = u(3,i,j,k,c) * tmp1
+               fjac(3,3,i,j) = u(2,i,j,k,c) * tmp1
+               fjac(3,4,i,j) = 0.0d+00
+               fjac(3,5,i,j) = 0.0d+00
+
+               fjac(4,1,i,j) = - ( u(2,i,j,k,c)*u(4,i,j,k,c) ) * tmp2
+               fjac(4,2,i,j) = u(4,i,j,k,c) * tmp1
+               fjac(4,3,i,j) = 0.0d+00
+               fjac(4,4,i,j) = u(2,i,j,k,c) * tmp1
+               fjac(4,5,i,j) = 0.0d+00
+
+               fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c)
+     >              - c1 * ( u(5,i,j,k,c) * tmp1 ) )
+     >              * ( u(2,i,j,k,c) * tmp1 )
+               fjac(5,2,i,j) = c1 *  u(5,i,j,k,c) * tmp1 
+     >              - c2
+     >              * ( u(2,i,j,k,c)*u(2,i,j,k,c) * tmp2
+     >              + qs(i,j,k,c) )
+               fjac(5,3,i,j) = - c2 * ( u(3,i,j,k,c)*u(2,i,j,k,c) )
+     >              * tmp2
+               fjac(5,4,i,j) = - c2 * ( u(4,i,j,k,c)*u(2,i,j,k,c) )
+     >              * tmp2
+               fjac(5,5,i,j) = c1 * ( u(2,i,j,k,c) * tmp1 )
+
+               njac(1,1,i,j) = 0.0d+00
+               njac(1,2,i,j) = 0.0d+00
+               njac(1,3,i,j) = 0.0d+00
+               njac(1,4,i,j) = 0.0d+00
+               njac(1,5,i,j) = 0.0d+00
+
+               njac(2,1,i,j) = - con43 * c3c4 * tmp2 * u(2,i,j,k,c)
+               njac(2,2,i,j) =   con43 * c3c4 * tmp1
+               njac(2,3,i,j) =   0.0d+00
+               njac(2,4,i,j) =   0.0d+00
+               njac(2,5,i,j) =   0.0d+00
+
+               njac(3,1,i,j) = - c3c4 * tmp2 * u(3,i,j,k,c)
+               njac(3,2,i,j) =   0.0d+00
+               njac(3,3,i,j) =   c3c4 * tmp1
+               njac(3,4,i,j) =   0.0d+00
+               njac(3,5,i,j) =   0.0d+00
+
+               njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c)
+               njac(4,2,i,j) =   0.0d+00 
+               njac(4,3,i,j) =   0.0d+00
+               njac(4,4,i,j) =   c3c4 * tmp1
+               njac(4,5,i,j) =   0.0d+00
+
+               njac(5,1,i,j) = - ( con43 * c3c4
+     >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
+     >              - c1345 * tmp2 * u(5,i,j,k,c)
+
+               njac(5,2,i,j) = ( con43 * c3c4
+     >              - c1345 ) * tmp2 * u(2,i,j,k,c)
+               njac(5,3,i,j) = ( c3c4 - c1345 ) * tmp2 * u(3,i,j,k,c)
+               njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
+               njac(5,5,i,j) = ( c1345 ) * tmp1
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     now jacobians set, so form left hand side in x direction
+c---------------------------------------------------------------------
+         do j=start(2,c),jsize
+            do i = start(1,c), isize - end(1,c)
+
+               tmp1 = dt * tx1
+               tmp2 = dt * tx2
+
+               lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i-1,j)
+     >              - tmp1 * njac(1,1,i-1,j)
+     >              - tmp1 * dx1 
+               lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i-1,j)
+     >              - tmp1 * njac(1,2,i-1,j)
+               lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i-1,j)
+     >              - tmp1 * njac(1,3,i-1,j)
+               lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i-1,j)
+     >              - tmp1 * njac(1,4,i-1,j)
+               lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i-1,j)
+     >              - tmp1 * njac(1,5,i-1,j)
+
+               lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i-1,j)
+     >              - tmp1 * njac(2,1,i-1,j)
+               lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i-1,j)
+     >              - tmp1 * njac(2,2,i-1,j)
+     >              - tmp1 * dx2
+               lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i-1,j)
+     >              - tmp1 * njac(2,3,i-1,j)
+               lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i-1,j)
+     >              - tmp1 * njac(2,4,i-1,j)
+               lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i-1,j)
+     >              - tmp1 * njac(2,5,i-1,j)
+
+               lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i-1,j)
+     >              - tmp1 * njac(3,1,i-1,j)
+               lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i-1,j)
+     >              - tmp1 * njac(3,2,i-1,j)
+               lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i-1,j)
+     >              - tmp1 * njac(3,3,i-1,j)
+     >              - tmp1 * dx3 
+               lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i-1,j)
+     >              - tmp1 * njac(3,4,i-1,j)
+               lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i-1,j)
+     >              - tmp1 * njac(3,5,i-1,j)
+
+               lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i-1,j)
+     >              - tmp1 * njac(4,1,i-1,j)
+               lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i-1,j)
+     >              - tmp1 * njac(4,2,i-1,j)
+               lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i-1,j)
+     >              - tmp1 * njac(4,3,i-1,j)
+               lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i-1,j)
+     >              - tmp1 * njac(4,4,i-1,j)
+     >              - tmp1 * dx4
+               lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i-1,j)
+     >              - tmp1 * njac(4,5,i-1,j)
+
+               lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i-1,j)
+     >              - tmp1 * njac(5,1,i-1,j)
+               lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i-1,j)
+     >              - tmp1 * njac(5,2,i-1,j)
+               lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i-1,j)
+     >              - tmp1 * njac(5,3,i-1,j)
+               lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i-1,j)
+     >              - tmp1 * njac(5,4,i-1,j)
+               lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i-1,j)
+     >              - tmp1 * njac(5,5,i-1,j)
+     >              - tmp1 * dx5
+
+               lhsb(1,1,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(1,1,i,j)
+     >              + tmp1 * 2.0d+00 * dx1
+               lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j)
+               lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j)
+               lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j)
+               lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j)
+
+               lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j)
+               lhsb(2,2,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(2,2,i,j)
+     >              + tmp1 * 2.0d+00 * dx2
+               lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j)
+               lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j)
+               lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j)
+
+               lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j)
+               lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j)
+               lhsb(3,3,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(3,3,i,j)
+     >              + tmp1 * 2.0d+00 * dx3
+               lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j)
+               lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j)
+
+               lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j)
+               lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j)
+               lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j)
+               lhsb(4,4,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(4,4,i,j)
+     >              + tmp1 * 2.0d+00 * dx4
+               lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j)
+
+               lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j)
+               lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j)
+               lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j)
+               lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j)
+               lhsb(5,5,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(5,5,i,j)
+     >              + tmp1 * 2.0d+00 * dx5
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i+1,j)
+     >              - tmp1 * njac(1,1,i+1,j)
+     >              - tmp1 * dx1
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i+1,j)
+     >              - tmp1 * njac(1,2,i+1,j)
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i+1,j)
+     >              - tmp1 * njac(1,3,i+1,j)
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i+1,j)
+     >              - tmp1 * njac(1,4,i+1,j)
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i+1,j)
+     >              - tmp1 * njac(1,5,i+1,j)
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i+1,j)
+     >              - tmp1 * njac(2,1,i+1,j)
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i+1,j)
+     >              - tmp1 * njac(2,2,i+1,j)
+     >              - tmp1 * dx2
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i+1,j)
+     >              - tmp1 * njac(2,3,i+1,j)
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i+1,j)
+     >              - tmp1 * njac(2,4,i+1,j)
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i+1,j)
+     >              - tmp1 * njac(2,5,i+1,j)
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i+1,j)
+     >              - tmp1 * njac(3,1,i+1,j)
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,i+1,j)
+     >              - tmp1 * njac(3,2,i+1,j)
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,i+1,j)
+     >              - tmp1 * njac(3,3,i+1,j)
+     >              - tmp1 * dx3
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,i+1,j)
+     >              - tmp1 * njac(3,4,i+1,j)
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,i+1,j)
+     >              - tmp1 * njac(3,5,i+1,j)
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,i+1,j)
+     >              - tmp1 * njac(4,1,i+1,j)
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,i+1,j)
+     >              - tmp1 * njac(4,2,i+1,j)
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,i+1,j)
+     >              - tmp1 * njac(4,3,i+1,j)
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,i+1,j)
+     >              - tmp1 * njac(4,4,i+1,j)
+     >              - tmp1 * dx4
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,i+1,j)
+     >              - tmp1 * njac(4,5,i+1,j)
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,i+1,j)
+     >              - tmp1 * njac(5,1,i+1,j)
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,i+1,j)
+     >              - tmp1 * njac(5,2,i+1,j)
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,i+1,j)
+     >              - tmp1 * njac(5,3,i+1,j)
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,i+1,j)
+     >              - tmp1 * njac(5,4,i+1,j)
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,i+1,j)
+     >              - tmp1 * njac(5,5,i+1,j)
+     >              - tmp1 * dx5
+
+            enddo
+         enddo
+
+
+c---------------------------------------------------------------------
+c     outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+         if (first .eq. 1) then 
+
+c---------------------------------------------------------------------
+c     multiply c(istart,j,k) by b_inverse and copy back to c
+c     multiply rhs(istart) by b_inverse(istart) and copy to rhs
+c---------------------------------------------------------------------
+!dir$ ivdep
+            do j=start(2,c),jsize
+               call binvcrhs( lhsb(1,1,istart,j),
+     >                        lhsc(1,1,istart,j,k,c),
+     >                        rhs(1,istart,j,k,c) )
+            enddo
+
+         endif
+
+c---------------------------------------------------------------------
+c     begin inner most do loop
+c     do all the elements of the cell unless last 
+c---------------------------------------------------------------------
+!dir$ ivdep
+!dir$ interchange(i,j)
+         do j=start(2,c),jsize
+            do i=istart+first,isize-last
+
+c---------------------------------------------------------------------
+c     rhs(i) = rhs(i) - A*rhs(i-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,i,j),
+     >                         rhs(1,i-1,j,k,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(i) = B(i) - C(i-1)*A(i)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,i,j),
+     >                         lhsc(1,1,i-1,j,k,c),
+     >                         lhsb(1,1,i,j))
+
+
+c---------------------------------------------------------------------
+c     multiply c(i,j,k) by b_inverse and copy back to c
+c     multiply rhs(1,j,k) by b_inverse(1,j,k) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,i,j),
+     >                        lhsc(1,1,i,j,k,c),
+     >                        rhs(1,i,j,k,c) )
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     Now finish up special cases for last cell
+c---------------------------------------------------------------------
+         if (last .eq. 1) then
+
+!dir$ ivdep
+            do j=start(2,c),jsize
+c---------------------------------------------------------------------
+c     rhs(isize) = rhs(isize) - A*rhs(isize-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,isize,j),
+     >                         rhs(1,isize-1,j,k,c),rhs(1,isize,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(isize) = B(isize) - C(isize-1)*A(isize)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,isize,j),
+     >                         lhsc(1,1,isize-1,j,k,c),
+     >                         lhsb(1,1,isize,j))
+
+c---------------------------------------------------------------------
+c     multiply rhs() by b_inverse() and copy to rhs
+c---------------------------------------------------------------------
+               call binvrhs( lhsb(1,1,isize,j),
+     >                       rhs(1,isize,j,k,c) )
+            enddo
+
+         endif
+      enddo
+
+
+      return
+      end
+      
diff --git a/examples/smpi/NAS/BT/y_solve.f b/examples/smpi/NAS/BT/y_solve.f
new file mode 100644 (file)
index 0000000..33e2ebc
--- /dev/null
@@ -0,0 +1,771 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Performs line solves in Y direction by first factoring
+c     the block-tridiagonal matrix into an upper triangular matrix, 
+c     and then performing back substitution to solve for the unknow
+c     vectors of each line.  
+c     
+c     Make sure we treat elements zero to cell_size in the direction
+c     of the sweep.
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer 
+     >     c, jstart, stage,
+     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+     >     isize,jsize,ksize,send_id
+
+      jstart = 0
+
+c---------------------------------------------------------------------
+c     in our terminology stage is the number of the cell in the y-direction
+c     i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+      do stage = 1,ncells
+         c = slice(2,stage)
+         isize = cell_size(1,c) - 1
+         jsize = cell_size(2,c) - 1
+         ksize = cell_size(3,c) - 1
+
+c---------------------------------------------------------------------
+c     set last-cell flag
+c---------------------------------------------------------------------
+         if (stage .eq. ncells) then
+            last = 1
+         else
+            last = 0
+         endif
+
+         if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c     This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+            first = 1
+c            call lhsy(c)
+            call y_solve_cell(first,last,c)
+         else
+c---------------------------------------------------------------------
+c     Not the first cell of this line, so receive info from
+c     processor working on preceeding cell
+c---------------------------------------------------------------------
+            first = 0
+            call y_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c     overlap computations and communications
+c---------------------------------------------------------------------
+c            call lhsy(c)
+c---------------------------------------------------------------------
+c     wait for completion
+c---------------------------------------------------------------------
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c     install C'(jstart+1) and rhs'(jstart+1) to be used in this cell
+c---------------------------------------------------------------------
+            call y_unpack_solve_info(c)
+            call y_solve_cell(first,last,c)
+         endif
+
+         if (last .eq. 0) call y_send_solve_info(send_id,c)
+      enddo
+
+c---------------------------------------------------------------------
+c     now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+      do stage = ncells, 1, -1
+         c = slice(2,stage)
+         first = 0
+         last = 0
+         if (stage .eq. 1) first = 1
+         if (stage .eq. ncells) then
+            last = 1
+c---------------------------------------------------------------------
+c     last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+            call y_backsubstitute(first, last,c)
+         else
+            call y_receive_backsub_info(recv_id,c)
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+            call y_unpack_backsub_info(c)
+            call y_backsubstitute(first,last,c)
+         endif
+         if (first .eq. 0) call y_send_backsub_info(send_id,c)
+      enddo
+
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine y_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack C'(-1) and rhs'(-1) for
+c     all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,k,m,n,ptr,c,jstart 
+
+      jstart = 0
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine y_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send C'(jend) and rhs'(jend) for
+c     all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,k,m,n,jsize,ptr,c,ip,kp
+      integer error,send_id,buffer_size 
+
+      jsize = cell_size(2,c)-1
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c     pack up buffer
+c---------------------------------------------------------------------
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,jsize,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     send buffer 
+c---------------------------------------------------------------------
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, successor(2),
+     >     SOUTH+ip+kp*NCELLS, comm_solve,
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send U(jstart) for all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,k,n,ptr,c,jstart,ip,kp
+      integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c     Send element 0 to previous processor
+c---------------------------------------------------------------------
+      jstart = 0
+      ip = cell_coord(1,c)-1
+      kp = cell_coord(3,c)-1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,jstart,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, predecessor(2), 
+     >     NORTH+ip+kp*NCELLS, comm_solve, 
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack U(jsize) for all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,k,n,ptr,c 
+
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               backsub_info(n,i,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer error,recv_id,ip,kp,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, successor(2), 
+     >     NORTH+ip+kp*NCELLS, comm_solve, 
+     >     recv_id, error)
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ip,kp,recv_id,error,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+      call mpi_irecv(out_buffer, buffer_size, 
+     >     dp_type, predecessor(2), 
+     >     SOUTH+ip+kp*NCELLS,  comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     back solve: if last cell, then generate U(jsize)=rhs(jsize)
+c     else assume U(jsize) is loaded in un pack backsub_info
+c     so just use it
+c     after call u(jstart) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer first, last, c, i, k
+      integer m,n,j,jsize,isize,ksize,jstart
+      
+      jstart = 0
+      isize = cell_size(1,c)-end(1,c)-1      
+      jsize = cell_size(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+      if (last .eq. 0) then
+         do k=start(3,c),ksize
+            do i=start(1,c),isize
+c---------------------------------------------------------------------
+c     U(jsize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c) 
+     >                    - lhsc(m,n,i,jsize,k,c)*
+     >                    backsub_info(n,i,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      endif
+      do k=start(3,c),ksize
+         do j=jsize-1,jstart,-1
+            do i=start(1,c),isize
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
+     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     performs guaussian elimination on this cell.
+c     
+c     assumes that unpacking routines for non-first cells 
+c     preload C' and rhs' from previous cell.
+c     
+c     assumed send happens outside this routine, but that
+c     c'(JMAX) and rhs'(JMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'work_lhs.h'
+
+      integer first,last,c
+      integer i,j,k,isize,ksize,jsize,jstart
+      double precision utmp(6,-2:JMAX+1)
+
+      jstart = 0
+      isize = cell_size(1,c)-end(1,c)-1
+      jsize = cell_size(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+
+      call lhsabinit(lhsa, lhsb, jsize)
+
+      do k=start(3,c),ksize 
+         do i=start(1,c),isize
+
+c---------------------------------------------------------------------
+c     This function computes the left hand side for the three y-factors   
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Compute the indices for storing the tri-diagonal matrix;
+c     determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+            do j = start(2,c)-1, cell_size(2,c)-end(2,c)
+               utmp(1,j) = 1.0d0 / u(1,i,j,k,c)
+               utmp(2,j) = u(2,i,j,k,c)
+               utmp(3,j) = u(3,i,j,k,c)
+               utmp(4,j) = u(4,i,j,k,c)
+               utmp(5,j) = u(5,i,j,k,c)
+               utmp(6,j) = qs(i,j,k,c)
+            end do
+
+            do j = start(2,c)-1, cell_size(2,c)-end(2,c)
+
+               tmp1 = utmp(1,j)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               fjac(1,1,j) = 0.0d+00
+               fjac(1,2,j) = 0.0d+00
+               fjac(1,3,j) = 1.0d+00
+               fjac(1,4,j) = 0.0d+00
+               fjac(1,5,j) = 0.0d+00
+
+               fjac(2,1,j) = - ( utmp(2,j)*utmp(3,j) )
+     >              * tmp2
+               fjac(2,2,j) = utmp(3,j) * tmp1
+               fjac(2,3,j) = utmp(2,j) * tmp1
+               fjac(2,4,j) = 0.0d+00
+               fjac(2,5,j) = 0.0d+00
+
+               fjac(3,1,j) = - ( utmp(3,j)*utmp(3,j)*tmp2)
+     >              + c2 * utmp(6,j)
+               fjac(3,2,j) = - c2 *  utmp(2,j) * tmp1
+               fjac(3,3,j) = ( 2.0d+00 - c2 )
+     >              *  utmp(3,j) * tmp1 
+               fjac(3,4,j) = - c2 * utmp(4,j) * tmp1 
+               fjac(3,5,j) = c2
+
+               fjac(4,1,j) = - ( utmp(3,j)*utmp(4,j) )
+     >              * tmp2
+               fjac(4,2,j) = 0.0d+00
+               fjac(4,3,j) = utmp(4,j) * tmp1
+               fjac(4,4,j) = utmp(3,j) * tmp1
+               fjac(4,5,j) = 0.0d+00
+
+               fjac(5,1,j) = ( c2 * 2.0d0 * utmp(6,j)
+     >              - c1 * utmp(5,j) * tmp1 ) 
+     >              * utmp(3,j) * tmp1 
+               fjac(5,2,j) = - c2 * utmp(2,j)*utmp(3,j) 
+     >              * tmp2
+               fjac(5,3,j) = c1 * utmp(5,j) * tmp1 
+     >              - c2 * ( utmp(6,j)
+     >              + utmp(3,j)*utmp(3,j) * tmp2 )
+               fjac(5,4,j) = - c2 * ( utmp(3,j)*utmp(4,j) )
+     >              * tmp2
+               fjac(5,5,j) = c1 * utmp(3,j) * tmp1 
+
+               njac(1,1,j) = 0.0d+00
+               njac(1,2,j) = 0.0d+00
+               njac(1,3,j) = 0.0d+00
+               njac(1,4,j) = 0.0d+00
+               njac(1,5,j) = 0.0d+00
+
+               njac(2,1,j) = - c3c4 * tmp2 * utmp(2,j)
+               njac(2,2,j) =   c3c4 * tmp1
+               njac(2,3,j) =   0.0d+00
+               njac(2,4,j) =   0.0d+00
+               njac(2,5,j) =   0.0d+00
+
+               njac(3,1,j) = - con43 * c3c4 * tmp2 * utmp(3,j)
+               njac(3,2,j) =   0.0d+00
+               njac(3,3,j) =   con43 * c3c4 * tmp1
+               njac(3,4,j) =   0.0d+00
+               njac(3,5,j) =   0.0d+00
+
+               njac(4,1,j) = - c3c4 * tmp2 * utmp(4,j)
+               njac(4,2,j) =   0.0d+00
+               njac(4,3,j) =   0.0d+00
+               njac(4,4,j) =   c3c4 * tmp1
+               njac(4,5,j) =   0.0d+00
+
+               njac(5,1,j) = - (  c3c4
+     >              - c1345 ) * tmp3 * (utmp(2,j)**2)
+     >              - ( con43 * c3c4
+     >              - c1345 ) * tmp3 * (utmp(3,j)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (utmp(4,j)**2)
+     >              - c1345 * tmp2 * utmp(5,j)
+
+               njac(5,2,j) = (  c3c4 - c1345 ) * tmp2 * utmp(2,j)
+               njac(5,3,j) = ( con43 * c3c4
+     >              - c1345 ) * tmp2 * utmp(3,j)
+               njac(5,4,j) = ( c3c4 - c1345 ) * tmp2 * utmp(4,j)
+               njac(5,5,j) = ( c1345 ) * tmp1
+
+            enddo
+
+c---------------------------------------------------------------------
+c     now joacobians set, so form left hand side in y direction
+c---------------------------------------------------------------------
+            do j = start(2,c), jsize-end(2,c)
+
+               tmp1 = dt * ty1
+               tmp2 = dt * ty2
+
+               lhsa(1,1,j) = - tmp2 * fjac(1,1,j-1)
+     >              - tmp1 * njac(1,1,j-1)
+     >              - tmp1 * dy1 
+               lhsa(1,2,j) = - tmp2 * fjac(1,2,j-1)
+     >              - tmp1 * njac(1,2,j-1)
+               lhsa(1,3,j) = - tmp2 * fjac(1,3,j-1)
+     >              - tmp1 * njac(1,3,j-1)
+               lhsa(1,4,j) = - tmp2 * fjac(1,4,j-1)
+     >              - tmp1 * njac(1,4,j-1)
+               lhsa(1,5,j) = - tmp2 * fjac(1,5,j-1)
+     >              - tmp1 * njac(1,5,j-1)
+
+               lhsa(2,1,j) = - tmp2 * fjac(2,1,j-1)
+     >              - tmp1 * njac(2,1,j-1)
+               lhsa(2,2,j) = - tmp2 * fjac(2,2,j-1)
+     >              - tmp1 * njac(2,2,j-1)
+     >              - tmp1 * dy2
+               lhsa(2,3,j) = - tmp2 * fjac(2,3,j-1)
+     >              - tmp1 * njac(2,3,j-1)
+               lhsa(2,4,j) = - tmp2 * fjac(2,4,j-1)
+     >              - tmp1 * njac(2,4,j-1)
+               lhsa(2,5,j) = - tmp2 * fjac(2,5,j-1)
+     >              - tmp1 * njac(2,5,j-1)
+
+               lhsa(3,1,j) = - tmp2 * fjac(3,1,j-1)
+     >              - tmp1 * njac(3,1,j-1)
+               lhsa(3,2,j) = - tmp2 * fjac(3,2,j-1)
+     >              - tmp1 * njac(3,2,j-1)
+               lhsa(3,3,j) = - tmp2 * fjac(3,3,j-1)
+     >              - tmp1 * njac(3,3,j-1)
+     >              - tmp1 * dy3 
+               lhsa(3,4,j) = - tmp2 * fjac(3,4,j-1)
+     >              - tmp1 * njac(3,4,j-1)
+               lhsa(3,5,j) = - tmp2 * fjac(3,5,j-1)
+     >              - tmp1 * njac(3,5,j-1)
+
+               lhsa(4,1,j) = - tmp2 * fjac(4,1,j-1)
+     >              - tmp1 * njac(4,1,j-1)
+               lhsa(4,2,j) = - tmp2 * fjac(4,2,j-1)
+     >              - tmp1 * njac(4,2,j-1)
+               lhsa(4,3,j) = - tmp2 * fjac(4,3,j-1)
+     >              - tmp1 * njac(4,3,j-1)
+               lhsa(4,4,j) = - tmp2 * fjac(4,4,j-1)
+     >              - tmp1 * njac(4,4,j-1)
+     >              - tmp1 * dy4
+               lhsa(4,5,j) = - tmp2 * fjac(4,5,j-1)
+     >              - tmp1 * njac(4,5,j-1)
+
+               lhsa(5,1,j) = - tmp2 * fjac(5,1,j-1)
+     >              - tmp1 * njac(5,1,j-1)
+               lhsa(5,2,j) = - tmp2 * fjac(5,2,j-1)
+     >              - tmp1 * njac(5,2,j-1)
+               lhsa(5,3,j) = - tmp2 * fjac(5,3,j-1)
+     >              - tmp1 * njac(5,3,j-1)
+               lhsa(5,4,j) = - tmp2 * fjac(5,4,j-1)
+     >              - tmp1 * njac(5,4,j-1)
+               lhsa(5,5,j) = - tmp2 * fjac(5,5,j-1)
+     >              - tmp1 * njac(5,5,j-1)
+     >              - tmp1 * dy5
+
+               lhsb(1,1,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(1,1,j)
+     >              + tmp1 * 2.0d+00 * dy1
+               lhsb(1,2,j) = tmp1 * 2.0d+00 * njac(1,2,j)
+               lhsb(1,3,j) = tmp1 * 2.0d+00 * njac(1,3,j)
+               lhsb(1,4,j) = tmp1 * 2.0d+00 * njac(1,4,j)
+               lhsb(1,5,j) = tmp1 * 2.0d+00 * njac(1,5,j)
+
+               lhsb(2,1,j) = tmp1 * 2.0d+00 * njac(2,1,j)
+               lhsb(2,2,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(2,2,j)
+     >              + tmp1 * 2.0d+00 * dy2
+               lhsb(2,3,j) = tmp1 * 2.0d+00 * njac(2,3,j)
+               lhsb(2,4,j) = tmp1 * 2.0d+00 * njac(2,4,j)
+               lhsb(2,5,j) = tmp1 * 2.0d+00 * njac(2,5,j)
+
+               lhsb(3,1,j) = tmp1 * 2.0d+00 * njac(3,1,j)
+               lhsb(3,2,j) = tmp1 * 2.0d+00 * njac(3,2,j)
+               lhsb(3,3,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(3,3,j)
+     >              + tmp1 * 2.0d+00 * dy3
+               lhsb(3,4,j) = tmp1 * 2.0d+00 * njac(3,4,j)
+               lhsb(3,5,j) = tmp1 * 2.0d+00 * njac(3,5,j)
+
+               lhsb(4,1,j) = tmp1 * 2.0d+00 * njac(4,1,j)
+               lhsb(4,2,j) = tmp1 * 2.0d+00 * njac(4,2,j)
+               lhsb(4,3,j) = tmp1 * 2.0d+00 * njac(4,3,j)
+               lhsb(4,4,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(4,4,j)
+     >              + tmp1 * 2.0d+00 * dy4
+               lhsb(4,5,j) = tmp1 * 2.0d+00 * njac(4,5,j)
+
+               lhsb(5,1,j) = tmp1 * 2.0d+00 * njac(5,1,j)
+               lhsb(5,2,j) = tmp1 * 2.0d+00 * njac(5,2,j)
+               lhsb(5,3,j) = tmp1 * 2.0d+00 * njac(5,3,j)
+               lhsb(5,4,j) = tmp1 * 2.0d+00 * njac(5,4,j)
+               lhsb(5,5,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(5,5,j) 
+     >              + tmp1 * 2.0d+00 * dy5
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,j+1)
+     >              - tmp1 * njac(1,1,j+1)
+     >              - tmp1 * dy1
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,j+1)
+     >              - tmp1 * njac(1,2,j+1)
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,j+1)
+     >              - tmp1 * njac(1,3,j+1)
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,j+1)
+     >              - tmp1 * njac(1,4,j+1)
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,j+1)
+     >              - tmp1 * njac(1,5,j+1)
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,j+1)
+     >              - tmp1 * njac(2,1,j+1)
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,j+1)
+     >              - tmp1 * njac(2,2,j+1)
+     >              - tmp1 * dy2
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,j+1)
+     >              - tmp1 * njac(2,3,j+1)
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,j+1)
+     >              - tmp1 * njac(2,4,j+1)
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,j+1)
+     >              - tmp1 * njac(2,5,j+1)
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,j+1)
+     >              - tmp1 * njac(3,1,j+1)
+               lhsc(3,2,i,j,k,c) =  tmp2 * fjac(3,2,j+1)
+     >              - tmp1 * njac(3,2,j+1)
+               lhsc(3,3,i,j,k,c) =  tmp2 * fjac(3,3,j+1)
+     >              - tmp1 * njac(3,3,j+1)
+     >              - tmp1 * dy3
+               lhsc(3,4,i,j,k,c) =  tmp2 * fjac(3,4,j+1)
+     >              - tmp1 * njac(3,4,j+1)
+               lhsc(3,5,i,j,k,c) =  tmp2 * fjac(3,5,j+1)
+     >              - tmp1 * njac(3,5,j+1)
+
+               lhsc(4,1,i,j,k,c) =  tmp2 * fjac(4,1,j+1)
+     >              - tmp1 * njac(4,1,j+1)
+               lhsc(4,2,i,j,k,c) =  tmp2 * fjac(4,2,j+1)
+     >              - tmp1 * njac(4,2,j+1)
+               lhsc(4,3,i,j,k,c) =  tmp2 * fjac(4,3,j+1)
+     >              - tmp1 * njac(4,3,j+1)
+               lhsc(4,4,i,j,k,c) =  tmp2 * fjac(4,4,j+1)
+     >              - tmp1 * njac(4,4,j+1)
+     >              - tmp1 * dy4
+               lhsc(4,5,i,j,k,c) =  tmp2 * fjac(4,5,j+1)
+     >              - tmp1 * njac(4,5,j+1)
+
+               lhsc(5,1,i,j,k,c) =  tmp2 * fjac(5,1,j+1)
+     >              - tmp1 * njac(5,1,j+1)
+               lhsc(5,2,i,j,k,c) =  tmp2 * fjac(5,2,j+1)
+     >              - tmp1 * njac(5,2,j+1)
+               lhsc(5,3,i,j,k,c) =  tmp2 * fjac(5,3,j+1)
+     >              - tmp1 * njac(5,3,j+1)
+               lhsc(5,4,i,j,k,c) =  tmp2 * fjac(5,4,j+1)
+     >              - tmp1 * njac(5,4,j+1)
+               lhsc(5,5,i,j,k,c) =  tmp2 * fjac(5,5,j+1)
+     >              - tmp1 * njac(5,5,j+1)
+     >              - tmp1 * dy5
+
+            enddo
+
+
+c---------------------------------------------------------------------
+c     outer most do loops - sweeping in i direction
+c---------------------------------------------------------------------
+            if (first .eq. 1) then 
+
+c---------------------------------------------------------------------
+c     multiply c(i,jstart,k) by b_inverse and copy back to c
+c     multiply rhs(jstart) by b_inverse(jstart) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,jstart),
+     >                        lhsc(1,1,i,jstart,k,c),
+     >                        rhs(1,i,jstart,k,c) )
+
+            endif
+
+c---------------------------------------------------------------------
+c     begin inner most do loop
+c     do all the elements of the cell unless last 
+c---------------------------------------------------------------------
+            do j=jstart+first,jsize-last
+
+c---------------------------------------------------------------------
+c     subtract A*lhs_vector(j-1) from lhs_vector(j)
+c     
+c     rhs(j) = rhs(j) - A*rhs(j-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,j),
+     >                         rhs(1,i,j-1,k,c),rhs(1,i,j,k,c))
+
+c---------------------------------------------------------------------
+c     B(j) = B(j) - C(j-1)*A(j)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,j),
+     >                         lhsc(1,1,i,j-1,k,c),
+     >                         lhsb(1,1,j))
+
+c---------------------------------------------------------------------
+c     multiply c(i,j,k) by b_inverse and copy back to c
+c     multiply rhs(i,1,k) by b_inverse(i,1,k) and copy to rhs
+c---------------------------------------------------------------------
+               call binvcrhs( lhsb(1,1,j),
+     >                        lhsc(1,1,i,j,k,c),
+     >                        rhs(1,i,j,k,c) )
+
+            enddo
+
+c---------------------------------------------------------------------
+c     Now finish up special cases for last cell
+c---------------------------------------------------------------------
+            if (last .eq. 1) then
+
+c---------------------------------------------------------------------
+c     rhs(jsize) = rhs(jsize) - A*rhs(jsize-1)
+c---------------------------------------------------------------------
+               call matvec_sub(lhsa(1,1,jsize),
+     >                         rhs(1,i,jsize-1,k,c),rhs(1,i,jsize,k,c))
+
+c---------------------------------------------------------------------
+c     B(jsize) = B(jsize) - C(jsize-1)*A(jsize)
+c     call matmul_sub(aa,i,jsize,k,c,
+c     $              cc,i,jsize-1,k,c,bb,i,jsize,k,c)
+c---------------------------------------------------------------------
+               call matmul_sub(lhsa(1,1,jsize),
+     >                         lhsc(1,1,i,jsize-1,k,c),
+     >                         lhsb(1,1,jsize))
+
+c---------------------------------------------------------------------
+c     multiply rhs(jsize) by b_inverse(jsize) and copy to rhs
+c---------------------------------------------------------------------
+               call binvrhs( lhsb(1,1,jsize),
+     >                       rhs(1,i,jsize,k,c) )
+
+            endif
+         enddo
+      enddo
+
+
+      return
+      end
+      
+
+
diff --git a/examples/smpi/NAS/BT/y_solve_vec.f b/examples/smpi/NAS/BT/y_solve_vec.f
new file mode 100644 (file)
index 0000000..e21cfa3
--- /dev/null
@@ -0,0 +1,788 @@
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_solve
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Performs line solves in Y direction by first factoring
+c     the block-tridiagonal matrix into an upper triangular matrix, 
+c     and then performing back substitution to solve for the unknow
+c     vectors of each line.  
+c     
+c     Make sure we treat elements zero to cell_size in the direction
+c     of the sweep.
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer 
+     >     c, jstart, stage,
+     >     first, last, recv_id, error, r_status(MPI_STATUS_SIZE),
+     >     isize,jsize,ksize,send_id
+
+      jstart = 0
+
+c---------------------------------------------------------------------
+c     in our terminology stage is the number of the cell in the y-direct
+c     i.e. stage = 1 means the start of the line stage=ncells means end
+c---------------------------------------------------------------------
+      do stage = 1,ncells
+         c = slice(2,stage)
+         isize = cell_size(1,c) - 1
+         jsize = cell_size(2,c) - 1
+         ksize = cell_size(3,c) - 1
+
+c---------------------------------------------------------------------
+c     set last-cell flag
+c---------------------------------------------------------------------
+         if (stage .eq. ncells) then
+            last = 1
+         else
+            last = 0
+         endif
+
+         if (stage .eq. 1) then
+c---------------------------------------------------------------------
+c     This is the first cell, so solve without receiving data
+c---------------------------------------------------------------------
+            first = 1
+c            call lhsy(c)
+            call y_solve_cell(first,last,c)
+         else
+c---------------------------------------------------------------------
+c     Not the first cell of this line, so receive info from
+c     processor working on preceeding cell
+c---------------------------------------------------------------------
+            first = 0
+            call y_receive_solve_info(recv_id,c)
+c---------------------------------------------------------------------
+c     overlap computations and communications
+c---------------------------------------------------------------------
+c            call lhsy(c)
+c---------------------------------------------------------------------
+c     wait for completion
+c---------------------------------------------------------------------
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+c---------------------------------------------------------------------
+c     install C'(jstart+1) and rhs'(jstart+1) to be used in this cell
+c---------------------------------------------------------------------
+            call y_unpack_solve_info(c)
+            call y_solve_cell(first,last,c)
+         endif
+
+         if (last .eq. 0) call y_send_solve_info(send_id,c)
+      enddo
+
+c---------------------------------------------------------------------
+c     now perform backsubstitution in reverse direction
+c---------------------------------------------------------------------
+      do stage = ncells, 1, -1
+         c = slice(2,stage)
+         first = 0
+         last = 0
+         if (stage .eq. 1) first = 1
+         if (stage .eq. ncells) then
+            last = 1
+c---------------------------------------------------------------------
+c     last cell, so perform back substitute without waiting
+c---------------------------------------------------------------------
+            call y_backsubstitute(first, last,c)
+         else
+            call y_receive_backsub_info(recv_id,c)
+            call mpi_wait(send_id,r_status,error)
+            call mpi_wait(recv_id,r_status,error)
+            call y_unpack_backsub_info(c)
+            call y_backsubstitute(first,last,c)
+         endif
+         if (first .eq. 0) call y_send_backsub_info(send_id,c)
+      enddo
+
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine y_unpack_solve_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack C'(-1) and rhs'(-1) for
+c     all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,k,m,n,ptr,c,jstart 
+
+      jstart = 0
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  lhsc(m,n,i,jstart-1,k,c) = out_buffer(ptr+n)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               rhs(n,i,jstart-1,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+      
+      subroutine y_send_solve_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send C'(jend) and rhs'(jend) for
+c     all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,k,m,n,jsize,ptr,c,ip,kp
+      integer error,send_id,buffer_size 
+
+      jsize = cell_size(2,c)-1
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+
+c---------------------------------------------------------------------
+c     pack up buffer
+c---------------------------------------------------------------------
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do m=1,BLOCK_SIZE
+               do n=1,BLOCK_SIZE
+                  in_buffer(ptr+n) = lhsc(m,n,i,jsize,k,c)
+               enddo
+               ptr = ptr+BLOCK_SIZE
+            enddo
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,jsize,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+c---------------------------------------------------------------------
+c     send buffer 
+c---------------------------------------------------------------------
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, successor(2),
+     >     SOUTH+ip+kp*NCELLS, comm_solve,
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_send_backsub_info(send_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     pack up and send U(jstart) for all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer i,k,n,ptr,c,jstart,ip,kp
+      integer error,send_id,buffer_size
+
+c---------------------------------------------------------------------
+c     Send element 0 to previous processor
+c---------------------------------------------------------------------
+      jstart = 0
+      ip = cell_coord(1,c)-1
+      kp = cell_coord(3,c)-1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               in_buffer(ptr+n) = rhs(n,i,jstart,k,c)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+      call mpi_isend(in_buffer, buffer_size,
+     >     dp_type, predecessor(2), 
+     >     NORTH+ip+kp*NCELLS, comm_solve, 
+     >     send_id,error)
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_unpack_backsub_info(c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     unpack U(jsize) for all i and k
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer i,k,n,ptr,c 
+
+      ptr = 0
+      do k=0,KMAX-1
+         do i=0,IMAX-1
+            do n=1,BLOCK_SIZE
+               backsub_info(n,i,k,c) = out_buffer(ptr+n)
+            enddo
+            ptr = ptr+BLOCK_SIZE
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_receive_backsub_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer error,recv_id,ip,kp,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*BLOCK_SIZE
+      call mpi_irecv(out_buffer, buffer_size,
+     >     dp_type, successor(2), 
+     >     NORTH+ip+kp*NCELLS, comm_solve, 
+     >     recv_id, error)
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_receive_solve_info(recv_id,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     post mpi receives 
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'mpinpb.h'
+
+      integer ip,kp,recv_id,error,c,buffer_size
+      ip = cell_coord(1,c) - 1
+      kp = cell_coord(3,c) - 1
+      buffer_size=MAX_CELL_DIM*MAX_CELL_DIM*
+     >     (BLOCK_SIZE*BLOCK_SIZE + BLOCK_SIZE)
+      call mpi_irecv(out_buffer, buffer_size, 
+     >     dp_type, predecessor(2), 
+     >     SOUTH+ip+kp*NCELLS,  comm_solve, 
+     >     recv_id, error)
+
+      return
+      end
+      
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_backsubstitute(first, last, c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     back solve: if last cell, then generate U(jsize)=rhs(jsize)
+c     else assume U(jsize) is loaded in un pack backsub_info
+c     so just use it
+c     after call u(jstart) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+
+      integer first, last, c, i, k
+      integer m,n,j,jsize,isize,ksize,jstart
+      
+      jstart = 0
+      isize = cell_size(1,c)-end(1,c)-1      
+      jsize = cell_size(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+      if (last .eq. 0) then
+         do k=start(3,c),ksize
+            do i=start(1,c),isize
+c---------------------------------------------------------------------
+c     U(jsize) uses info from previous cell if not last cell
+c---------------------------------------------------------------------
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,jsize,k,c) = rhs(m,i,jsize,k,c) 
+     >                    - lhsc(m,n,i,jsize,k,c)*
+     >                    backsub_info(n,i,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      endif
+      do k=start(3,c),ksize
+         do j=jsize-1,jstart,-1
+            do i=start(1,c),isize
+               do m=1,BLOCK_SIZE
+                  do n=1,BLOCK_SIZE
+                     rhs(m,i,j,k,c) = rhs(m,i,j,k,c) 
+     >                    - lhsc(m,n,i,j,k,c)*rhs(n,i,j+1,k,c)
+                  enddo
+               enddo
+            enddo
+         enddo
+      enddo
+
+      return
+      end
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+      subroutine y_solve_cell(first,last,c)
+
+c---------------------------------------------------------------------
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     performs guaussian elimination on this cell.
+c     
+c     assumes that unpacking routines for non-first cells 
+c     preload C' and rhs' from previous cell.
+c     
+c     assumed send happens outside this routine, but that
+c     c'(JMAX) and rhs'(JMAX) will be sent to next cell
+c---------------------------------------------------------------------
+
+      include 'header.h'
+      include 'work_lhs_vec.h'
+
+      integer first,last,c
+      integer i,j,k,m,n,isize,ksize,jsize,jstart
+
+      jstart = 0
+      isize = cell_size(1,c)-end(1,c)-1
+      jsize = cell_size(2,c)-1
+      ksize = cell_size(3,c)-end(3,c)-1
+
+c---------------------------------------------------------------------
+c     zero the left hand side for starters
+c     set diagonal values to 1. This is overkill, but convenient
+c---------------------------------------------------------------------
+      do i = 0, isize
+         do m = 1, 5
+            do n = 1, 5
+               lhsa(m,n,i,0) = 0.0d0
+               lhsb(m,n,i,0) = 0.0d0
+               lhsa(m,n,i,jsize) = 0.0d0
+               lhsb(m,n,i,jsize) = 0.0d0
+            enddo
+            lhsb(m,m,i,0) = 1.0d0
+            lhsb(m,m,i,jsize) = 1.0d0
+         enddo
+      enddo
+
+      do k=start(3,c),ksize 
+
+c---------------------------------------------------------------------
+c     This function computes the left hand side for the three y-factors 
+c---------------------------------------------------------------------
+
+c---------------------------------------------------------------------
+c     Compute the indices for storing the tri-diagonal matrix;
+c     determine a (labeled f) and n jacobians for cell c
+c---------------------------------------------------------------------
+
+         do j = start(2,c)-1, cell_size(2,c)-end(2,c)
+            do i=start(1,c),isize
+
+               tmp1 = 1.0d0 / u(1,i,j,k,c)
+               tmp2 = tmp1 * tmp1
+               tmp3 = tmp1 * tmp2
+
+               fjac(1,1,i,j) = 0.0d+00
+               fjac(1,2,i,j) = 0.0d+00
+               fjac(1,3,i,j) = 1.0d+00
+               fjac(1,4,i,j) = 0.0d+00
+               fjac(1,5,i,j) = 0.0d+00
+
+               fjac(2,1,i,j) = - ( u(2,i,j,k,c)*u(3,i,j,k,c) )
+     >              * tmp2
+               fjac(2,2,i,j) = u(3,i,j,k,c) * tmp1
+               fjac(2,3,i,j) = u(2,i,j,k,c) * tmp1
+               fjac(2,4,i,j) = 0.0d+00
+               fjac(2,5,i,j) = 0.0d+00
+
+               fjac(3,1,i,j) = - ( u(3,i,j,k,c)*u(3,i,j,k,c)*tmp2)
+     >              + c2 * qs(i,j,k,c)
+               fjac(3,2,i,j) = - c2 *  u(2,i,j,k,c) * tmp1
+               fjac(3,3,i,j) = ( 2.0d+00 - c2 )
+     >              *  u(3,i,j,k,c) * tmp1 
+               fjac(3,4,i,j) = - c2 * u(4,i,j,k,c) * tmp1 
+               fjac(3,5,i,j) = c2
+
+               fjac(4,1,i,j) = - ( u(3,i,j,k,c)*u(4,i,j,k,c) )
+     >              * tmp2
+               fjac(4,2,i,j) = 0.0d+00
+               fjac(4,3,i,j) = u(4,i,j,k,c) * tmp1
+               fjac(4,4,i,j) = u(3,i,j,k,c) * tmp1
+               fjac(4,5,i,j) = 0.0d+00
+
+               fjac(5,1,i,j) = ( c2 * 2.0d0 * qs(i,j,k,c)
+     >              - c1 * u(5,i,j,k,c) * tmp1 ) 
+     >              * u(3,i,j,k,c) * tmp1 
+               fjac(5,2,i,j) = - c2 * u(2,i,j,k,c)*u(3,i,j,k,c) 
+     >              * tmp2
+               fjac(5,3,i,j) = c1 * u(5,i,j,k,c) * tmp1 
+     >              - c2 * ( qs(i,j,k,c)
+     >              + u(3,i,j,k,c)*u(3,i,j,k,c) * tmp2 )
+               fjac(5,4,i,j) = - c2 * ( u(3,i,j,k,c)*u(4,i,j,k,c) )
+     >              * tmp2
+               fjac(5,5,i,j) = c1 * u(3,i,j,k,c) * tmp1 
+
+               njac(1,1,i,j) = 0.0d+00
+               njac(1,2,i,j) = 0.0d+00
+               njac(1,3,i,j) = 0.0d+00
+               njac(1,4,i,j) = 0.0d+00
+               njac(1,5,i,j) = 0.0d+00
+
+               njac(2,1,i,j) = - c3c4 * tmp2 * u(2,i,j,k,c)
+               njac(2,2,i,j) =   c3c4 * tmp1
+               njac(2,3,i,j) =   0.0d+00
+               njac(2,4,i,j) =   0.0d+00
+               njac(2,5,i,j) =   0.0d+00
+
+               njac(3,1,i,j) = - con43 * c3c4 * tmp2 * u(3,i,j,k,c)
+               njac(3,2,i,j) =   0.0d+00
+               njac(3,3,i,j) =   con43 * c3c4 * tmp1
+               njac(3,4,i,j) =   0.0d+00
+               njac(3,5,i,j) =   0.0d+00
+
+               njac(4,1,i,j) = - c3c4 * tmp2 * u(4,i,j,k,c)
+               njac(4,2,i,j) =   0.0d+00
+               njac(4,3,i,j) =   0.0d+00
+               njac(4,4,i,j) =   c3c4 * tmp1
+               njac(4,5,i,j) =   0.0d+00
+
+               njac(5,1,i,j) = - (  c3c4
+     >              - c1345 ) * tmp3 * (u(2,i,j,k,c)**2)
+     >              - ( con43 * c3c4
+     >              - c1345 ) * tmp3 * (u(3,i,j,k,c)**2)
+     >              - ( c3c4 - c1345 ) * tmp3 * (u(4,i,j,k,c)**2)
+     >              - c1345 * tmp2 * u(5,i,j,k,c)
+
+               njac(5,2,i,j) = (  c3c4 - c1345 ) * tmp2 * u(2,i,j,k,c)
+               njac(5,3,i,j) = ( con43 * c3c4
+     >              - c1345 ) * tmp2 * u(3,i,j,k,c)
+               njac(5,4,i,j) = ( c3c4 - c1345 ) * tmp2 * u(4,i,j,k,c)
+               njac(5,5,i,j) = ( c1345 ) * tmp1
+
+            enddo
+         enddo
+
+c---------------------------------------------------------------------
+c     now joacobians set, so form left hand side in y direction
+c---------------------------------------------------------------------
+         do j = start(2,c), jsize-end(2,c)
+            do i=start(1,c),isize
+
+               tmp1 = dt * ty1
+               tmp2 = dt * ty2
+
+               lhsa(1,1,i,j) = - tmp2 * fjac(1,1,i,j-1)
+     >              - tmp1 * njac(1,1,i,j-1)
+     >              - tmp1 * dy1 
+               lhsa(1,2,i,j) = - tmp2 * fjac(1,2,i,j-1)
+     >              - tmp1 * njac(1,2,i,j-1)
+               lhsa(1,3,i,j) = - tmp2 * fjac(1,3,i,j-1)
+     >              - tmp1 * njac(1,3,i,j-1)
+               lhsa(1,4,i,j) = - tmp2 * fjac(1,4,i,j-1)
+     >              - tmp1 * njac(1,4,i,j-1)
+               lhsa(1,5,i,j) = - tmp2 * fjac(1,5,i,j-1)
+     >              - tmp1 * njac(1,5,i,j-1)
+
+               lhsa(2,1,i,j) = - tmp2 * fjac(2,1,i,j-1)
+     >              - tmp1 * njac(2,1,i,j-1)
+               lhsa(2,2,i,j) = - tmp2 * fjac(2,2,i,j-1)
+     >              - tmp1 * njac(2,2,i,j-1)
+     >              - tmp1 * dy2
+               lhsa(2,3,i,j) = - tmp2 * fjac(2,3,i,j-1)
+     >              - tmp1 * njac(2,3,i,j-1)
+               lhsa(2,4,i,j) = - tmp2 * fjac(2,4,i,j-1)
+     >              - tmp1 * njac(2,4,i,j-1)
+               lhsa(2,5,i,j) = - tmp2 * fjac(2,5,i,j-1)
+     >              - tmp1 * njac(2,5,i,j-1)
+
+               lhsa(3,1,i,j) = - tmp2 * fjac(3,1,i,j-1)
+     >              - tmp1 * njac(3,1,i,j-1)
+               lhsa(3,2,i,j) = - tmp2 * fjac(3,2,i,j-1)
+     >              - tmp1 * njac(3,2,i,j-1)
+               lhsa(3,3,i,j) = - tmp2 * fjac(3,3,i,j-1)
+     >              - tmp1 * njac(3,3,i,j-1)
+     >              - tmp1 * dy3 
+               lhsa(3,4,i,j) = - tmp2 * fjac(3,4,i,j-1)
+     >              - tmp1 * njac(3,4,i,j-1)
+               lhsa(3,5,i,j) = - tmp2 * fjac(3,5,i,j-1)
+     >              - tmp1 * njac(3,5,i,j-1)
+
+               lhsa(4,1,i,j) = - tmp2 * fjac(4,1,i,j-1)
+     >              - tmp1 * njac(4,1,i,j-1)
+               lhsa(4,2,i,j) = - tmp2 * fjac(4,2,i,j-1)
+     >              - tmp1 * njac(4,2,i,j-1)
+               lhsa(4,3,i,j) = - tmp2 * fjac(4,3,i,j-1)
+     >              - tmp1 * njac(4,3,i,j-1)
+               lhsa(4,4,i,j) = - tmp2 * fjac(4,4,i,j-1)
+     >              - tmp1 * njac(4,4,i,j-1)
+     >              - tmp1 * dy4
+               lhsa(4,5,i,j) = - tmp2 * fjac(4,5,i,j-1)
+     >              - tmp1 * njac(4,5,i,j-1)
+
+               lhsa(5,1,i,j) = - tmp2 * fjac(5,1,i,j-1)
+     >              - tmp1 * njac(5,1,i,j-1)
+               lhsa(5,2,i,j) = - tmp2 * fjac(5,2,i,j-1)
+     >              - tmp1 * njac(5,2,i,j-1)
+               lhsa(5,3,i,j) = - tmp2 * fjac(5,3,i,j-1)
+     >              - tmp1 * njac(5,3,i,j-1)
+               lhsa(5,4,i,j) = - tmp2 * fjac(5,4,i,j-1)
+     >              - tmp1 * njac(5,4,i,j-1)
+               lhsa(5,5,i,j) = - tmp2 * fjac(5,5,i,j-1)
+     >              - tmp1 * njac(5,5,i,j-1)
+     >              - tmp1 * dy5
+
+               lhsb(1,1,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(1,1,i,j)
+     >              + tmp1 * 2.0d+00 * dy1
+               lhsb(1,2,i,j) = tmp1 * 2.0d+00 * njac(1,2,i,j)
+               lhsb(1,3,i,j) = tmp1 * 2.0d+00 * njac(1,3,i,j)
+               lhsb(1,4,i,j) = tmp1 * 2.0d+00 * njac(1,4,i,j)
+               lhsb(1,5,i,j) = tmp1 * 2.0d+00 * njac(1,5,i,j)
+
+               lhsb(2,1,i,j) = tmp1 * 2.0d+00 * njac(2,1,i,j)
+               lhsb(2,2,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(2,2,i,j)
+     >              + tmp1 * 2.0d+00 * dy2
+               lhsb(2,3,i,j) = tmp1 * 2.0d+00 * njac(2,3,i,j)
+               lhsb(2,4,i,j) = tmp1 * 2.0d+00 * njac(2,4,i,j)
+               lhsb(2,5,i,j) = tmp1 * 2.0d+00 * njac(2,5,i,j)
+
+               lhsb(3,1,i,j) = tmp1 * 2.0d+00 * njac(3,1,i,j)
+               lhsb(3,2,i,j) = tmp1 * 2.0d+00 * njac(3,2,i,j)
+               lhsb(3,3,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(3,3,i,j)
+     >              + tmp1 * 2.0d+00 * dy3
+               lhsb(3,4,i,j) = tmp1 * 2.0d+00 * njac(3,4,i,j)
+               lhsb(3,5,i,j) = tmp1 * 2.0d+00 * njac(3,5,i,j)
+
+               lhsb(4,1,i,j) = tmp1 * 2.0d+00 * njac(4,1,i,j)
+               lhsb(4,2,i,j) = tmp1 * 2.0d+00 * njac(4,2,i,j)
+               lhsb(4,3,i,j) = tmp1 * 2.0d+00 * njac(4,3,i,j)
+               lhsb(4,4,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(4,4,i,j)
+     >              + tmp1 * 2.0d+00 * dy4
+               lhsb(4,5,i,j) = tmp1 * 2.0d+00 * njac(4,5,i,j)
+
+               lhsb(5,1,i,j) = tmp1 * 2.0d+00 * njac(5,1,i,j)
+               lhsb(5,2,i,j) = tmp1 * 2.0d+00 * njac(5,2,i,j)
+               lhsb(5,3,i,j) = tmp1 * 2.0d+00 * njac(5,3,i,j)
+               lhsb(5,4,i,j) = tmp1 * 2.0d+00 * njac(5,4,i,j)
+               lhsb(5,5,i,j) = 1.0d+00
+     >              + tmp1 * 2.0d+00 * njac(5,5,i,j) 
+     >              + tmp1 * 2.0d+00 * dy5
+
+               lhsc(1,1,i,j,k,c) =  tmp2 * fjac(1,1,i,j+1)
+     >              - tmp1 * njac(1,1,i,j+1)
+     >              - tmp1 * dy1
+               lhsc(1,2,i,j,k,c) =  tmp2 * fjac(1,2,i,j+1)
+     >              - tmp1 * njac(1,2,i,j+1)
+               lhsc(1,3,i,j,k,c) =  tmp2 * fjac(1,3,i,j+1)
+     >              - tmp1 * njac(1,3,i,j+1)
+               lhsc(1,4,i,j,k,c) =  tmp2 * fjac(1,4,i,j+1)
+     >              - tmp1 * njac(1,4,i,j+1)
+               lhsc(1,5,i,j,k,c) =  tmp2 * fjac(1,5,i,j+1)
+     >              - tmp1 * njac(1,5,i,j+1)
+
+               lhsc(2,1,i,j,k,c) =  tmp2 * fjac(2,1,i,j+1)
+     >              - tmp1 * njac(2,1,i,j+1)
+               lhsc(2,2,i,j,k,c) =  tmp2 * fjac(2,2,i,j+1)
+     >              - tmp1 * njac(2,2,i,j+1)
+     >              - tmp1 * dy2
+               lhsc(2,3,i,j,k,c) =  tmp2 * fjac(2,3,i,j+1)
+     >              - tmp1 * njac(2,3,i,j+1)
+               lhsc(2,4,i,j,k,c) =  tmp2 * fjac(2,4,i,j+1)
+     >              - tmp1 * njac(2,4,i,j+1)
+               lhsc(2,5,i,j,k,c) =  tmp2 * fjac(2,5,i,j+1)
+     >              - tmp1 * njac(2,5,i,j+1)
+
+               lhsc(3,1,i,j,k,c) =  tmp2 * fjac(3,1,i,j+1)
+     >              - tmp1 * njac(3,1,i,j+1)
+               lhsc(3,2,i,j,k,c) =&n